Commit 7cdb45b

Merge branch 'fabiofelix:main' into main
fabiofelix authored Aug 20, 2024
2 parents 9b83187 + 57b46ee
Showing 20 changed files with 587 additions and 73 deletions.
4 changes: 2 additions & 2 deletions README.md
@@ -131,8 +131,8 @@ python tools/test.py --cfg config/M3.yaml
2. Training/evaluation routines: `step_recog/iterators.py` (functions *train*, *evaluate*)
3. Model classes: `step_recog/models.py`
4. Dataloader: `step_recog/datasets/milly.py` (methods *_construct_loader* and *__getitem__*)
-  - class *Milly_multifeature_v4* loads video frames and returns formated features
-  - class *Milly_multifeature_v5* loads (preprocessed) features and returns formated features
+  - class *Milly_multifeature_v4* loads video frames and returns features
+  - class *Milly_multifeature_v5* loads and returns (preprocessed) features
5. Image augmentation: `tools/augmentation.py` (function *get_augmentation*)
6. Basic configuration: `step_recog/config/defaults.py` (most important), `act_recog/config/defaults.py`, `auditory_slowfast/config/defaults.py`
7. Visualizer: `step_recog/full/visualize.py` combines dataloading, model prediction, and a state machine, connecting the user interface to the trained models.
32 changes: 30 additions & 2 deletions act_recog/models/video_model_builder.py
@@ -14,14 +14,19 @@
from torch.nn.init import normal_
from torch.utils import model_zoo
from copy import deepcopy
from PIL import Image
import pdb

from .build import MODEL_REGISTRY
from act_recog.datasets.transform import uniform_crop
from torchvision import transforms

def max_norm(x):
    return x / x.max()

@MODEL_REGISTRY.register()
class Omnivore(nn.Module):
-   def __init__(self, cfg):
+   def __init__(self, cfg, resize=True):
        super().__init__()

        # model
@@ -32,15 +37,38 @@ def __init__(self, cfg):

self.heads = self.model.heads
self.model.heads = nn.Identity()
self.transform = transforms.Compose([
transforms.ToPILImage(),
transforms.Resize(self.cfg.MODEL.IN_SIZE),
transforms.CenterCrop(self.cfg.MODEL.IN_SIZE),
transforms.ToTensor(),
transforms.Lambda(max_norm),
transforms.Normalize(mean=self.cfg.MODEL.MEAN, std=self.cfg.MODEL.STD),
])

if not resize:
self.transform = transforms.Compose([
transforms.ToTensor(),
transforms.Lambda(max_norm),
transforms.Normalize(mean=self.cfg.MODEL.MEAN, std=self.cfg.MODEL.STD),
])

def forward(self, x, return_embedding=False): # C T H W
shoulder = self.model(x, input_type="video")
y = self.heads(shoulder)
if return_embedding:
return y, shoulder
return y

def prepare_image(self, im, bgr2rgb = True):
# 1,C,H,W
if isinstance(im, Image.Image):
im = np.array(im)

im = self.transform(im).float()
return im

def prepare_image_v2(self, im, bgr2rgb = True):
# 1,C,H,W
im = prepare_image(im, self.cfg.MODEL.MEAN, self.cfg.MODEL.STD, self.cfg.MODEL.IN_SIZE, bgr2rgb)
return im
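The new `resize` flag selects between two `prepare_image` pipelines: with `resize=True` (the default) frames are resized and center-cropped to `MODEL.IN_SIZE` inside the model; with `resize=False` the transform only converts to tensor, max-normalizes, and standardizes, so the caller must hand in frames already at `IN_SIZE`. A minimal usage sketch — the config path and random frames are illustrative, `load_config` is assumed to accept a YAML path (it is imported elsewhere in this diff), and the Omnivore weights must be available for the forward pass:

```python
import numpy as np
import torch

from act_recog.config import load_config
from act_recog.models import Omnivore

cfg = load_config("example/config/OMNIVORE.yaml")  # illustrative path

model = Omnivore(cfg, resize=False).eval()  # caller is responsible for sizing frames

# Stand-ins for frames already resized/center-cropped to IN_SIZE (224 assumed).
frames = [np.random.randint(0, 256, (224, 224, 3), dtype=np.uint8)
          for _ in range(cfg.MODEL.NFRAMES)]

x = torch.stack([model.prepare_image(f) for f in frames], dim=1)[None]  # 1,C,T,H,W
with torch.no_grad():
    logits, shoulder = model(x, return_embedding=True)  # shoulder = pre-head embedding
```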
365 changes: 365 additions & 0 deletions example/Perception_examples.ipynb

Large diffs are not rendered by default.

28 changes: 28 additions & 0 deletions example/config/M2.yaml
@@ -0,0 +1,28 @@
_BASE_: STEPGRU_BASE.yaml
MODEL:
  OMNIGRU_CHECKPOINT_URL: 'models/M2.pt'
  OUTPUT_DIM: 8
  YOLO_CHECKPOINT_URL: 'models/bbn_yolo_M2.pt'

DATASET:
  TR_ANNOTATIONS_FILE: "labels/M2_Alabama+BBN_videos_M2-19.csv"
  VL_ANNOTATIONS_FILE: "labels/M2_Alabama+BBN_videos_M2-19.csv"
  TS_ANNOTATIONS_FILE: "labels/M2_Alabama+BBN_videos_M2-19.csv"

OUTPUT:
  LOCATION: "output"

TRAIN:
  ENABLE: False

SKILLS:
  - NAME: M2 - Apply Tourniquet
    STEPS:
      - Place tourniquet over affected extremity 2-3 inches above wound site.
      - Pull tourniquet tight.
      - Apply strap to strap body.
      - Turn windless clock wise or counter clockwise until hemorrhage is controlled.
      - Lock windless into the windless keeper.
      - Pull remaining strap over the windless keeper.
      - Secure strap and windless keeper with keeper securing device.
      - Mark time on securing device strap with permanent marker.
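`_BASE_: STEPGRU_BASE.yaml` means this file only overrides the base configuration. The repo's actual loader lives under `step_recog/config` and may work differently; below is only a generic sketch of how a `_BASE_` key is commonly resolved in yacs-style configs:

```python
import os
import yaml

def deep_update(dst, src):
    # Child keys override the base; nested dicts merge recursively.
    for key, value in src.items():
        if isinstance(value, dict) and isinstance(dst.get(key), dict):
            deep_update(dst[key], value)
        else:
            dst[key] = value

def load_with_base(path):
    with open(path) as f:
        cfg = yaml.safe_load(f) or {}
    base = cfg.pop("_BASE_", None)
    if base is None:
        return cfg
    merged = load_with_base(os.path.join(os.path.dirname(path), base))
    deep_update(merged, cfg)
    return merged

cfg = load_with_base("example/config/M2.yaml")  # M2 keys now sit on top of the base keys
```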
6 changes: 6 additions & 0 deletions example/config/OMNIVORE.yaml
@@ -0,0 +1,6 @@
MODEL:
  ARCH: omnivore_swinB_epic
  MODEL_NAME: Omnivore
  NFRAMES: 32
  MEAN: [0.485, 0.456, 0.406]
  STD: [0.229, 0.224, 0.225]
68 changes: 68 additions & 0 deletions example/config/SLOWFAST_R50.yaml
@@ -0,0 +1,68 @@
TRAIN:
  ENABLE: False
  DATASET: epickitchens
  BATCH_SIZE: 64
  EVAL_PERIOD: 2
  CHECKPOINT_PERIOD: 1
  CHECKPOINT_EPOCH_RESET: True
  AUTO_RESUME: True
  CHECKPOINT_FILE_PATH: "/home/user/data/SLOWFAST-AUDITORY/SLOWFAST_EPIC.pyth"
DATA:
  INPUT_CHANNEL_NUM: [1, 1]
AUDIO_DATA:
  CLIP_SECS: 1.999
  NUM_FRAMES: 400
SLOWFAST:
  ALPHA: 4
  BETA_INV: 8
  FUSION_CONV_CHANNEL_RATIO: 2
  FUSION_KERNEL_SZ: 7
RESNET:
  ZERO_INIT_FINAL_BN: True
  WIDTH_PER_GROUP: 64
  NUM_GROUPS: 1
  DEPTH: 50
  TRANS_FUNC: bottleneck_transform
  STRIDE_1X1: False
  NUM_BLOCK_TEMP_KERNEL: [[3, 3], [4, 4], [6, 6], [3, 3]]
  FREQUENCY_STRIDES: [[1, 1], [2, 2], [2, 2], [2, 2]]
  FREQUENCY_DILATIONS: [[1, 1], [1, 1], [1, 1], [1, 1]]
BN:
  USE_PRECISE_STATS: True
  FREEZE: True
  NUM_BATCHES_PRECISE: 200
SOLVER:
  BASE_LR: 0.001
  LR_POLICY: steps_with_relative_lrs
  STEPS: [0, 20, 25]
  LRS: [1, 0.1, 0.01]
  MAX_EPOCH: 30
  MOMENTUM: 0.9
  WEIGHT_DECAY: 1e-4
  WARMUP_EPOCHS: -1.0
  WARMUP_START_LR: 0.01
  OPTIMIZING_METHOD: sgd
MODEL:
  NUM_CLASSES: [34, 34]
  ARCH: slowfast
  MODEL_NAME: SlowFast
  LOSS_FUNC: cross_entropy
  DROPOUT_RATE: 0.5
TEST:
  ENABLE: False
  DATASET: epickitchens
  BATCH_SIZE: 32
  NUM_ENSEMBLE_VIEWS: 1
DATA_LOADER:
  NUM_WORKERS: 8
  PIN_MEMORY: True
EPICKITCHENS:
  TRAIN_PLUS_VAL: False
  AUDIO_DATA_FILE: "/home/user/data/BBN/new/M1/sound/files/BBN-M1-audio-windows_with_epic-structure.hdf5"
  ANNOTATIONS_DIR: "/home/user/data/BBN/new/M1/sound/files"
NUM_GPUS: 1
NUM_SHARDS: 1
RNG_SEED: 0
OUTPUT_DIR: "/home/user/data/BBN/new/M1/sound"
EXTRACT:
  ENABLE: True
31 changes: 31 additions & 0 deletions example/config/STEPGRU_BASE.yaml
@@ -0,0 +1,31 @@
MODEL:
  HIDDEN_SIZE: 1024
  CONTEXT_LENGTH: 'full'
  USE_ACTION: True   ## default True
  USE_OBJECTS: True  ## default True
  USE_AUDIO: False   ## default False
  USE_BN: False      ## default False
  DROP_OUT: 0.5

  OMNIVORE_CONFIG: 'config/OMNIVORE.yaml'
  SLOWFAST_CONFIG: 'config/SLOWFAST_R50.yaml'
DATASET:
  NAME: 'Milly'
  LOCATION: 'videos/frames'
  AUDIO_LOCATION: '/sound'
  INCLUDE_IMAGE_AUGMENTATIONS: True
  INCLUDE_TIME_AUGMENTATIONS: False
  IMAGE_AUGMENTATION_PERCENTAGE: 0.8
DATALOADER:
  NUM_WORKERS: 12
  PIN_MEMORY: True
TRAIN:
  ENABLE: True
  USE_CROSS_VALIDATION: True  ## default True
  USE_CLASS_WEIGHT: True      ## default True
  NUM_GPUS: 1
  BATCH_SIZE: 8       # 32
  OPT: "adam"         # adam | sgd | rmsprop
  LR: 0.001
  EPOCHS: 25
  CV_TEST_TYPE: None  # 10p | bbn | None
8 changes: 8 additions & 0 deletions example/labels/M2_Alabama+BBN_videos_M2-19.csv
@@ -0,0 +1,8 @@
narration_id,participant_id,video_id,narration_timestamp,start_timestamp,stop_timestamp,start_frame,stop_frame,narration,verb,verb_class,noun,noun_class,all_nouns,all_noun_classes,video_fps
302,M2,M2-19,00:00:00.000,00:00:00.000,00:00:00.000,374,638,Place tourniquet with over effected extremity 2-3 inches above wound site.,Place tourniquet with over effected extremity 2-3 inches above wound site.,0,Place tourniquet with over effected extremity 2-3 inches above wound site.,0,['Place tourniquet with over effected extremity 2-3 inches above wound site.'],[0],30
303,M2,M2-19,00:00:00.000,00:00:00.000,00:00:00.000,677,785,Pull tourniquet tight.,Pull tourniquet tight.,1,Pull tourniquet tight.,1,['Pull tourniquet tight.'],[1],30
304,M2,M2-19,00:00:00.000,00:00:00.000,00:00:00.000,806,884,Cinch tourniquet strap.,Cinch tourniquet strap.,2,Cinch tourniquet strap.,2,['Cinch tourniquet strap.'],[2],30
305,M2,M2-19,00:00:00.000,00:00:00.000,00:00:00.000,896,1072,Turn windless clock wise or counter clockwise until hemorrhage is controlled .,Turn windless clock wise or counter clockwise until hemorrhage is controlled .,3,Turn windless clock wise or counter clockwise until hemorrhage is controlled .,3,['Turn windless clock wise or counter clockwise until hemorrhage is controlled .'],[3],30
306,M2,M2-19,00:00:00.000,00:00:00.000,00:00:00.000,1187,1247,Cinch tourniquet strap.,Cinch tourniquet strap.,2,Cinch tourniquet strap.,2,['Cinch tourniquet strap.'],[2],30
307,M2,M2-19,00:00:00.000,00:00:00.000,00:00:00.000,1254,1371,Lock windless into the windless keeper.,Lock windless into the windless keeper.,4,Lock windless into the windless keeper.,4,['Lock windless into the windless keeper.'],[4],30
308,M2,M2-19,00:00:00.000,00:00:00.000,00:00:00.000,1454,1503,Mark time on securing device strap with permanent marker.,Mark time on securing device strap with permanent marker.,7,Mark time on securing device strap with permanent marker.,7,['Mark time on securing device strap with permanent marker.'],[7],30
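In this annotation format the step text is replicated across the narration/verb/noun columns, and `start_frame`/`stop_frame` are the fields that vary per step instance. A quick sketch of consuming it (pandas assumed; nothing here is repo code):

```python
import pandas as pd

ann = pd.read_csv("example/labels/M2_Alabama+BBN_videos_M2-19.csv")

# Each row is one step instance delimited by [start_frame, stop_frame] at video_fps.
for _, row in ann.iterrows():
    duration = (row.stop_frame - row.start_frame) / row.video_fps
    print(row.video_id, row.verb_class, f"{duration:.1f}s", row.narration)
```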
Empty file added example/models/.gitignore
Binary file added example/output/confusion_matrix.png
20 changes: 20 additions & 0 deletions example/output/metrics.txt
@@ -0,0 +1,20 @@
              precision    recall  f1-score   support

      Step 1       1.00      1.00      1.00        10
      Step 2       0.38      1.00      0.56         5
      Step 3       0.50      0.14      0.22         7
      Step 4       0.75      0.86      0.80         7
      Step 5       0.33      1.00      0.50         5
      Step 6       0.00      0.00      0.00         0
      Step 7       0.00      0.00      0.00         0
      Step 8       0.60      1.00      0.75         3
     No step       0.97      0.63      0.76        46

    accuracy                           0.71        83
   macro avg       0.50      0.63      0.51        83
weighted avg       0.83      0.71      0.72        83

Categorical accuracy: 0.71
Weighted accuracy: 0.80
Balanced accuracy: 0.80
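The table follows sklearn's `classification_report` layout (Steps 6 and 7 have zero test support, hence the 0.00 rows). A sketch of producing the same kind of report, with placeholder labels rather than the repo's outputs:

```python
import numpy as np
from sklearn.metrics import balanced_accuracy_score, classification_report

rng = np.random.default_rng(0)
y_true = rng.integers(0, 9, size=83)  # placeholder labels: 8 steps + "No step"
y_pred = rng.integers(0, 9, size=83)  # placeholder predictions

names = [f"Step {i}" for i in range(1, 9)] + ["No step"]
print(classification_report(y_true, y_pred, labels=list(range(9)),
                            target_names=names, zero_division=0))
print("Balanced accuracy:", round(balanced_accuracy_score(y_true, y_pred), 2))
```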
Empty file added example/videos/.gitignore
24 changes: 11 additions & 13 deletions step_recog/datasets/milly.py
@@ -140,6 +140,8 @@ def __len__(self):
from ultralytics import YOLO
#from torch.quantization import quantize_dynamic

from torchvision import transforms

from step_recog.full.download import cached_download_file
from step_recog.full.clip_patches import ClipPatches
@@ -180,6 +182,10 @@ def __init__(self, cfg, split='train', filter=None):

self.augment_configs = {}
self.device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
self.transform = transforms.Compose([
transforms.Resize(self.omni_cfg.MODEL.IN_SIZE),
transforms.CenterCrop(self.omni_cfg.MODEL.IN_SIZE)
])

        if self.cfg.MODEL.USE_OBJECTS:
            yolo_checkpoint = cached_download_file(cfg.MODEL.YOLO_CHECKPOINT_URL)
@@ -192,7 +198,7 @@ def __init__(self, cfg, split='train', filter=None):
            self.clip_patches.eval()

        if self.cfg.MODEL.USE_ACTION:
-           self.omnivore = Omnivore(self.omni_cfg)
+           self.omnivore = Omnivore(self.omni_cfg, resize=False)
            self.omnivore.eval()

        self.sound_cache = deque(maxlen=5)
@@ -431,7 +437,7 @@ def _construct_loader(self, split):
        video_windows = []
        previous_stop_frame = 1

-       for _, step_ann in vid_ann.iterrows():
+       for idx, step_ann in vid_ann.iterrows():
            win_size = self.rng.integers(len(win_size_sec))
            hop_size = self.rng.integers(len(hop_size_perc))
@@ -537,15 +543,6 @@ def augment_frames(self, frames, frame_ids, video_id):

        return frames

-   #Both CLIP and Omnivore resize to 224, 224
-   #With this code, Yolo is using the same size
-   def _resize_img(self, im, expected_size=224):
-       scale = max(expected_size/im.shape[0], expected_size/im.shape[1])
-       im = cv2.resize(im, (0,0), fx=scale, fy=scale)
-       im, _ = uniform_crop(im, expected_size, 1)
-
-       return im

    def _get_sound_cache(self, video, path):
        sound = None
@@ -587,9 +584,10 @@ def _load_frames(self, window):

                ## frame = cv2.imread(frame_path)
                ## frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                frame = Image.open(frame_path)
+               frame = self.transform(frame)
                frame = np.array(frame)
-               frame = self._resize_img(frame)
                self.frame_cache[frame_id] = {"frame": frame, "new": True}

            window_frames.append(frame)
            window_frame_ids.append(frame_id)
@@ -628,7 +626,7 @@ def _extract_img_features(self, window_frames):

    def _extract_act_features(self, window_frames):
        frame_idx = np.linspace(0, len(window_frames) - 1, self.omni_cfg.MODEL.NFRAMES).astype('long')
-       X_omnivore = [self.omnivore.prepare_image(frame, bgr2rgb=False) for frame in window_frames]
+       X_omnivore = [self.omnivore.prepare_image(frame) for frame in window_frames]
        X_omnivore = torch.stack(list(X_omnivore), dim=1)[None]
        X_omnivore = X_omnivore[:, :, frame_idx, :, :]
        _, Z_action = self.omnivore(X_omnivore.to(self.device), return_embedding=True)
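Net effect of the `milly.py` changes: `_resize_img` (cv2 scaling plus `uniform_crop`) is gone, and a single torchvision `Resize` + `CenterCrop` runs once per frame at load time, so YOLO, the CLIP patches, and Omnivore (built with `resize=False`) all see identically sized frames. A minimal sketch of that shared path — the `IN_SIZE` value and the synthetic frame are illustrative:

```python
import numpy as np
from PIL import Image
from torchvision import transforms

IN_SIZE = 224  # assumed value of omni_cfg.MODEL.IN_SIZE
shared = transforms.Compose([
    transforms.Resize(IN_SIZE),      # scale the short side to IN_SIZE
    transforms.CenterCrop(IN_SIZE),  # then crop to IN_SIZE x IN_SIZE
])

raw = Image.fromarray(np.zeros((480, 640, 3), dtype=np.uint8))  # stand-in video frame
frame = np.array(shared(raw))  # 224x224x3 array, reused by every downstream model
```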
7 changes: 6 additions & 1 deletion step_recog/full/clip_patches.py
@@ -22,12 +22,17 @@ def stack_patches(self, patches):
            for x in patches
        ])

    def forward(self, image, xywh=None, patch_shape=None, include_frame=False):
        if isinstance(image, Image.Image):
            image = np.array(image)

        patches = [] if xywh is None else extract_patches(image, xywh, patch_shape)

        if include_frame:
            patches.insert(0, image)
        if not patches:
            return torch.zeros((0, 512), device=self._device.device)

        X = self.stack_patches(patches)
        Z = self.model.encode_image(X)
        return Z
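The new guard changes `forward`'s behavior when there is nothing to encode: with no boxes and `include_frame=False` it now returns an empty `(0, 512)` tensor (512 matching CLIP ViT-B/32's embedding width) instead of failing on an empty stack. A hypothetical call pattern (constructor arguments are elided; `ClipPatches` may require a device or model name):

```python
import numpy as np
from step_recog.full.clip_patches import ClipPatches

clip_patches = ClipPatches()
clip_patches.eval()

frame = np.zeros((224, 224, 3), dtype=np.uint8)
Z = clip_patches(frame, xywh=None, include_frame=False)  # no detections this frame
assert Z.shape == (0, 512)  # empty embedding batch, safe to concatenate downstream
```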
16 changes: 8 additions & 8 deletions step_recog/full/model.py
@@ -5,6 +5,8 @@
from ultralytics import YOLO
import ipdb
import cv2
from torchvision import transforms
from PIL import Image

from act_recog.models import Omnivore
from act_recog.config import load_config as act_load_config
@@ -45,12 +47,16 @@ def __init__(self, cfg_file, video_fps = 30):
            for step in skill['STEPS']
        ])
        self.MAX_OBJECTS = 25
        self.transform = transforms.Compose([
            transforms.Resize(self.omni_cfg.MODEL.IN_SIZE),
            transforms.CenterCrop(self.omni_cfg.MODEL.IN_SIZE)
        ])

        # build model
        self.head = OmniGRU(self.cfg, load=True)
        self.head.eval()
        if self.cfg.MODEL.USE_ACTION:
-           self.omnivore = Omnivore(self.omni_cfg)
+           self.omnivore = Omnivore(self.omni_cfg, resize=False)
        if self.cfg.MODEL.USE_OBJECTS:
            yolo_checkpoint = cached_download_file(self.cfg.MODEL.YOLO_CHECKPOINT_URL)
            self.yolo = YOLO(yolo_checkpoint)
@@ -80,13 +86,7 @@ def queue_frame(self, image):

            self.omnivore_input_queue.append(X_omnivore)

    def prepare(self, im):
-       expected_size = 224
-       im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB)
-       scale = max(expected_size/im.shape[0], expected_size/im.shape[1])
-       im = cv2.resize(im, (0,0), fx=scale, fy=scale)
-       im, _ = uniform_crop(im, expected_size, 1)
-
-       return im
+       return self.transform(Image.fromarray(im))

    def forward(self, image, queue_omni_frame=True):
        # compute yolo
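A rough end-to-end sketch of the class whose `prepare` changed above. The class name `StepPredictor` and the call pattern are read off this diff's signatures and are assumptions; the config path is illustrative, and the checkpoints referenced by the config must be downloadable:

```python
import numpy as np
from step_recog.full.model import StepPredictor  # assumed class name in this module

predictor = StepPredictor("example/config/M2.yaml", video_fps=30)

frame = np.zeros((480, 640, 3), dtype=np.uint8)  # stand-in BGR frame, e.g. from cv2.VideoCapture
step_probs = predictor(frame)  # queues the frame and returns per-step predictions
```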
6 changes: 4 additions & 2 deletions tools/run_step_recog.py
@@ -102,8 +102,10 @@ def my_train_test_split(cfg, videos):
        videos, video_test = train_test_split(videos, test_size=0.10, random_state=2359)  #M5 1030: only with BBN 041624.zip
    elif "R18" in cfg.SKILLS[0]["NAME"]:
        videos, video_test = train_test_split(videos, test_size=0.10, random_state=2343)  #R18 1740: only with BBN seal_videos.zip
-   else:  #A8, M4, R16, R19
-       videos, video_test = train_test_split(videos, test_size=0.10, random_state=1030)
+   elif "A8" in cfg.SKILLS[0]["NAME"]:
+       videos, video_test = train_test_split(videos, test_size=0.10, random_state=2329)  #A8
+   else:  #M4, R16, R19
+       videos, video_test = train_test_split(videos, test_size=0.10, random_state=1030)

    return videos, video_test

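Each skill pins its held-out videos with a fixed `random_state`, so the test split is reproducible across runs; this change gives A8 its own seed instead of the shared default. The same idea in isolation (toy video ids):

```python
from sklearn.model_selection import train_test_split

videos = [f"M2-{i}" for i in range(1, 21)]  # toy video ids
train_videos, test_videos = train_test_split(videos, test_size=0.10, random_state=1030)
# Identical random_state -> identical held-out videos on every run.
```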