Commit 7cdb45b

Merge branch 'fabiofelix:main' into main
fabiofelix authored Aug 20, 2024
2 parents 9b83187 + 57b46ee
Showing 20 changed files with 587 additions and 73 deletions.
4 changes: 2 additions & 2 deletions README.md
@@ -131,8 +131,8 @@ python tools/test.py --cfg config/M3.yaml
2. Training/evaluation routines: `step_recog/iterators.py` (functions *train*, *evaluate*)
3. Model classes: `step_recog/models.py`
4. Dataloader: `step_recog/datasets/milly.py` (methods *_construct_loader* and *__getitem__*)
-  - class *Milly_multifeature_v4* loads video frames and returns formated features
-  - class *Milly_multifeature_v5* loads (preprocessed) features and returns formated features
+  - class *Milly_multifeature_v4* loads video frames and returns features
+  - class *Milly_multifeature_v5* loads and returns (preprocessed) features
5. Image augmentation: `tools/augmentation.py` (function *get_augmentation*)
6. Basic configuration: `step_recog/config/defaults.py` (most important), `act_recog/config/defaults.py`, `auditory_slowfast/config/defaults.py`
7. Visualizer: `step_recog/full/visualize.py` combines dataloading, model prediction, and a state machine, connecting the user interface to the trained models.
32 changes: 30 additions & 2 deletions act_recog/models/video_model_builder.py
@@ -14,14 +14,19 @@
from torch.nn.init import normal_
from torch.utils import model_zoo
from copy import deepcopy
from PIL import Image
import pdb

from .build import MODEL_REGISTRY
from act_recog.datasets.transform import uniform_crop
from torchvision import transforms

def max_norm(x):
    return x / x.max()

@MODEL_REGISTRY.register()
class Omnivore(nn.Module):
-   def __init__(self, cfg):
+   def __init__(self, cfg, resize=True):
        super().__init__()

        # model
@@ -32,15 +37,38 @@ def __init__(self, cfg):

self.heads = self.model.heads
self.model.heads = nn.Identity()
self.transform = transforms.Compose([
transforms.ToPILImage(),
transforms.Resize(self.cfg.MODEL.IN_SIZE),
transforms.CenterCrop(self.cfg.MODEL.IN_SIZE),
transforms.ToTensor(),
transforms.Lambda(max_norm),
transforms.Normalize(mean=self.cfg.MODEL.MEAN, std=self.cfg.MODEL.STD),
])

if not resize:
self.transform = transforms.Compose([
transforms.ToTensor(),
transforms.Lambda(max_norm),
transforms.Normalize(mean=self.cfg.MODEL.MEAN, std=self.cfg.MODEL.STD),
])

def forward(self, x, return_embedding=False): # C T H W
shoulder = self.model(x, input_type="video")
y = self.heads(shoulder)
if return_embedding:
return y, shoulder
return y

def prepare_image(self, im, bgr2rgb = True):
# 1,C,H,W
if isinstance(im, Image.Image):
im = np.array(im)

im = self.transform(im).float()
return im

def prepare_image_v2(self, im, bgr2rgb = True):
# 1,C,H,W
im = prepare_image(im, self.cfg.MODEL.MEAN, self.cfg.MODEL.STD, self.cfg.MODEL.IN_SIZE, bgr2rgb)
return im
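The new `resize` flag selects between two `prepare_image` pipelines: with `resize=True` (the default) frames are resized and center-cropped to `MODEL.IN_SIZE` inside the model; with `resize=False` the transform only converts to tensor, max-normalizes, and standardizes, so the caller must hand in frames already at `IN_SIZE`. A minimal usage sketch — the config path and random frames are illustrative, `load_config` is assumed to accept a YAML path (it is imported elsewhere in this diff), and the Omnivore weights must be available for the forward pass:

```python
import numpy as np
import torch

from act_recog.config import load_config
from act_recog.models import Omnivore

cfg = load_config("example/config/OMNIVORE.yaml")  # illustrative path

model = Omnivore(cfg, resize=False).eval()  # caller is responsible for sizing frames

# Stand-ins for frames already resized/center-cropped to IN_SIZE (224 assumed).
frames = [np.random.randint(0, 256, (224, 224, 3), dtype=np.uint8)
          for _ in range(cfg.MODEL.NFRAMES)]

x = torch.stack([model.prepare_image(f) for f in frames], dim=1)[None]  # 1,C,T,H,W
with torch.no_grad():
    logits, shoulder = model(x, return_embedding=True)  # shoulder = pre-head embedding
```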
365 changes: 365 additions & 0 deletions example/Perception_examples.ipynb

Large diffs are not rendered by default.

28 changes: 28 additions & 0 deletions example/config/M2.yaml
@@ -0,0 +1,28 @@
_BASE_: STEPGRU_BASE.yaml
MODEL:
  OMNIGRU_CHECKPOINT_URL: 'models/M2.pt'
  OUTPUT_DIM: 8
  YOLO_CHECKPOINT_URL: 'models/bbn_yolo_M2.pt'

DATASET:
  TR_ANNOTATIONS_FILE: "labels/M2_Alabama+BBN_videos_M2-19.csv"
  VL_ANNOTATIONS_FILE: "labels/M2_Alabama+BBN_videos_M2-19.csv"
  TS_ANNOTATIONS_FILE: "labels/M2_Alabama+BBN_videos_M2-19.csv"

OUTPUT:
  LOCATION: "output"

TRAIN:
  ENABLE: False

SKILLS:
  - NAME: M2 - Apply Tourniquet
    STEPS:
      - Place tourniquet over affected extremity 2-3 inches above wound site.
      - Pull tourniquet tight.
      - Apply strap to strap body.
      - Turn windless clock wise or counter clockwise until hemorrhage is controlled.
      - Lock windless into the windless keeper.
      - Pull remaining strap over the windless keeper.
      - Secure strap and windless keeper with keeper securing device.
      - Mark time on securing device strap with permanent marker.
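`_BASE_: STEPGRU_BASE.yaml` means this file only overrides the base configuration. The repo's actual loader lives under `step_recog/config` and may work differently; below is only a generic sketch of how a `_BASE_` key is commonly resolved in yacs-style configs:

```python
import os
import yaml

def deep_update(dst, src):
    # Child keys override the base; nested dicts merge recursively.
    for key, value in src.items():
        if isinstance(value, dict) and isinstance(dst.get(key), dict):
            deep_update(dst[key], value)
        else:
            dst[key] = value

def load_with_base(path):
    with open(path) as f:
        cfg = yaml.safe_load(f) or {}
    base = cfg.pop("_BASE_", None)
    if base is None:
        return cfg
    merged = load_with_base(os.path.join(os.path.dirname(path), base))
    deep_update(merged, cfg)
    return merged

cfg = load_with_base("example/config/M2.yaml")  # M2 keys now sit on top of the base keys
```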
6 changes: 6 additions & 0 deletions example/config/OMNIVORE.yaml
@@ -0,0 +1,6 @@
MODEL:
  ARCH: omnivore_swinB_epic
  MODEL_NAME: Omnivore
  NFRAMES: 32
  MEAN: [0.485, 0.456, 0.406]
  STD: [0.229, 0.224, 0.225]
68 changes: 68 additions & 0 deletions example/config/SLOWFAST_R50.yaml
@@ -0,0 +1,68 @@
TRAIN:
  ENABLE: False
  DATASET: epickitchens
  BATCH_SIZE: 64
  EVAL_PERIOD: 2
  CHECKPOINT_PERIOD: 1
  CHECKPOINT_EPOCH_RESET: True
  AUTO_RESUME: True
  CHECKPOINT_FILE_PATH: "/home/user/data/SLOWFAST-AUDITORY/SLOWFAST_EPIC.pyth"
DATA:
  INPUT_CHANNEL_NUM: [1, 1]
AUDIO_DATA:
  CLIP_SECS: 1.999
  NUM_FRAMES: 400
SLOWFAST:
  ALPHA: 4
  BETA_INV: 8
  FUSION_CONV_CHANNEL_RATIO: 2
  FUSION_KERNEL_SZ: 7
RESNET:
  ZERO_INIT_FINAL_BN: True
  WIDTH_PER_GROUP: 64
  NUM_GROUPS: 1
  DEPTH: 50
  TRANS_FUNC: bottleneck_transform
  STRIDE_1X1: False
  NUM_BLOCK_TEMP_KERNEL: [[3, 3], [4, 4], [6, 6], [3, 3]]
  FREQUENCY_STRIDES: [[1, 1], [2, 2], [2, 2], [2, 2]]
  FREQUENCY_DILATIONS: [[1, 1], [1, 1], [1, 1], [1, 1]]
BN:
  USE_PRECISE_STATS: True
  FREEZE: True
  NUM_BATCHES_PRECISE: 200
SOLVER:
  BASE_LR: 0.001
  LR_POLICY: steps_with_relative_lrs
  STEPS: [0, 20, 25]
  LRS: [1, 0.1, 0.01]
  MAX_EPOCH: 30
  MOMENTUM: 0.9
  WEIGHT_DECAY: 1e-4
  WARMUP_EPOCHS: -1.0
  WARMUP_START_LR: 0.01
  OPTIMIZING_METHOD: sgd
MODEL:
  NUM_CLASSES: [34, 34]
  ARCH: slowfast
  MODEL_NAME: SlowFast
  LOSS_FUNC: cross_entropy
  DROPOUT_RATE: 0.5
TEST:
  ENABLE: False
  DATASET: epickitchens
  BATCH_SIZE: 32
  NUM_ENSEMBLE_VIEWS: 1
DATA_LOADER:
  NUM_WORKERS: 8
  PIN_MEMORY: True
EPICKITCHENS:
  TRAIN_PLUS_VAL: False
  AUDIO_DATA_FILE: "/home/user/data/BBN/new/M1/sound/files/BBN-M1-audio-windows_with_epic-structure.hdf5"
  ANNOTATIONS_DIR: "/home/user/data/BBN/new/M1/sound/files"
NUM_GPUS: 1
NUM_SHARDS: 1
RNG_SEED: 0
OUTPUT_DIR: "/home/user/data/BBN/new/M1/sound"
EXTRACT:
  ENABLE: True
31 changes: 31 additions & 0 deletions example/config/STEPGRU_BASE.yaml
@@ -0,0 +1,31 @@
MODEL:
  HIDDEN_SIZE: 1024
  CONTEXT_LENGTH: 'full'
  USE_ACTION: True   ## default True
  USE_OBJECTS: True  ## default True
  USE_AUDIO: False   ## default False
  USE_BN: False      ## default False
  DROP_OUT: 0.5

  OMNIVORE_CONFIG: 'config/OMNIVORE.yaml'
  SLOWFAST_CONFIG: 'config/SLOWFAST_R50.yaml'
DATASET:
  NAME: 'Milly'
  LOCATION: 'videos/frames'
  AUDIO_LOCATION: '/sound'
  INCLUDE_IMAGE_AUGMENTATIONS: True
  INCLUDE_TIME_AUGMENTATIONS: False
  IMAGE_AUGMENTATION_PERCENTAGE: 0.8
DATALOADER:
  NUM_WORKERS: 12
  PIN_MEMORY: True
TRAIN:
  ENABLE: True
  USE_CROSS_VALIDATION: True  ## default True
  USE_CLASS_WEIGHT: True      ## default True
  NUM_GPUS: 1
  BATCH_SIZE: 8       # 32
  OPT: "adam"         # adam | sgd | rmsprop
  LR: 0.001
  EPOCHS: 25
  CV_TEST_TYPE: None  # 10p | bbn | None
8 changes: 8 additions & 0 deletions example/labels/M2_Alabama+BBN_videos_M2-19.csv
@@ -0,0 +1,8 @@
narration_id,participant_id,video_id,narration_timestamp,start_timestamp,stop_timestamp,start_frame,stop_frame,narration,verb,verb_class,noun,noun_class,all_nouns,all_noun_classes,video_fps
302,M2,M2-19,00:00:00.000,00:00:00.000,00:00:00.000,374,638,Place tourniquet with over effected extremity 2-3 inches above wound site.,Place tourniquet with over effected extremity 2-3 inches above wound site.,0,Place tourniquet with over effected extremity 2-3 inches above wound site.,0,['Place tourniquet with over effected extremity 2-3 inches above wound site.'],[0],30
303,M2,M2-19,00:00:00.000,00:00:00.000,00:00:00.000,677,785,Pull tourniquet tight.,Pull tourniquet tight.,1,Pull tourniquet tight.,1,['Pull tourniquet tight.'],[1],30
304,M2,M2-19,00:00:00.000,00:00:00.000,00:00:00.000,806,884,Cinch tourniquet strap.,Cinch tourniquet strap.,2,Cinch tourniquet strap.,2,['Cinch tourniquet strap.'],[2],30
305,M2,M2-19,00:00:00.000,00:00:00.000,00:00:00.000,896,1072,Turn windless clock wise or counter clockwise until hemorrhage is controlled .,Turn windless clock wise or counter clockwise until hemorrhage is controlled .,3,Turn windless clock wise or counter clockwise until hemorrhage is controlled .,3,['Turn windless clock wise or counter clockwise until hemorrhage is controlled .'],[3],30
306,M2,M2-19,00:00:00.000,00:00:00.000,00:00:00.000,1187,1247,Cinch tourniquet strap.,Cinch tourniquet strap.,2,Cinch tourniquet strap.,2,['Cinch tourniquet strap.'],[2],30
307,M2,M2-19,00:00:00.000,00:00:00.000,00:00:00.000,1254,1371,Lock windless into the windless keeper.,Lock windless into the windless keeper.,4,Lock windless into the windless keeper.,4,['Lock windless into the windless keeper.'],[4],30
308,M2,M2-19,00:00:00.000,00:00:00.000,00:00:00.000,1454,1503,Mark time on securing device strap with permanent marker.,Mark time on securing device strap with permanent marker.,7,Mark time on securing device strap with permanent marker.,7,['Mark time on securing device strap with permanent marker.'],[7],30
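In this annotation format the step text is replicated across the narration/verb/noun columns, and `start_frame`/`stop_frame` are the fields that vary per step instance. A quick sketch of consuming it (pandas assumed; nothing here is repo code):

```python
import pandas as pd

ann = pd.read_csv("example/labels/M2_Alabama+BBN_videos_M2-19.csv")

# Each row is one step instance delimited by [start_frame, stop_frame] at video_fps.
for _, row in ann.iterrows():
    duration = (row.stop_frame - row.start_frame) / row.video_fps
    print(row.video_id, row.verb_class, f"{duration:.1f}s", row.narration)
```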
Empty file added example/models/.gitignore
Binary file added example/output/confusion_matrix.png
20 changes: 20 additions & 0 deletions example/output/metrics.txt
@@ -0,0 +1,20 @@
              precision    recall  f1-score   support

      Step 1       1.00      1.00      1.00        10
      Step 2       0.38      1.00      0.56         5
      Step 3       0.50      0.14      0.22         7
      Step 4       0.75      0.86      0.80         7
      Step 5       0.33      1.00      0.50         5
      Step 6       0.00      0.00      0.00         0
      Step 7       0.00      0.00      0.00         0
      Step 8       0.60      1.00      0.75         3
     No step       0.97      0.63      0.76        46

    accuracy                           0.71        83
   macro avg       0.50      0.63      0.51        83
weighted avg       0.83      0.71      0.72        83

Categorical accuracy: 0.71
Weighted accuracy: 0.80
Balanced accuracy: 0.80
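The table follows sklearn's `classification_report` layout (Steps 6 and 7 have zero test support, hence the 0.00 rows). A sketch of producing the same kind of report, with placeholder labels rather than the repo's outputs:

```python
import numpy as np
from sklearn.metrics import balanced_accuracy_score, classification_report

rng = np.random.default_rng(0)
y_true = rng.integers(0, 9, size=83)  # placeholder labels: 8 steps + "No step"
y_pred = rng.integers(0, 9, size=83)  # placeholder predictions

names = [f"Step {i}" for i in range(1, 9)] + ["No step"]
print(classification_report(y_true, y_pred, labels=list(range(9)),
                            target_names=names, zero_division=0))
print("Balanced accuracy:", round(balanced_accuracy_score(y_true, y_pred), 2))
```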
Empty file added example/videos/.gitignore
24 changes: 11 additions & 13 deletions step_recog/datasets/milly.py
@@ -140,6 +140,8 @@ def __len__(self):
from ultralytics import YOLO
#from torch.quantization import quantize_dynamic

from torchvision import transforms

from step_recog.full.download import cached_download_file
from step_recog.full.clip_patches import ClipPatches
@@ -180,6 +182,10 @@ def __init__(self, cfg, split='train', filter=None):

self.augment_configs = {}
self.device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
self.transform = transforms.Compose([
transforms.Resize(self.omni_cfg.MODEL.IN_SIZE),
transforms.CenterCrop(self.omni_cfg.MODEL.IN_SIZE)
])

        if self.cfg.MODEL.USE_OBJECTS:
            yolo_checkpoint = cached_download_file(cfg.MODEL.YOLO_CHECKPOINT_URL)
@@ -192,7 +198,7 @@ def __init__(self, cfg, split='train', filter=None):
            self.clip_patches.eval()

        if self.cfg.MODEL.USE_ACTION:
-           self.omnivore = Omnivore(self.omni_cfg)
+           self.omnivore = Omnivore(self.omni_cfg, resize=False)
            self.omnivore.eval()

        self.sound_cache = deque(maxlen=5)
@@ -431,7 +437,7 @@ def _construct_loader(self, split):
        video_windows = []
        previous_stop_frame = 1

-       for _, step_ann in vid_ann.iterrows():
+       for idx, step_ann in vid_ann.iterrows():
            win_size = self.rng.integers(len(win_size_sec))
            hop_size = self.rng.integers(len(hop_size_perc))
@@ -537,15 +543,6 @@ def augment_frames(self, frames, frame_ids, video_id):

        return frames

-   #Both CLIP and Omnivore resize to 224, 224
-   #With this code, Yolo is using the same size
-   def _resize_img(self, im, expected_size=224):
-       scale = max(expected_size/im.shape[0], expected_size/im.shape[1])
-       im = cv2.resize(im, (0,0), fx=scale, fy=scale)
-       im, _ = uniform_crop(im, expected_size, 1)
-
-       return im

    def _get_sound_cache(self, video, path):
        sound = None
@@ -587,9 +584,10 @@ def _load_frames(self, window):

                ## frame = cv2.imread(frame_path)
                ## frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                frame = Image.open(frame_path)
+               frame = self.transform(frame)
                frame = np.array(frame)
-               frame = self._resize_img(frame)
                self.frame_cache[frame_id] = {"frame": frame, "new": True}

            window_frames.append(frame)
            window_frame_ids.append(frame_id)
@@ -628,7 +626,7 @@ def _extract_img_features(self, window_frames):

    def _extract_act_features(self, window_frames):
        frame_idx = np.linspace(0, len(window_frames) - 1, self.omni_cfg.MODEL.NFRAMES).astype('long')
-       X_omnivore = [self.omnivore.prepare_image(frame, bgr2rgb=False) for frame in window_frames]
+       X_omnivore = [self.omnivore.prepare_image(frame) for frame in window_frames]
        X_omnivore = torch.stack(list(X_omnivore), dim=1)[None]
        X_omnivore = X_omnivore[:, :, frame_idx, :, :]
        _, Z_action = self.omnivore(X_omnivore.to(self.device), return_embedding=True)
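Net effect of the `milly.py` changes: `_resize_img` (cv2 scaling plus `uniform_crop`) is gone, and a single torchvision `Resize` + `CenterCrop` runs once per frame at load time, so YOLO, the CLIP patches, and Omnivore (built with `resize=False`) all see identically sized frames. A minimal sketch of that shared path — the `IN_SIZE` value and the synthetic frame are illustrative:

```python
import numpy as np
from PIL import Image
from torchvision import transforms

IN_SIZE = 224  # assumed value of omni_cfg.MODEL.IN_SIZE
shared = transforms.Compose([
    transforms.Resize(IN_SIZE),      # scale the short side to IN_SIZE
    transforms.CenterCrop(IN_SIZE),  # then crop to IN_SIZE x IN_SIZE
])

raw = Image.fromarray(np.zeros((480, 640, 3), dtype=np.uint8))  # stand-in video frame
frame = np.array(shared(raw))  # 224x224x3 array, reused by every downstream model
```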
7 changes: 6 additions & 1 deletion step_recog/full/clip_patches.py
@@ -22,12 +22,17 @@ def stack_patches(self, patches):
            for x in patches
        ])

    def forward(self, image, xywh=None, patch_shape=None, include_frame=False):
        if isinstance(image, Image.Image):
            image = np.array(image)

        patches = [] if xywh is None else extract_patches(image, xywh, patch_shape)

        if include_frame:
            patches.insert(0, image)
        if not patches:
            return torch.zeros((0, 512), device=self._device.device)

        X = self.stack_patches(patches)
        Z = self.model.encode_image(X)
        return Z
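The new guard changes `forward`'s behavior when there is nothing to encode: with no boxes and `include_frame=False` it now returns an empty `(0, 512)` tensor (512 matching CLIP ViT-B/32's embedding width) instead of failing on an empty stack. A hypothetical call pattern (constructor arguments are elided; `ClipPatches` may require a device or model name):

```python
import numpy as np
from step_recog.full.clip_patches import ClipPatches

clip_patches = ClipPatches()
clip_patches.eval()

frame = np.zeros((224, 224, 3), dtype=np.uint8)
Z = clip_patches(frame, xywh=None, include_frame=False)  # no detections this frame
assert Z.shape == (0, 512)  # empty embedding batch, safe to concatenate downstream
```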
16 changes: 8 additions & 8 deletions step_recog/full/model.py
@@ -5,6 +5,8 @@
from ultralytics import YOLO
import ipdb
import cv2
from torchvision import transforms
from PIL import Image

from act_recog.models import Omnivore
from act_recog.config import load_config as act_load_config
@@ -45,12 +47,16 @@ def __init__(self, cfg_file, video_fps = 30):
            for step in skill['STEPS']
        ])
        self.MAX_OBJECTS = 25
        self.transform = transforms.Compose([
            transforms.Resize(self.omni_cfg.MODEL.IN_SIZE),
            transforms.CenterCrop(self.omni_cfg.MODEL.IN_SIZE)
        ])

        # build model
        self.head = OmniGRU(self.cfg, load=True)
        self.head.eval()
        if self.cfg.MODEL.USE_ACTION:
-           self.omnivore = Omnivore(self.omni_cfg)
+           self.omnivore = Omnivore(self.omni_cfg, resize=False)
        if self.cfg.MODEL.USE_OBJECTS:
            yolo_checkpoint = cached_download_file(self.cfg.MODEL.YOLO_CHECKPOINT_URL)
            self.yolo = YOLO(yolo_checkpoint)
@@ -80,13 +86,7 @@ def queue_frame(self, image):

            self.omnivore_input_queue.append(X_omnivore)

    def prepare(self, im):
-       expected_size = 224
-       im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB)
-       scale = max(expected_size/im.shape[0], expected_size/im.shape[1])
-       im = cv2.resize(im, (0,0), fx=scale, fy=scale)
-       im, _ = uniform_crop(im, expected_size, 1)
-
-       return im
+       return self.transform(Image.fromarray(im))

    def forward(self, image, queue_omni_frame=True):
        # compute yolo
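A rough end-to-end sketch of the class whose `prepare` changed above. The class name `StepPredictor` and the call pattern are read off this diff's signatures and are assumptions; the config path is illustrative, and the checkpoints referenced by the config must be downloadable:

```python
import numpy as np
from step_recog.full.model import StepPredictor  # assumed class name in this module

predictor = StepPredictor("example/config/M2.yaml", video_fps=30)

frame = np.zeros((480, 640, 3), dtype=np.uint8)  # stand-in BGR frame, e.g. from cv2.VideoCapture
step_probs = predictor(frame)  # queues the frame and returns per-step predictions
```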
6 changes: 4 additions & 2 deletions tools/run_step_recog.py
@@ -102,8 +102,10 @@ def my_train_test_split(cfg, videos):
        videos, video_test = train_test_split(videos, test_size=0.10, random_state=2359)  #M5 1030: only with BBN 041624.zip
    elif "R18" in cfg.SKILLS[0]["NAME"]:
        videos, video_test = train_test_split(videos, test_size=0.10, random_state=2343)  #R18 1740: only with BBN seal_videos.zip
-   else:  #A8, M4, R16, R19
-       videos, video_test = train_test_split(videos, test_size=0.10, random_state=1030)
+   elif "A8" in cfg.SKILLS[0]["NAME"]:
+       videos, video_test = train_test_split(videos, test_size=0.10, random_state=2329)  #A8
+   else:  #M4, R16, R19
+       videos, video_test = train_test_split(videos, test_size=0.10, random_state=1030)

    return videos, video_test

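Each skill pins its held-out videos with a fixed `random_state`, so the test split is reproducible across runs; this change gives A8 its own seed instead of the shared default. The same idea in isolation (toy video ids):

```python
from sklearn.model_selection import train_test_split

videos = [f"M2-{i}" for i in range(1, 21)]  # toy video ids
train_videos, test_videos = train_test_split(videos, test_size=0.10, random_state=1030)
# Identical random_state -> identical held-out videos on every run.
```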