diff --git a/config/STEPGRU_BASE.yaml b/config/STEPGRU_BASE.yaml
index cfc9ddb..45cec9e 100644
--- a/config/STEPGRU_BASE.yaml
+++ b/config/STEPGRU_BASE.yaml
@@ -1,6 +1,6 @@
 MODEL:
   HIDDEN_SIZE: 1024
-  CONTEXT_LENGTH: 'full'
+  PROJECTION_SIZE: 512
   USE_ACTION: True ##default true
   USE_OBJECTS: True ##default true
   USE_AUDIO: False ##default false
@@ -10,7 +10,6 @@ MODEL:
   OMNIVORE_CONFIG: '/home/user/data/config/OMNIVORE.yaml'
   SLOWFAST_CONFIG: '/home/user/data/auditory_slowfast/configs/BBN/SLOWFAST_R50.yaml'
 DATASET:
-  NAME: 'Milly'
   CLASS: 'Milly_multifeature_v4'
   LOCATION: '/frame'
   AUDIO_LOCATION: '/sound'
diff --git a/step_recog/config/defaults.py b/step_recog/config/defaults.py
index 49028e0..c504412 100644
--- a/step_recog/config/defaults.py
+++ b/step_recog/config/defaults.py
@@ -35,15 +35,16 @@
 # -----------------------------------------------------------------------------
 _C.MODEL = CfgNode()
 _C.MODEL.HIDDEN_SIZE = 1024
-_C.MODEL.CONTEXT_LENGTH = 'all'
+_C.MODEL.PROJECTION_SIZE = 512
+##_C.MODEL.CONTEXT_LENGTH = 'all'
 _C.MODEL.OUTPUT_DIM = 33
-_C.MODEL.DRIVE_ID = ""
+##_C.MODEL.DRIVE_ID = ""
 _C.MODEL.SKILLS = []
 _C.MODEL.USE_ACTION = True
 _C.MODEL.USE_OBJECTS = True
 _C.MODEL.USE_AUDIO = False
 _C.MODEL.USE_BN = False
-_C.MODEL.CHECKPOINT_FILE_PATH = ''
+##_C.MODEL.CHECKPOINT_FILE_PATH = ''
 _C.MODEL.DROP_OUT = 0.2
 _C.MODEL.YOLO_CHECKPOINT_URL = ''
 
@@ -56,17 +57,17 @@
 # -----------------------------------------------------------------------------
 _C.DATASET = CfgNode()
 _C.DATASET.CLASS = 'Milly_multifeature_v4'
-_C.DATASET.NAME = ''
+##_C.DATASET.NAME = ''
 _C.DATASET.LOCATION = ''
 _C.DATASET.AUDIO_LOCATION = ''
-_C.DATASET.VIDEO_LAYER = ''
-_C.DATASET.OBJECT_FRAME_LOCATION = ''
+##_C.DATASET.VIDEO_LAYER = ''
+##_C.DATASET.OBJECT_FRAME_LOCATION = ''
 _C.DATASET.TR_ANNOTATIONS_FILE = ''
 _C.DATASET.VL_ANNOTATIONS_FILE = ''
 _C.DATASET.TS_ANNOTATIONS_FILE = ''
-_C.DATASET.HOP_SIZE = 0.5
-_C.DATASET.FPS = 30
-_C.DATASET.WIN_LENGTH = 2
+##_C.DATASET.HOP_SIZE = 0.5
+##_C.DATASET.FPS = 30
+##_C.DATASET.WIN_LENGTH = 2
 _C.DATASET.INCLUDE_IMAGE_AUGMENTATIONS = False
 _C.DATASET.INCLUDE_TIME_AUGMENTATIONS = False
 _C.DATASET.IMAGE_AUGMENTATION_PERCENTAGE = 0.5 #probability of applying image augmentation
diff --git a/step_recog/datasets/milly.py b/step_recog/datasets/milly.py
index 960c352..f1a5dbe 100644
--- a/step_recog/datasets/milly.py
+++ b/step_recog/datasets/milly.py
@@ -134,37 +134,9 @@ def _construct_loader(self, split):
   def __len__(self):
     return len(self.datapoints)
 
-import sys
+from step_recog import utils
 from collections import deque
 
-##https://stackoverflow.com/questions/44131691/how-to-clear-cache-or-force-recompilation-in-numba
-##https://numba.pydata.org/numba-doc/0.48.0/developer/caching.html#cache-clearing
-##https://numba.pydata.org/numba-doc/0.48.0/reference/envvars.html#envvar-NUMBA_CACHE_DIR
-#to save numba cache out the /home folder
-main_cache_path = os.path.join("/vast", os.path.basename(os.path.expanduser("~")))
-clip_download_root = None
-omni_path = os.path.join(os.path.expanduser("~"), ".cache/torch/hub/facebookresearch_omnivore_main")
-
-if os.path.isdir(main_cache_path):
-  cache_path = os.path.join(main_cache_path, "cache")
-
-  if not os.path.isdir(cache_path):
-    os.mkdir(cache_path)
-
-  numba.config.CACHE_DIR = cache_path #default: ~/.cache
-  clip_download_root = os.path.join(cache_path, "clip") #default: ~/.cache/clip
-
-  cache_path = os.path.join(cache_path, "torch", "hub")
-
-  if not os.path.isdir(cache_path):
-    os.makedirs(cache_path)
-
-  torch.hub.set_dir(cache_path) #default: ~/.cache/torch/hub
-  omni_path = os.path.join(cache_path, "facebookresearch_omnivore_main")
-
-#to work with: torch.multiprocessing.set_start_method('spawn')
-sys.path.append(omni_path)
-
 from ultralytics import YOLO
 #from torch.quantization import quantize_dynamic
 
@@ -216,7 +188,7 @@ def __init__(self, cfg, split='train', filter=None):
 
       self.yolo.eval = yolo_eval #to work with: torch.multiprocessing.set_start_method('spawn')
       # self.yolo = quantize_dynamic(self.yolo, {torch.nn.Linear, torch.nn.Conv2d}, dtype=torch.qint8)
-      self.clip_patches = ClipPatches(download_root=clip_download_root)
+      self.clip_patches = ClipPatches(download_root=utils.clip_download_root)
       self.clip_patches.eval()
 
     if self.cfg.MODEL.USE_ACTION:
@@ -457,6 +429,7 @@ def _construct_loader(self, split):
       nframes = len(glob.glob(os.path.join(self.cfg.DATASET.LOCATION, v, "*.jpg")))
       vid_ann = self._fill_gap(vid_ann.copy(), nframes)
       video_windows = []
+      previous_stop_frame = 1
 
       for _, step_ann in vid_ann.iterrows():
         win_size = self.rng.integers(len(win_size_sec))
@@ -464,16 +437,17 @@
 
         ##First window: starts in step_ann.start_frame - WINDOW SIZE and stops in step_ann.start_frame
         ##If it is training, chooses a stop in [ step_ann.start_frame, step_ann.start_frame + delta ]
+        ##If not, stop is always incremented by the hop_size
         ##start_frame < 0 is used to facilitate the process. Inside the loop it is always truncated to 1 and _getitem_ pads the begining of the window.
 
         high = min(step_ann.start_frame + start_delta, step_ann.stop_frame + 1)
-        stop_frame = self.rng.integers(low = step_ann.start_frame, high = high) if split == "train" else step_ann.start_frame
+        stop_frame = self.rng.integers(low = step_ann.start_frame, high = high) if split == "train" else previous_stop_frame
         start_frame = stop_frame - step_ann.video_fps * win_size_sec[win_size] + 1
 
         stop_sound_point = 0 if step_ann.start_frame == 1 else int(self.slowfast_cfg.AUDIO_DATA.SAMPLING_RATE * step_ann.start_frame / step_ann.video_fps)
         start_sound_point = int(stop_sound_point - self.slowfast_cfg.AUDIO_DATA.SAMPLING_RATE * (win_size_sec[win_size] - 0.001)) #adjusted (-0.001) because of Slowfast set up
 
-        process_last_frames = stop_frame != step_ann.stop_frame
+        process_last_frames = stop_frame != step_ann.stop_frame if split == "train" else idx == (vid_ann.shape[0] - 1)
         win_idx = 0
 
         while stop_frame <= step_ann.stop_frame:
@@ -508,7 +482,7 @@ def _construct_loader(self, split):
           stop_frame = int(start_frame - 1 + step_ann.video_fps * win_size_sec[win_size])
 
           start_sound_point += int(self.slowfast_cfg.AUDIO_DATA.SAMPLING_RATE * win_size_sec[win_size] * hop_size_perc[hop_size])
-          stop_sound_point = int(start_sound_point + self.slowfast_cfg.AUDIO_DATA.SAMPLING_RATE * (win_size_sec[win_size] - 0.001)) #adjusted (-0.001) because of Slowfast set up
+          stop_sound_point = int(start_sound_point + self.slowfast_cfg.AUDIO_DATA.SAMPLING_RATE * (win_size_sec[win_size] - 0.001)) #adjusted (-0.001) because of Slowfast set up
 
           #Don't loose any frame in the end of the video.
          if previous_stop_frame < step_ann.stop_frame and start_frame < step_ann.stop_frame and step_ann.stop_frame < stop_frame and process_last_frames:
@@ -520,6 +494,9 @@
            stop_sound_point = int(self.slowfast_cfg.AUDIO_DATA.SAMPLING_RATE * step_ann.stop_frame / step_ann.video_fps)
            start_sound_point = int(stop_sound_point - self.slowfast_cfg.AUDIO_DATA.SAMPLING_RATE * (win_size_sec[win_size] - 0.001)) #adjusted (-0.001) because of Slowfast set up
 
+        ##If split != 'train', it guarantees that the next step stops in the right place
+        previous_stop_frame = stop_frame
+
      self.datapoints[ipoint] = {
        'video_id': v,
        'windows': video_windows
diff --git a/step_recog/full/model.py b/step_recog/full/model.py
index 40d67b1..83a3cc6 100644
--- a/step_recog/full/model.py
+++ b/step_recog/full/model.py
@@ -12,6 +12,7 @@
 from step_recog.config import load_config
 from step_recog.models import OmniGRU
+from step_recog import utils
 from step_recog.full.clip_patches import ClipPatches
 from step_recog.full.download import cached_download_file
 
@@ -48,15 +49,15 @@ def __init__(self, cfg_file, video_fps = 30):
    # build model
    self.head = OmniGRU(self.cfg, load=True)
    self.head.eval()
-    if self.head.use_action:
+    if self.cfg.MODEL.USE_ACTION:
      self.omnivore = Omnivore(self.omni_cfg)
-    if self.head.use_objects:
+    if self.cfg.MODEL.USE_OBJECTS:
      yolo_checkpoint = cached_download_file(self.cfg.MODEL.YOLO_CHECKPOINT_URL)
      self.yolo = YOLO(yolo_checkpoint)
      self.yolo.eval = lambda *a: None
-      self.clip_patches = ClipPatches()
+      self.clip_patches = ClipPatches(utils.clip_download_root)
      self.clip_patches.eval()
-    if self.head.use_audio:
+    if self.cfg.MODEL.USE_AUDIO:
      raise NotImplementedError()
 
    # frame buffers and model state
@@ -70,7 +71,7 @@ def reset(self):
  def queue_frame(self, image):
    X_omnivore = image
 
-    if self.head.use_action:
+    if self.cfg.MODEL.USE_ACTION:
      X_omnivore = self.omnivore.prepare_image(image, bgr2rgb=False)
 
    if len(self.omnivore_input_queue) == 0:
@@ -90,7 +91,7 @@ def prepare(self, im):
  def forward(self, image, queue_omni_frame = True):
    # compute yolo
    Z_objects, Z_frame = torch.zeros((1, 1, 25, 0)).float(), torch.zeros((1, 1, 1, 0)).float()
-    if self.head.use_objects:
+    if self.cfg.MODEL.USE_OBJECTS:
      results = self.yolo(image, verbose=False)
      boxes = results[0].boxes
      Z_clip = self.clip_patches(image, boxes.xywh.cpu().numpy(), include_frame=True)
@@ -106,12 +107,12 @@ def forward(self, image, queue_omni_frame = True):
 
    # compute audio embeddings
    Z_audio = torch.zeros((1, 1, 0)).float()
-    if self.head.use_audio:
+    if self.cfg.MODEL.USE_AUDIO:
      Z_audio = None
 
    # compute video embeddings
    Z_action = torch.zeros((1, 1, 0)).float()
-    if self.head.use_action:
+    if self.cfg.MODEL.USE_ACTION:
      # rolling buffer of omnivore input frames
      if queue_omni_frame:
        self.queue_frame(image)
diff --git a/step_recog/models.py b/step_recog/models.py
index 5c319e0..03cfc91 100644
--- a/step_recog/models.py
+++ b/step_recog/models.py
@@ -3,7 +3,6 @@
 #=========================================================================#
 
 import torch
-import ipdb
 from collections import OrderedDict
 
 device = 'cuda' if torch.cuda.is_available() else 'cpu'
@@ -20,42 +19,37 @@ def __init__(self, cfg, load = False):
    action_size = 1024 #default Omnivore output
    audio_size = 2304 #default Slowfast output
    img_size = 517 #default Clip output (512) + Yolo bouding box (4) + Yolo confidence (1)
-    project_dim = 512 #default space for each feature
-
-    self.use_action = cfg.MODEL.USE_ACTION
-    self.use_objects = cfg.MODEL.USE_OBJECTS
-    self.use_audio = cfg.MODEL.USE_AUDIO
-    self.use_bn = cfg.MODEL.USE_BN
-    self.skills = cfg.MODEL.SKILLS
-    self.hidden_dim = cfg.MODEL.HIDDEN_SIZE
-    self.number_classes = cfg.MODEL.OUTPUT_DIM + 1 #adding no step
+
+    self.cfg = cfg
+    self.number_classes = self.cfg.MODEL.OUTPUT_DIM + 1 #adding no step
    self.number_position = 2 # adding window position in a step to the output
+    self._device = torch.nn.Parameter(torch.empty(0))
 
-    self.n_gru_layers = 2
+    self.n_gru_layers = 2
    gru_input_dim = 0
 
-    if self.use_action:
-      gru_input_dim += project_dim
-      self.action_fc = torch.nn.Linear(action_size, project_dim)
+    if self.cfg.MODEL.USE_ACTION:
+      gru_input_dim += self.cfg.MODEL.PROJECTION_SIZE
+      self.action_fc = torch.nn.Linear(action_size, self.cfg.MODEL.PROJECTION_SIZE)
      self.action_drop_out = torch.nn.Dropout(cfg.MODEL.DROP_OUT)
-      if self.use_bn:
-        self.action_bn = torch.nn.BatchNorm1d(project_dim)
+      if self.cfg.MODEL.USE_BN:
+        self.action_bn = torch.nn.BatchNorm1d(self.cfg.MODEL.PROJECTION_SIZE)
 
-    if self.use_audio:
-      gru_input_dim += project_dim
-      self.audio_fc = torch.nn.Linear(audio_size, project_dim)
+    if self.cfg.MODEL.USE_AUDIO:
+      gru_input_dim += self.cfg.MODEL.PROJECTION_SIZE
+      self.audio_fc = torch.nn.Linear(audio_size, self.cfg.MODEL.PROJECTION_SIZE)
      self.audio_drop_out = torch.nn.Dropout(cfg.MODEL.DROP_OUT)
-      if self.use_bn:
-        self.aud_bn = torch.nn.BatchNorm1d(project_dim)
+      if self.cfg.MODEL.USE_BN:
+        self.aud_bn = torch.nn.BatchNorm1d(self.cfg.MODEL.PROJECTION_SIZE)
 
-    if self.use_objects:
-      gru_input_dim += project_dim
-      self.obj_proj = torch.nn.Linear(img_size, project_dim)
-      self.frame_proj = torch.nn.Linear(img_size, project_dim)
-      self.obj_fc = torch.nn.Linear(project_dim, project_dim)
+    if self.cfg.MODEL.USE_OBJECTS:
+      gru_input_dim += self.cfg.MODEL.PROJECTION_SIZE
+      self.obj_proj = torch.nn.Linear(img_size, self.cfg.MODEL.PROJECTION_SIZE)
+      self.frame_proj = torch.nn.Linear(img_size, self.cfg.MODEL.PROJECTION_SIZE)
+      self.obj_fc = torch.nn.Linear(self.cfg.MODEL.PROJECTION_SIZE, self.cfg.MODEL.PROJECTION_SIZE)
      self.obj_drop_out = torch.nn.Dropout(cfg.MODEL.DROP_OUT)
-      if self.use_bn:
-        self.obj_bn = torch.nn.BatchNorm1d(project_dim)
+      if self.cfg.MODEL.USE_BN:
+        self.obj_bn = torch.nn.BatchNorm1d(self.cfg.MODEL.PROJECTION_SIZE)
 
    if gru_input_dim == 0:
      raise Exception("GRU has to use at least one input (action, object/frame, or audio)")
@@ -72,23 +66,23 @@
  def forward(self, action, h=None, aud=None, objs=None, frame=None, return_last_step=True):
    x = []
 
-    if self.use_action:
+    if self.cfg.MODEL.USE_ACTION:
      action = self.action_fc(action)
-      if self.use_bn:
+      if self.cfg.MODEL.USE_BN:
        action = self.action_bn(action.transpose(1, 2)).transpose(1, 2)
      action = self.relu(action)
      action = self.action_drop_out(action)
      x.append(action)
 
-    if self.use_audio:
+    if self.cfg.MODEL.USE_AUDIO:
      aud = self.audio_fc(aud)
-      if self.use_bn:
+      if self.cfg.MODEL.USE_BN:
        aud = self.aud_bn(aud.transpose(1, 2)).transpose(1, 2)
      aud = self.relu(aud)
      aud = self.audio_drop_out(aud)
      x.append(aud)
 
-    if self.use_objects:
+    if self.cfg.MODEL.USE_OBJECTS:
      obj_proj = self.relu(self.obj_proj(objs))
      frame_proj = self.relu(self.frame_proj(frame))
 
@@ -109,7 +103,7 @@ def forward(self, action, h=None, aud=None, objs=None, frame=None, return_last_s
      obj_in = torch.mean(obj_in, dim = -2)
      #=================================================================================#
      obj_in = self.obj_fc(obj_in)
-      if self.use_bn:
+      if self.cfg.MODEL.USE_BN:
        obj_in = self.obj_bn(obj_in.transpose(1, 2)).transpose(1, 2)
      obj_in = self.relu(obj_in)
      obj_in = self.obj_drop_out(obj_in)
@@ -123,16 +117,23 @@ def forward(self, action, h=None, aud=None, objs=None, frame=None, return_last_s
 
  def init_hidden(self, batch_size):
    weight = next(self.parameters()).data
-    hidden = weight.new(self.n_gru_layers, batch_size, self.hidden_dim).zero_().to(device)
+    hidden = weight.new(self.n_gru_layers, batch_size, self.cfg.MODEL.HIDDEN_SIZE).zero_().to(self._device.device if self._device else device)
    return hidden
 
  def update_version(self, state_dict):
    new_dict = OrderedDict()
+    has_device = False
 
    for key, value in state_dict.items():
      if "rgb" in key:
        key = key.replace("rgb", "action")
 
      new_dict[key] = value
+
+      if "_device" in key:
+        has_device = True
+
+    if not has_device:
+      self._device = None
+
    return new_dict
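
Note on the _device parameter introduced in step_recog/models.py: registering an empty torch.nn.Parameter gives the module a handle on whichever device it has been moved to, so init_hidden can allocate the GRU hidden state there instead of relying on the module-level device constant (update_version clears _device when an older checkpoint has no such entry, and the constant is then used as a fallback). Below is a minimal, self-contained sketch of that pattern; it is not part of the patch, and TinyGRUHead is a hypothetical stand-in for OmniGRU.

import torch

class TinyGRUHead(torch.nn.Module):
  """Hypothetical stand-in for OmniGRU, only to illustrate device tracking."""
  def __init__(self, hidden_size=1024, n_gru_layers=2):
    super().__init__()
    self.hidden_size = hidden_size
    self.n_gru_layers = n_gru_layers
    self.gru = torch.nn.GRU(8, hidden_size, n_gru_layers, batch_first=True)
    #empty parameter: carries no weights, but follows .to()/.cuda() calls,
    #so its .device always reports where the module currently lives
    self._device = torch.nn.Parameter(torch.empty(0))

  def init_hidden(self, batch_size):
    #allocate the initial hidden state on the module's own device
    return torch.zeros(self.n_gru_layers, batch_size, self.hidden_size,
                       device=self._device.device)

model = TinyGRUHead()
if torch.cuda.is_available():
  model = model.cuda()
h0 = model.init_hidden(batch_size=4)  #ends up on the GPU whenever the model does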