
Commit

Merge branch 'fabiofelix:main' into main
fabiofelix authored Jul 12, 2024
2 parents 6bb4565 + 72796ef commit df5c398
Showing 5 changed files with 67 additions and 88 deletions.
3 changes: 1 addition & 2 deletions config/STEPGRU_BASE.yaml
@@ -1,6 +1,6 @@
MODEL:
HIDDEN_SIZE: 1024
CONTEXT_LENGTH: 'full'
PROJECTION_SIZE: 512
USE_ACTION: True ##default true
USE_OBJECTS: True ##default true
USE_AUDIO: False ##default false
@@ -10,7 +10,6 @@ MODEL:
OMNIVORE_CONFIG: '/home/user/data/config/OMNIVORE.yaml'
SLOWFAST_CONFIG: '/home/user/data/auditory_slowfast/configs/BBN/SLOWFAST_R50.yaml'
DATASET:
NAME: 'Milly'
CLASS: 'Milly_multifeature_v4'
LOCATION: '/frame'
AUDIO_LOCATION: '/sound'
19 changes: 10 additions & 9 deletions step_recog/config/defaults.py
@@ -35,15 +35,16 @@
# -----------------------------------------------------------------------------
_C.MODEL = CfgNode()
_C.MODEL.HIDDEN_SIZE = 1024
_C.MODEL.CONTEXT_LENGTH = 'all'
_C.MODEL.PROJECTION_SIZE = 512
##_C.MODEL.CONTEXT_LENGTH = 'all'
_C.MODEL.OUTPUT_DIM = 33
_C.MODEL.DRIVE_ID = ""
##_C.MODEL.DRIVE_ID = ""
_C.MODEL.SKILLS = []
_C.MODEL.USE_ACTION = True
_C.MODEL.USE_OBJECTS = True
_C.MODEL.USE_AUDIO = False
_C.MODEL.USE_BN = False
_C.MODEL.CHECKPOINT_FILE_PATH = ''
##_C.MODEL.CHECKPOINT_FILE_PATH = ''
_C.MODEL.DROP_OUT = 0.2

_C.MODEL.YOLO_CHECKPOINT_URL = ''
@@ -56,17 +57,17 @@
# -----------------------------------------------------------------------------
_C.DATASET = CfgNode()
_C.DATASET.CLASS = 'Milly_multifeature_v4'
_C.DATASET.NAME = ''
##_C.DATASET.NAME = ''
_C.DATASET.LOCATION = ''
_C.DATASET.AUDIO_LOCATION = ''
_C.DATASET.VIDEO_LAYER = ''
_C.DATASET.OBJECT_FRAME_LOCATION = ''
##_C.DATASET.VIDEO_LAYER = ''
##_C.DATASET.OBJECT_FRAME_LOCATION = ''
_C.DATASET.TR_ANNOTATIONS_FILE = ''
_C.DATASET.VL_ANNOTATIONS_FILE = ''
_C.DATASET.TS_ANNOTATIONS_FILE = ''
_C.DATASET.HOP_SIZE = 0.5
_C.DATASET.FPS = 30
_C.DATASET.WIN_LENGTH = 2
##_C.DATASET.HOP_SIZE = 0.5
##_C.DATASET.FPS = 30
##_C.DATASET.WIN_LENGTH = 2
_C.DATASET.INCLUDE_IMAGE_AUGMENTATIONS = False
_C.DATASET.INCLUDE_TIME_AUGMENTATIONS = False
_C.DATASET.IMAGE_AUGMENTATION_PERCENTAGE = 0.5 #probability of applying image augmentation
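Note on the config changes above: defaults.py appears to follow the usual yacs-style CfgNode pattern, in which a YAML file can only override keys that already exist in the defaults. Commenting a key out of the defaults (CONTEXT_LENGTH, DATASET.NAME, HOP_SIZE, FPS, WIN_LENGTH, ...) therefore means any YAML that still sets it will fail to merge, which is consistent with STEPGRU_BASE.yaml dropping CONTEXT_LENGTH and DATASET.NAME and adding PROJECTION_SIZE in the same commit. A minimal sketch of that behavior, assuming plain yacs (the keys are abbreviated and the repo's own load_config may differ):

from yacs.config import CfgNode

_C = CfgNode()
_C.MODEL = CfgNode()
_C.MODEL.HIDDEN_SIZE = 1024
_C.MODEL.PROJECTION_SIZE = 512    # new default; YAML files may now override it
# _C.MODEL.CONTEXT_LENGTH = 'all' # removed; a YAML that still sets it cannot be merged

cfg = _C.clone()
cfg.merge_from_file("config/STEPGRU_BASE.yaml")  # raises "Non-existent config key" for unknown keys
cfg.freeze()
print(cfg.MODEL.PROJECTION_SIZE)
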
43 changes: 10 additions & 33 deletions step_recog/datasets/milly.py
@@ -134,37 +134,9 @@ def _construct_loader(self, split):
def __len__(self):
return len(self.datapoints)

import sys
from step_recog import utils
from collections import deque

##https://stackoverflow.com/questions/44131691/how-to-clear-cache-or-force-recompilation-in-numba
##https://numba.pydata.org/numba-doc/0.48.0/developer/caching.html#cache-clearing
##https://numba.pydata.org/numba-doc/0.48.0/reference/envvars.html#envvar-NUMBA_CACHE_DIR
#to save numba cache out the /home folder
main_cache_path = os.path.join("/vast", os.path.basename(os.path.expanduser("~")))
clip_download_root = None
omni_path = os.path.join(os.path.expanduser("~"), ".cache/torch/hub/facebookresearch_omnivore_main")

if os.path.isdir(main_cache_path):
cache_path = os.path.join(main_cache_path, "cache")

if not os.path.isdir(cache_path):
os.mkdir(cache_path)

numba.config.CACHE_DIR = cache_path #default: ~/.cache
clip_download_root = os.path.join(cache_path, "clip") #default: ~/.cache/clip

cache_path = os.path.join(cache_path, "torch", "hub")

if not os.path.isdir(cache_path):
os.makedirs(cache_path)

torch.hub.set_dir(cache_path) #default: ~/.cache/torch/hub
omni_path = os.path.join(cache_path, "facebookresearch_omnivore_main")

#to work with: torch.multiprocessing.set_start_method('spawn')
sys.path.append(omni_path)

from ultralytics import YOLO
#from torch.quantization import quantize_dynamic

@@ -216,7 +188,7 @@ def __init__(self, cfg, split='train', filter=None):
self.yolo.eval = yolo_eval #to work with: torch.multiprocessing.set_start_method('spawn')
# self.yolo = quantize_dynamic(self.yolo, {torch.nn.Linear, torch.nn.Conv2d}, dtype=torch.qint8)

self.clip_patches = ClipPatches(download_root=clip_download_root)
self.clip_patches = ClipPatches(download_root=utils.clip_download_root)
self.clip_patches.eval()

if self.cfg.MODEL.USE_ACTION:
@@ -457,23 +429,25 @@ def _construct_loader(self, split):
nframes = len(glob.glob(os.path.join(self.cfg.DATASET.LOCATION, v, "*.jpg")))
vid_ann = self._fill_gap(vid_ann.copy(), nframes)
video_windows = []
previous_stop_frame = 1

for _, step_ann in vid_ann.iterrows():
win_size = self.rng.integers(len(win_size_sec))
hop_size = self.rng.integers(len(hop_size_perc))

##First window: starts at step_ann.start_frame - WINDOW SIZE and stops at step_ann.start_frame
##During training, a stop is chosen in [ step_ann.start_frame, step_ann.start_frame + delta ]
##Otherwise, the stop is always incremented by the hop size
##start_frame < 0 is used to simplify the process. Inside the loop it is always truncated to 1 and _getitem_ pads the beginning of the window.
high = min(step_ann.start_frame + start_delta, step_ann.stop_frame + 1)
stop_frame = self.rng.integers(low = step_ann.start_frame, high = high) if split == "train" else step_ann.start_frame
stop_frame = self.rng.integers(low = step_ann.start_frame, high = high) if split == "train" else previous_stop_frame

start_frame = stop_frame - step_ann.video_fps * win_size_sec[win_size] + 1

stop_sound_point = 0 if step_ann.start_frame == 1 else int(self.slowfast_cfg.AUDIO_DATA.SAMPLING_RATE * step_ann.start_frame / step_ann.video_fps)
start_sound_point = int(stop_sound_point - self.slowfast_cfg.AUDIO_DATA.SAMPLING_RATE * (win_size_sec[win_size] - 0.001)) #adjusted (-0.001) because of Slowfast set up

process_last_frames = stop_frame != step_ann.stop_frame
process_last_frames = stop_frame != step_ann.stop_frame if split == "train" else idx == (vid_ann.shape[0] - 1)
win_idx = 0

while stop_frame <= step_ann.stop_frame:
@@ -508,7 +482,7 @@ def _construct_loader(self, split):
stop_frame = int(start_frame - 1 + step_ann.video_fps * win_size_sec[win_size])

start_sound_point += int(self.slowfast_cfg.AUDIO_DATA.SAMPLING_RATE * win_size_sec[win_size] * hop_size_perc[hop_size])
stop_sound_point = int(start_sound_point + self.slowfast_cfg.AUDIO_DATA.SAMPLING_RATE * (win_size_sec[win_size] - 0.001)) #adjusted (-0.001) because of Slowfast set up

#Don't lose any frames at the end of the video.
if previous_stop_frame < step_ann.stop_frame and start_frame < step_ann.stop_frame and step_ann.stop_frame < stop_frame and process_last_frames:
@@ -520,6 +494,9 @@
stop_sound_point = int(self.slowfast_cfg.AUDIO_DATA.SAMPLING_RATE * step_ann.stop_frame / step_ann.video_fps)
start_sound_point = int(stop_sound_point - self.slowfast_cfg.AUDIO_DATA.SAMPLING_RATE * (win_size_sec[win_size] - 0.001)) #adjusted (-0.001) because of Slowfast set up

##When split != 'train', this guarantees that the next step's first window stops at the right place
previous_stop_frame = stop_frame

self.datapoints[ipoint] = {
'video_id': v,
'windows': video_windows
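The windowing change above threads previous_stop_frame through the per-step loop so that, outside of training, each step's first window continues from where the previous step's last window ended instead of jumping back to that step's annotated start_frame (a separate check still keeps the final frames of the video from being dropped). A minimal sketch of the idea with made-up numbers (30 fps, 2-second windows, 50% hop); it only illustrates the contiguity, not the real loader:

# Hypothetical illustration of contiguous evaluation windows across step boundaries.
steps = [(1, 90), (91, 200)]   # (start_frame, stop_frame) for two annotated steps
win_frames, hop_frames = 60, 30

windows = []
previous_stop_frame = 1
for _, stop_ann in steps:               # the annotated start is only needed during training
    stop_frame = previous_stop_frame    # evaluation: continue from the last emitted window
    while stop_frame <= stop_ann:
        start_frame = max(1, stop_frame - win_frames + 1)
        windows.append((start_frame, stop_frame))
        stop_frame += hop_frames
    previous_stop_frame = stop_frame    # the next step picks up from here

print(windows)
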
17 changes: 9 additions & 8 deletions step_recog/full/model.py
Expand Up @@ -12,6 +12,7 @@

from step_recog.config import load_config
from step_recog.models import OmniGRU
from step_recog import utils

from step_recog.full.clip_patches import ClipPatches
from step_recog.full.download import cached_download_file
@@ -48,15 +49,15 @@ def __init__(self, cfg_file, video_fps = 30):
# build model
self.head = OmniGRU(self.cfg, load=True)
self.head.eval()
if self.head.use_action:
if self.cfg.MODEL.USE_ACTION:
self.omnivore = Omnivore(self.omni_cfg)
if self.head.use_objects:
if self.cfg.MODEL.USE_OBJECTS:
yolo_checkpoint = cached_download_file(self.cfg.MODEL.YOLO_CHECKPOINT_URL)
self.yolo = YOLO(yolo_checkpoint)
self.yolo.eval = lambda *a: None
self.clip_patches = ClipPatches()
self.clip_patches = ClipPatches(utils.clip_download_root)
self.clip_patches.eval()
if self.head.use_audio:
if self.cfg.MODEL.USE_AUDIO:
raise NotImplementedError()

# frame buffers and model state
@@ -70,7 +71,7 @@ def reset(self):
def queue_frame(self, image):
X_omnivore = image

if self.head.use_action:
if self.cfg.MODEL.USE_ACTION:
X_omnivore = self.omnivore.prepare_image(image, bgr2rgb=False)

if len(self.omnivore_input_queue) == 0:
@@ -90,7 +91,7 @@ def prepare(self, im):
def forward(self, image, queue_omni_frame = True):
# compute yolo
Z_objects, Z_frame = torch.zeros((1, 1, 25, 0)).float(), torch.zeros((1, 1, 1, 0)).float()
if self.head.use_objects:
if self.cfg.MODEL.USE_OBJECTS:
results = self.yolo(image, verbose=False)
boxes = results[0].boxes
Z_clip = self.clip_patches(image, boxes.xywh.cpu().numpy(), include_frame=True)
@@ -106,12 +107,12 @@

# compute audio embeddings
Z_audio = torch.zeros((1, 1, 0)).float()
if self.head.use_audio:
if self.cfg.MODEL.USE_AUDIO:
Z_audio = None

# compute video embeddings
Z_action = torch.zeros((1, 1, 0)).float()
if self.head.use_action:
if self.cfg.MODEL.USE_ACTION:
# rolling buffer of omnivore input frames
if queue_omni_frame:
self.queue_frame(image)
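In the forward pass above, disabled modalities are represented by zero-width tensors (e.g. torch.zeros((1, 1, 0))) rather than by None, which keeps the head's call signature fixed no matter which of action, objects, or audio is enabled. A small self-contained sketch of why a zero-width tensor is a harmless placeholder (the shapes are illustrative, not the model's real ones):

import torch

Z_real = torch.randn(1, 1, 512)          # an enabled modality's features
Z_off  = torch.zeros((1, 1, 0)).float()  # a disabled modality contributes zero features

x = torch.cat([Z_real, Z_off], dim=-1)
print(x.shape)  # torch.Size([1, 1, 512]) -- the placeholder is a no-op in the concatenation
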
73 changes: 37 additions & 36 deletions step_recog/models.py
@@ -3,7 +3,6 @@
#=========================================================================#

import torch
import ipdb
from collections import OrderedDict

device = 'cuda' if torch.cuda.is_available() else 'cpu'
@@ -20,42 +19,37 @@ def __init__(self, cfg, load = False):
action_size = 1024 #default Omnivore output
audio_size = 2304 #default Slowfast output
img_size = 517 #default Clip output (512) + Yolo bounding box (4) + Yolo confidence (1)
project_dim = 512 #default space for each feature

self.use_action = cfg.MODEL.USE_ACTION
self.use_objects = cfg.MODEL.USE_OBJECTS
self.use_audio = cfg.MODEL.USE_AUDIO
self.use_bn = cfg.MODEL.USE_BN
self.skills = cfg.MODEL.SKILLS
self.hidden_dim = cfg.MODEL.HIDDEN_SIZE
self.number_classes = cfg.MODEL.OUTPUT_DIM + 1 #adding no step

self.cfg = cfg
self.number_classes = self.cfg.MODEL.OUTPUT_DIM + 1 #adding no step
self.number_position = 2 # adding window position in a step to the output
self._device = torch.nn.Parameter(torch.empty(0))

self.n_gru_layers = 2
gru_input_dim = 0

if self.use_action:
gru_input_dim += project_dim
self.action_fc = torch.nn.Linear(action_size, project_dim)
if self.cfg.MODEL.USE_ACTION:
gru_input_dim += self.cfg.MODEL.PROJECTION_SIZE
self.action_fc = torch.nn.Linear(action_size, self.cfg.MODEL.PROJECTION_SIZE)
self.action_drop_out = torch.nn.Dropout(cfg.MODEL.DROP_OUT)
if self.use_bn:
self.action_bn = torch.nn.BatchNorm1d(project_dim)
if self.cfg.MODEL.USE_BN:
self.action_bn = torch.nn.BatchNorm1d(self.cfg.MODEL.PROJECTION_SIZE)

if self.use_audio:
gru_input_dim += project_dim
self.audio_fc = torch.nn.Linear(audio_size, project_dim)
if self.cfg.MODEL.USE_AUDIO:
gru_input_dim += self.cfg.MODEL.PROJECTION_SIZE
self.audio_fc = torch.nn.Linear(audio_size, self.cfg.MODEL.PROJECTION_SIZE)
self.audio_drop_out = torch.nn.Dropout(cfg.MODEL.DROP_OUT)
if self.use_bn:
self.aud_bn = torch.nn.BatchNorm1d(project_dim)

if self.use_objects:
gru_input_dim += project_dim
self.obj_proj = torch.nn.Linear(img_size, project_dim)
self.frame_proj = torch.nn.Linear(img_size, project_dim)
self.obj_fc = torch.nn.Linear(project_dim, project_dim)
if self.cfg.MODEL.USE_BN:
self.aud_bn = torch.nn.BatchNorm1d(self.cfg.MODEL.PROJECTION_SIZE)

if self.cfg.MODEL.USE_OBJECTS:
gru_input_dim += self.cfg.MODEL.PROJECTION_SIZE
self.obj_proj = torch.nn.Linear(img_size, self.cfg.MODEL.PROJECTION_SIZE)
self.frame_proj = torch.nn.Linear(img_size, self.cfg.MODEL.PROJECTION_SIZE)
self.obj_fc = torch.nn.Linear(self.cfg.MODEL.PROJECTION_SIZE, self.cfg.MODEL.PROJECTION_SIZE)
self.obj_drop_out = torch.nn.Dropout(cfg.MODEL.DROP_OUT)
if self.use_bn:
self.obj_bn = torch.nn.BatchNorm1d(project_dim)
if self.cfg.MODEL.USE_BN:
self.obj_bn = torch.nn.BatchNorm1d(self.cfg.MODEL.PROJECTION_SIZE)

if gru_input_dim == 0:
raise Exception("GRU has to use at least one input (action, object/frame, or audio)")
@@ -72,23 +66,23 @@ def forward(self, action, h=None, aud=None, objs=None, frame=None, return_last_step=True):
def forward(self, action, h=None, aud=None, objs=None, frame=None, return_last_step=True):
x = []

if self.use_action:
if self.cfg.MODEL.USE_ACTION:
action = self.action_fc(action)
if self.use_bn:
if self.cfg.MODEL.USE_BN:
action = self.action_bn(action.transpose(1, 2)).transpose(1, 2)
action = self.relu(action)
action = self.action_drop_out(action)
x.append(action)

if self.use_audio:
if self.cfg.MODEL.USE_AUDIO:
aud = self.audio_fc(aud)
if self.use_bn:
if self.cfg.MODEL.USE_BN:
aud = self.aud_bn(aud.transpose(1, 2)).transpose(1, 2)
aud = self.relu(aud)
aud = self.audio_drop_out(aud)
x.append(aud)

if self.use_objects:
if self.cfg.MODEL.USE_OBJECTS:
obj_proj = self.relu(self.obj_proj(objs))
frame_proj = self.relu(self.frame_proj(frame))

@@ -109,7 +103,7 @@ def forward(self, action, h=None, aud=None, objs=None, frame=None, return_last_step=True):
obj_in = torch.mean(obj_in, dim = -2)
#=================================================================================#
obj_in = self.obj_fc(obj_in)
if self.use_bn:
if self.cfg.MODEL.USE_BN:
obj_in = self.obj_bn(obj_in.transpose(1, 2)).transpose(1, 2)
obj_in = self.relu(obj_in)
obj_in = self.obj_drop_out(obj_in)
Expand All @@ -123,16 +117,23 @@ def forward(self, action, h=None, aud=None, objs=None, frame=None, return_last_s

def init_hidden(self, batch_size):
weight = next(self.parameters()).data
hidden = weight.new(self.n_gru_layers, batch_size, self.hidden_dim).zero_().to(device)
hidden = weight.new(self.n_gru_layers, batch_size, self.cfg.MODEL.HIDDEN_SIZE).zero_().to(self._device.device if self._device is not None else device)
return hidden

def update_version(self, state_dict):
new_dict = OrderedDict()
has_device = False

for key, value in state_dict.items():
if "rgb" in key:
key = key.replace("rgb", "action")

new_dict[key] = value

if "_device" in key:
has_device = True

if not has_device:
self._device = None

return new_dict
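
The new _device = torch.nn.Parameter(torch.empty(0)) field is a common way to let a module discover which device it was moved to: an empty parameter follows .to()/.cuda() but carries no weights, so init_hidden can allocate its hidden state on the right device without relying on the module-level global. update_version then sets the field back to None when loading an older checkpoint whose state_dict has no "_device" entry, presumably so that loading does not complain about the missing key. A minimal sketch of the device-tracking idea, with illustrative class name and sizes:

import torch

class DeviceAware(torch.nn.Module):
    def __init__(self):
        super().__init__()
        # Empty parameter: contributes no weights, but records where the module lives.
        self._device = torch.nn.Parameter(torch.empty(0))

    def init_hidden(self, batch_size, n_layers=2, hidden_size=1024):
        return torch.zeros(n_layers, batch_size, hidden_size, device=self._device.device)

m = DeviceAware()
print(m.init_hidden(4).device)              # cpu
if torch.cuda.is_available():
    print(m.cuda().init_hidden(4).device)   # cuda:0
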
