Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Video Question and Answering model JSFusion #61

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions config/jsfusion-whole.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
{
"video_path_iterator": "models.jsfusion.model.JsFusionVideoPathIterator",
"pipeline": [
{
"model": "models.jsfusion.model.JsFusionLoader",
"gpus": [0]
},
{
"model": "models.jsfusion.model.ResNetRunner",
"gpus": [0]
},
{
"model": "models.jsfusion.model.MCModelRunner",
"gpus": [0]
}
]
}
Empty file added models/jsfusion/__init__.py
Empty file.
27 changes: 27 additions & 0 deletions models/jsfusion/attention.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
import torch
import math

MIN_TIMESCALE = 1.0
MAX_TIMESCALE = 1.0e4


def add_timing_signal_nd(num_frames, video_channels):
    """Build a sinusoidal timing signal for a sequence of video frames.

    Each frame position is multiplied by a geometric series of inverse
    timescales running from MIN_TIMESCALE to MAX_TIMESCALE; the sine and
    cosine of those products are concatenated along the channel axis.

    Args:
        num_frames: number of positions (frames) to encode.
        video_channels: channel count of the features the signal is built for.

    Returns:
        A float32 tensor of shape
        (1, num_frames, 2 * (video_channels // 2)).
    """
    # The signal targets a [1, num_frames, video_channels] layout, which has
    # exactly one positional dimension between the leading and channel axes.
    num_dims = 1
    num_timescales = video_channels // (num_dims * 2)

    position = torch.arange(num_frames, dtype=torch.float32).unsqueeze(1)

    # With a single timescale (video_channels of 2 or 3) "num_timescales - 1"
    # is zero; clamp the denominator so the call does not divide by zero.
    log_timescale_increment = (
        math.log(MAX_TIMESCALE / MIN_TIMESCALE) / max(num_timescales - 1, 1)
    )
    inv_timescales = torch.tensor(
        [MIN_TIMESCALE * math.exp(-float(i) * log_timescale_increment)
         for i in range(num_timescales)],
        dtype=torch.float32,
    ).unsqueeze(0)

    # (num_frames, 1) x (1, num_timescales) -> (num_frames, num_timescales)
    scaled_time = position.matmul(inv_timescales)
    signal = torch.cat([scaled_time.sin(), scaled_time.cos()], dim=1)
    return signal.unsqueeze(0)
211 changes: 211 additions & 0 deletions models/jsfusion/data_util.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,211 @@
"""Utility functions used in the JSFusion model, copied from the original author's code:
https://github.com/yj-yu/lsmdc/blob/master/videocap/datasets/data_util.py
"""
import time
import numpy as np
import re


def clean_str(string, downcase=True):
    """Tokenization/string cleaning for strings.

    Taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py

    Characters outside the allowed set are replaced by spaces, English
    contractions are split off as separate tokens, and punctuation is
    surrounded by spaces. Parentheses and question marks come out prefixed
    with a backslash: the replacement text carries an escape that ``re.sub``
    does not recognise and therefore emits unchanged. That output is kept
    as-is so tokens keep matching any vocabulary built with this function.

    Args:
        string: raw text to clean.
        downcase: lower-case the result when true.

    Returns:
        The cleaned, whitespace-normalised and stripped string.
    """
    # Order matters: the character filter must run first and the whitespace
    # collapse last. Replacements are raw strings so the backslashes are
    # spelled explicitly instead of relying on invalid escape sequences.
    substitutions = (
        (r"[^A-Za-z0-9(),!?\'\`(_____)]", " "),
        (r"\'s", " 's"),
        (r"\'ve", " 've"),
        (r"n\'t", " n't"),
        (r"\'re", " 're"),
        (r"\'d", " 'd"),
        (r"\'ll", " 'll"),
        (r",", " , "),
        (r"!", " ! "),
        (r"\(", r" \( "),
        (r"\)", r" \) "),
        (r"\?", r" \? "),
        (r"\s{2,}", " "),
    )
    for pattern, replacement in substitutions:
        string = re.sub(pattern, replacement, string)

    cleaned = string.strip()
    return cleaned.lower() if downcase else cleaned

def recover_word(string):
    """Re-attach possessive "'s" and commas that tokenization split off.

    Removes the space before every "'s" and before every comma.
    """
    rejoined = string.replace(" 's", "'s")
    return rejoined.replace(" ,", ",")

def clean_blank(blank_sent):
    """Tokenize a fill-in-the-blank sentence, mapping the blank to <START>.

    The sentence is cleaned with clean_str and split on whitespace; every
    token equal to '_____' is replaced by '<START>', which marks the answer
    position in the fill-in-the-blank task.
    """
    tokens = []
    for token in clean_str(blank_sent).split():
        if token == '_____':
            tokens.append('<START>')
        else:
            tokens.append(token)
    return tokens


def clean_root(string):
    """Hook for stripping unexpected characters from a root word.

    Currently a pass-through: the input is returned unchanged.
    """
    return string


def pad_sequences(sequences, pad_token="[PAD]", pad_location="LEFT", max_length=None):
    """Pad or truncate every sequence to a common length.

    Args:
        sequences: list of token lists.
        pad_token: token inserted to fill short sequences.
        pad_location: "LEFT" to prepend padding, "RIGHT" to append it.
        max_length: target length; when falsy, the length of the longest
            sequence is used.

    Returns:
        A new list of sequences, each of length max_length. Sequences longer
        than max_length keep only their first max_length tokens.

    Raises:
        ValueError: if a sequence needs padding and pad_location is neither
            "LEFT" nor "RIGHT".
    """
    if not max_length:
        # default=0 keeps an empty batch from crashing inside max().
        max_length = max((len(seq) for seq in sequences), default=0)

    result = []
    for sentence in sequences:
        num_padding = max_length - len(sentence)
        if num_padding == 0:
            padded = sentence
        elif num_padding < 0:
            padded = sentence[:max_length]
        elif pad_location == "RIGHT":
            padded = sentence + [pad_token] * num_padding
        elif pad_location == "LEFT":
            padded = [pad_token] * num_padding + sentence
        else:
            # Previously this only printed a message and then appended an
            # unbound (or stale) value; fail loudly instead.
            raise ValueError("Invalid pad_location. Specify LEFT or RIGHT.")
        result.append(padded)
    return result


def convert_sent_to_index(sentence, word_to_index):
    """Map each word of a sentence to its index, using 0 for unknown words."""
    indices = []
    for word in sentence:
        indices.append(word_to_index.get(word, 0))
    return indices


def batch_iter(data, batch_size, seed=None, fill=True):
    """Yield shuffled batches drawn from a dataset.

    The data is shuffled once with a RandomState seeded by ``seed`` and then
    emitted in consecutive slices of ``batch_size`` items. When ``fill`` is
    true, the final batch is topped up to ``batch_size`` with items picked at
    random (with replacement) from the whole dataset.
    """
    rng = np.random.RandomState(seed)
    total = len(data)

    num_batches = int(total / batch_size)
    if total % batch_size:
        num_batches += 1

    # A single permutation fixes the visiting order for the whole pass.
    order = rng.permutation(np.arange(total))
    for batch_num in range(num_batches):
        start = batch_num * batch_size
        stop = min(start + batch_size, total)
        chosen = order[start:stop]
        if fill and stop >= total:
            # Last batch: pad it with random indices up to batch_size.
            missing = batch_size - len(chosen)
            chosen = np.concatenate([chosen, rng.randint(0, total, missing)])
        yield [data[idx] for idx in chosen]


def fsr_iter(fsr_data, batch_size, random_seed=42, fill=True):
    """Iterate over (features, sentences, roots) batches.

    fsr_data: one of LSMDCData.build_data(), [[video_features], [sentences], [roots]]
    return per iter: [[feature]*batch_size, [sentences]*batch_size, [roots]*batch]

    Usage:
        train_data, val_data, test_data = LSMDCData.build_data()
        for features, sentences, roots in fsr_iter(train_data, 20, 10):
            feed_dict = {model.video_feature : features,
                         model.sentences : sentences,
                         model.roots : roots}
    """
    def regroup(batch):
        # Turn a batch of per-example tuples back into per-field columns.
        return zip(*batch)

    # Transpose the per-field lists into one tuple per example before batching.
    examples = list(zip(*fsr_data))
    batches = batch_iter(examples, batch_size, fill=fill, seed=random_seed)
    return map(regroup, batches)


def preprocess_sents(descriptions, word_to_index, max_length):
    """Clean, pad and index a list of sentences.

    Each sentence is cleaned and tokenized with clean_str, padded or
    truncated to max_length by pad_sequences (called with its default
    settings, so padding is added on the left), and finally converted to a
    list of integer indices through word_to_index.
    """
    tokenized = []
    for sent in descriptions:
        tokenized.append(clean_str(sent).split())

    # Bring every sentence to the same length (pad_sequences defaults apply).
    padded = pad_sequences(tokenized, max_length=max_length)

    # Convert sentences from lists of strings to lists of indices (int).
    return [convert_sent_to_index(sent, word_to_index) for sent in padded]
# remove punctuation mark and special chars from root.


def preprocess_roots(roots, word_to_index):
    """Clean each root word and map it to its index (0 when unknown)."""
    indices = []
    for root in roots:
        cleaned = clean_root(root)
        # Unknown roots fall back to index 0.
        indices.append(word_to_index.get(cleaned, 0))
    return indices


def pad_video(video_feature, dimension, padded_feature=None):
    """Fit a video feature array to a fixed number of frames, padding on the left.

    video = [pad,..., pad, frm1, frm2, ..., frmN]

    Args:
        video_feature: array whose first axis indexes frames.
        dimension: target shape; dimension[0] is the target frame count.
        padded_feature: optional output buffer. When omitted, a float32
            zero array of shape ``dimension`` is allocated. A supplied
            buffer is written in place; its leading padding rows are not
            cleared.

    Returns:
        The output buffer. Videos longer than the target are subsampled at
        evenly spaced frame indices.
    """
    target_len = dimension[0]
    source_len = video_feature.shape[0]
    if padded_feature is None:
        padded_feature = np.zeros(dimension, dtype=np.float32)

    if source_len == target_len:
        padded_feature[:] = video_feature
    elif source_len > target_len:
        # Too many frames: keep target_len evenly spaced ones.
        picks = np.linspace(0, source_len, num=target_len, endpoint=False, dtype=np.int32)
        padded_feature[:] = video_feature[picks]
    else:
        # Too few frames: right-align them, leaving the head as padding.
        padded_feature[target_len - source_len:] = video_feature

    return padded_feature

def repeat_pad_video(video_feature, dimension):
    """Fit a video to a fixed number of frames by repeating it.

    Args:
        video_feature: array whose first axis indexes frames.
        dimension: target shape; dimension[0] is the target frame count.

    Returns:
        A float64 array of shape ``dimension``. Shorter videos are tiled
        whole as many times as fit, and the remaining leading slots are
        filled with the video's last frames. Longer videos are subsampled
        at evenly spaced frame indices.
    """
    # np.float was only an alias of the builtin float (float64) and has been
    # removed from NumPy; name the dtype explicitly.
    padded_feature = np.zeros(dimension, dtype=np.float64)
    max_length = dimension[0]
    current_length = video_feature.shape[0]

    if current_length == max_length:
        padded_feature[:] = video_feature
    elif current_length < max_length:
        tile_num = int(max_length / current_length)
        remainder = max_length % current_length
        # Repeat along the frame axis only.
        to_tile = np.ones(len(dimension), dtype=np.int32)
        to_tile[0] = tile_num
        tiled_vid = np.tile(video_feature, to_tile)
        if remainder > 0:
            # Slots left over in front are filled with the tail of the video.
            padded_feature[0:remainder] = video_feature[-remainder:]
        padded_feature[remainder:] = tiled_vid
    else:
        steps = np.linspace(0, current_length, num=max_length, endpoint=False, dtype=np.int32)
        padded_feature[:] = video_feature[steps]
    return padded_feature

def stretch_pad_video(video_feature, dimension):
    """Fit a video to a fixed number of frames by stretching it.

    Args:
        video_feature: array whose first axis indexes frames.
        dimension: target shape; dimension[0] is the target frame count.

    Returns:
        A float64 array of shape ``dimension``. Shorter videos have each
        frame repeated consecutively and the result is then subsampled at
        evenly spaced indices; longer videos are subsampled directly.
    """
    # np.float was only an alias of the builtin float (float64) and has been
    # removed from NumPy; name the dtype explicitly.
    padded_feature = np.zeros(dimension, dtype=np.float64)
    max_length = dimension[0]
    current_length = video_feature.shape[0]

    if current_length == max_length:
        padded_feature[:] = video_feature
    elif current_length < max_length:
        # Smallest repeat count that makes the stretched video at least
        # max_length frames long.
        repeat_num = int((max_length - 1) / current_length) + 1
        tiled_vid = np.repeat(video_feature, repeat_num, 0)
        steps = np.linspace(0, repeat_num * current_length, num=max_length,
                            endpoint=False, dtype=np.int32)
        padded_feature[:] = tiled_vid[steps]
    else:
        steps = np.linspace(0, current_length, num=max_length, endpoint=False, dtype=np.int32)
        padded_feature[:] = video_feature[steps]
    return padded_feature


def fill_mask(max_length, current_length, zero_location='LEFT'):
    """Build a 0/1 mask marking the real (non-padding) positions.

    Args:
        max_length: length of the mask.
        current_length: number of real positions.
        zero_location: 'LEFT' to zero the leading padding slots, 'RIGHT' to
            zero the trailing ones.

    Returns:
        A float array of length max_length holding 1 for real positions and
        0 for padding. When current_length >= max_length the mask is all
        ones.

    Raises:
        ValueError: if padding is required and zero_location is neither
            'LEFT' nor 'RIGHT'.
    """
    num_padding = max_length - current_length
    mask = np.ones(max_length)
    if num_padding <= 0:
        return mask

    if zero_location == 'LEFT':
        mask[:num_padding] = 0
    elif zero_location == 'RIGHT':
        # Clamp so a negative length yields an all-zero mask rather than
        # being read as an index counted from the end.
        mask[max(current_length, 0):] = 0
    else:
        # Previously this fell through and failed on an unbound variable.
        raise ValueError("Invalid zero_location. Specify LEFT or RIGHT.")
    return mask
121 changes: 121 additions & 0 deletions models/jsfusion/model.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
from models.jsfusion.module import ResNetFeatureExtractor
from models.jsfusion.module import MCModel
from models.jsfusion.sampler import FixedSampler

from runner_model import RunnerModel
from video_path_provider import VideoPathIterator
from itertools import cycle
from torchvision import transforms
import torch
import nvvl
import os

class JsFusionVideoPathIterator(VideoPathIterator):
    """Endlessly cycles over the video files found under $LSMDC_PATH/mp4s."""

    def __init__(self):
        super().__init__()

        video_dir = os.path.join(os.environ['LSMDC_PATH'], 'mp4s')
        videos = [os.path.join(video_dir, name) for name in os.listdir(video_dir)]

        if not videos:
            raise Exception('No video available.')

        # cycle() never terminates, so consumers can keep asking for paths.
        self.videos_iter = cycle(videos)

    def __iter__(self):
        return self.videos_iter

class JsFusionLoader(RunnerModel):
    """Impl of loading video frames using NVVL, for the JSFusion model.

    Each call loads one video file through an nvvl.RnBLoader configured with
    a FixedSampler of 40 frames, scales pixel values by 1/255, applies
    per-channel normalization and returns the frames with the file's
    basename.
    """

    def __init__(self, device):
        # Initialise the base class the same way the other runners in this
        # file do.
        super(JsFusionLoader, self).__init__(device)
        self.loader = nvvl.RnBLoader(width=224, height=224,
                                     consecutive_frames=1, device_id=device.index,
                                     sampler=FixedSampler(num_frames=40))

        root = os.environ['LSMDC_PATH']
        samples = [
            os.path.join(root, 'mp4s/1004_Juno_00.00.32.849-00.00.35.458.mp4'),
            os.path.join(root, 'mp4s/1004_Juno_00.00.35.642-00.00.45.231.mp4'),
            os.path.join(root, 'mp4s/1004_Juno_00.00.49.801-00.00.59.450.mp4')]

        # warm up GPU with a few inferences
        for sample in samples:
            self._load_frames(sample)

    def _load_frames(self, file_path):
        """Load one file and return the last batch the loader yields, or None."""
        frames = None
        self.loader.loadfile(file_path)
        for frames in self.loader:
            pass
        self.loader.flush()
        return frames

    def __call__(self, input):
        """Load and normalize the frames of one video.

        Args:
            input: a pair whose second element is the video file path; the
                first element is ignored.

        Returns:
            ((frames,), filename): a one-element tuple holding the
            normalized frame tensor, and the basename of the video file.

        Raises:
            RuntimeError: if the loader yields no frames for the file.
        """
        _, file_path = input
        frames = self._load_frames(file_path)
        if frames is None:
            raise RuntimeError('No frames were loaded from %s' % file_path)

        # frames: (40, 3, 1, 224, 224)
        frames = frames.float()
        frames = frames.permute(0, 2, 1, 3, 4)

        transform = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                         std=[0.229, 0.224, 0.225])
        normalized = []
        for frame in frames:
            frame = torch.squeeze(frame)
            frame /= 255
            normalized.append(transform(frame))
        frames = torch.stack(normalized)
        # frames: (40, 3, 224, 224)

        filename = os.path.basename(file_path)
        # Review feedback on this change: the first item needs to be a tuple
        # of tensors, not a single tensor.
        return ((frames,), filename)

    def __del__(self):
        # The loader attribute is missing if __init__ failed before creating it.
        loader = getattr(self, 'loader', None)
        if loader is not None:
            loader.close()

    def input_shape(self):
        return None

    @staticmethod
    def output_shape():
        return ((40, 3, 224, 224),)


class ResNetRunner(RunnerModel):
    """Pipeline step that runs ResNetFeatureExtractor in float, eval mode."""

    def __init__(self, device, num_frames=40):
        super().__init__(device)
        extractor = ResNetFeatureExtractor(num_frames).to(device)
        extractor.float()
        extractor.eval()
        self.model = extractor

    def __call__(self, input):
        return self.model(input)

    def input_shape(self):
        return ((40, 3, 224, 224),)

    @staticmethod
    def output_shape():
        return ((1, 40, 2048),)


class MCModelRunner(RunnerModel):
    """Pipeline step that runs MCModel in float, eval mode."""

    def __init__(self, device, num_frames=40):
        # num_frames is accepted but not used when building the model.
        super().__init__(device)
        mc_model = MCModel(device).to(device)
        mc_model.float()
        mc_model.eval()
        self.model = mc_model

    def __call__(self, input):
        return self.model(input)

    def input_shape(self):
        return ((1, 40, 2048),)

    @staticmethod
    def output_shape():
        return ((1,),)

Loading