snuspl · yunseong · Jul 18, 2019 · Jul 18, 2019 · Jul 18, 2019 · jsjason
diff --git a/config/jsfusion-whole.json b/config/jsfusion-whole.json
@@ -0,0 +1,17 @@
+{
+  "video_path_iterator": "models.jsfusion.model.JsFusionVideoPathIterator",
+  "pipeline": [
+  {
+      "model": "models.jsfusion.model.JsFusionLoader",
+      "gpus": [0]
+  },
+  {
+      "model": "models.jsfusion.model.ResNetRunner",
+      "gpus": [0]
+  },
+  {
+      "model": "models.jsfusion.model.MCModelRunner",
+      "gpus": [0]
+  }
+  ]
+}
diff --git a/models/jsfusion/__init__.py b/models/jsfusion/__init__.py
diff --git a/models/jsfusion/attention.py b/models/jsfusion/attention.py
@@ -0,0 +1,27 @@
+import torch
+import math
+
+# Shortest and longest wavelengths (in frames) of the sinusoidal timing signal.
+MIN_TIMESCALE = 1.0
+MAX_TIMESCALE = 1.0e4
+
+
+def add_timing_signal_nd(num_frames, video_channels):
+  """Builds a sinusoidal timing (position) signal along the frame axis.
+
+  Half of the channels hold sin(position * inv_timescale) and the other half
+  hold cos(position * inv_timescale), where the inverse timescales are
+  geometrically spaced between 1/MIN_TIMESCALE and 1/MAX_TIMESCALE.
+
+  Args:
+    num_frames: number of positions (frames) to generate a signal for.
+    video_channels: channel size of the feature the signal is built for.
+
+  Returns:
+    A float32 tensor of shape [1, num_frames, video_channels]. When
+    video_channels is odd, the last channel is zero padding.
+  """
+  # There is a single positional axis (frames), so each timescale takes two
+  # channels: one sine and one cosine.
+  num_timescales = video_channels // 2
+
+  position = torch.arange(num_frames, dtype=torch.float32).unsqueeze(1)
+
+  # Clamp the denominator: with a single timescale (2 or 3 channels) the
+  # unclamped `num_timescales - 1` is zero and the division would raise.
+  log_timescale_increment = (
+      math.log(MAX_TIMESCALE / MIN_TIMESCALE) / max(num_timescales - 1, 1))
+  inv_timescales = torch.tensor(
+      [MIN_TIMESCALE * math.exp(-float(i) * log_timescale_increment)
+       for i in range(num_timescales)],
+      dtype=torch.float32).unsqueeze(0)
+
+  # [num_frames, 1] x [1, num_timescales] -> [num_frames, num_timescales]
+  scaled_time = position.matmul(inv_timescales)
+  signal = torch.cat([scaled_time.sin(), scaled_time.cos()], dim=1)
+
+  # Zero-pad the channel axis so the result always has `video_channels`
+  # channels, even when that number is odd.
+  leftover = video_channels - signal.shape[1]
+  if leftover > 0:
+    padding = torch.zeros(num_frames, leftover, dtype=signal.dtype)
+    signal = torch.cat([signal, padding], dim=1)
+
+  return signal.unsqueeze(0)
diff --git a/models/jsfusion/data_util.py b/models/jsfusion/data_util.py
@@ -0,0 +1,211 @@
+"""Utility class used in JSFusion model, copied from the original author's code
+https://github.com/yj-yu/lsmdc/blob/master/videocap/datasets/data_util.py
+"""
+import time
+import numpy as np
+import re
+
+
+def clean_str(string, downcase=True):
+  """Cleans a raw sentence so that it can be tokenized with str.split().
+
+  Characters outside the allowed set are replaced by spaces, English
+  contractions and the punctuation marks , ! ( ) ? are split off as separate
+  tokens, and runs of whitespace are collapsed. The replacement strings for
+  parentheses and the question mark carry a leading backslash, so those
+  tokens come out backslash-prefixed.
+
+  Taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
+
+  Args:
+    string: the text to clean.
+    downcase: when True, the result is lower-cased.
+
+  Returns:
+    The cleaned string with surrounding whitespace stripped.
+  """
+  # (pattern, replacement) pairs, applied strictly in this order.
+  substitutions = (
+      (r"[^A-Za-z0-9(),!?\'\`(_____)]", " "),
+      (r"\'s", " \'s"),
+      (r"\'ve", " \'ve"),
+      (r"n\'t", " n\'t"),
+      (r"\'re", " \'re"),
+      (r"\'d", " \'d"),
+      (r"\'ll", " \'ll"),
+      (r",", " , "),
+      (r"!", " ! "),
+      (r"\(", " \( "),
+      (r"\)", " \) "),
+      (r"\?", " \? "),
+      (r"\s{2,}", " "),
+  )
+  for pattern, replacement in substitutions:
+    string = re.sub(pattern, replacement, string)
+
+  cleaned = string.strip()
+  if downcase:
+    return cleaned.lower()
+  return cleaned
+
+def recover_word(string):
+  """Re-attaches a detached possessive ('s) and comma to the preceding token.
+
+  Only the " 's" and " ," spacings are rewritten; the rest of the string is
+  returned unchanged.
+  """
+  for pattern, replacement in ((r" \'s", "\'s"), (r" ,", ",")):
+    string = re.sub(pattern, replacement, string)
+  return string
+
+def clean_blank(blank_sent):
+  """Tokenizes a fill-in-the-blank sentence, mapping each blank to <START>.
+
+  The sentence is cleaned with clean_str() and split on whitespace; every
+  '_____' token (the answer position in FIB work) becomes '<START>'.
+
+  Returns:
+    A list of string tokens.
+  """
+  replacements = {'_____': '<START>'}
+  tokens = clean_str(blank_sent).split()
+  return [replacements.get(token, token) for token in tokens]
+
+
+def clean_root(string):
+  """Placeholder hook for cleaning a root word.
+
+  Currently performs no cleaning: the input is returned as-is.
+  """
+  return string
+
+
+def pad_sequences(sequences, pad_token="[PAD]", pad_location="LEFT", max_length=None):
+  """Pads (or truncates) all sequences to the same length.
+
+  Args:
+    sequences: a list of token lists.
+    pad_token: the token used to fill short sequences.
+    pad_location: "LEFT" to prepend padding, "RIGHT" to append it.
+    max_length: target length. When falsy (None or 0), the length of the
+      longest sequence is used.
+
+  Returns:
+    A list with one entry per input sequence, each of length max_length.
+    Sequences longer than max_length are truncated by dropping trailing
+    tokens. Sequences that already have the target length are returned as
+    the same list object (not copied).
+
+  Raises:
+    ValueError: if a sequence needs padding and pad_location is neither
+      "LEFT" nor "RIGHT".
+  """
+  # Nothing to pad; also avoids calling max() on an empty input below.
+  if len(sequences) == 0:
+    return []
+
+  if not max_length:
+    max_length = max(len(x) for x in sequences)
+
+  result = []
+  for sentence in sequences:
+    num_padding = max_length - len(sentence)
+    if num_padding == 0:
+      new_sentence = sentence
+    elif num_padding < 0:
+      # num_padding is negative here, so this drops the surplus tail tokens.
+      new_sentence = sentence[:num_padding]
+    elif pad_location == "RIGHT":
+      new_sentence = sentence + [pad_token] * num_padding
+    elif pad_location == "LEFT":
+      new_sentence = [pad_token] * num_padding + sentence
+    else:
+      # Fail loudly instead of appending an undefined or stale sentence.
+      raise ValueError("Invalid pad_location. Specify LEFT or RIGHT.")
+    result.append(new_sentence)
+  return result
+
+
+def convert_sent_to_index(sentence, word_to_index):
+  """Maps each word of a tokenized sentence to its vocabulary index.
+
+  Words that are missing from word_to_index are mapped to index 0.
+  """
+  return [word_to_index.get(token, 0) for token in sentence]
+
+
+def batch_iter(data, batch_size, seed=None, fill=True):
+  """Yields shuffled batches of `data`, one pass over the dataset.
+
+  Args:
+    data: an indexable collection of examples.
+    batch_size: number of examples per batch.
+    seed: seed for the numpy RandomState that drives the shuffling.
+    fill: when True, the final batch is topped up to batch_size with
+      randomly drawn examples (with replacement).
+
+  Yields:
+    Lists of examples taken from `data`.
+  """
+  rng = np.random.RandomState(seed)
+  total = len(data)
+
+  # Number of batches, counting a trailing partial batch as one more.
+  whole_batches, leftover = divmod(total, batch_size)
+  batch_count = whole_batches + 1 if leftover != 0 else whole_batches
+
+  # One random ordering of all example indices for this pass.
+  order = rng.permutation(np.arange(total))
+  for batch_index in range(batch_count):
+    lo = batch_index * batch_size
+    hi = min(lo + batch_size, total)
+    chosen = order[lo:hi]
+    # The last batch may be short; top it up with random indices if asked.
+    if fill and hi >= total:
+      shortfall = batch_size - len(chosen)
+      chosen = np.concatenate([chosen, rng.randint(0, total, shortfall)])
+    yield [data[i] for i in chosen]
+
+
+def fsr_iter(fsr_data, batch_size, random_seed=42, fill=True):
+  """Iterates over (features, sentences, roots) batches.
+
+  fsr_data: one of LSMDCData.build_data(), [[video_features], [sentences], [roots]]
+  Each iteration yields the batch regrouped per field, i.e. a zip over
+  [feature]*batch_size, [sentences]*batch_size, [roots]*batch_size.
+
+  Usage:
+    train_data, val_data, test_data = LSMDCData.build_data()
+    for features, sentences, roots in fsr_iter(train_data, 20, 10):
+      feed_dict = {model.video_feature : features,
+                   model.sentences : sentences,
+                   model.roots : roots}
+  """
+  # Turn the per-field lists into one list of per-example tuples.
+  examples = list(zip(*fsr_data))
+  batches = batch_iter(examples, batch_size, fill=fill, seed=random_seed)
+  # Transpose every batch back into per-field groups, lazily.
+  return (zip(*batch) for batch in batches)
+
+
+def preprocess_sents(descriptions, word_to_index, max_length):
+  """Turns raw description strings into fixed-length lists of word indices.
+
+  Args:
+    descriptions: an iterable of sentence strings.
+    word_to_index: vocabulary mapping from word to integer index.
+    max_length: length every sentence is padded or truncated to.
+
+  Returns:
+    A list of index lists, one per description.
+  """
+  # Clean and tokenize each sentence.
+  tokenized = [clean_str(sent).split() for sent in descriptions]
+  # Bring every sentence to the same length, using pad_sequences' default
+  # pad token and pad location.
+  padded = pad_sequences(tokenized, max_length=max_length)
+  # Convert sentences from lists of strings to lists of indices (int).
+  indexed = [convert_sent_to_index(sent, word_to_index) for sent in padded]
+
+  return indexed
+  # remove punctuation mark and special chars from root.
+
+
+def preprocess_roots(roots, word_to_index):
+  """Converts root words to vocabulary indices.
+
+  Each root is passed through clean_root() and then looked up in
+  word_to_index; roots that are not in the vocabulary map to index 0.
+  """
+  cleaned = (clean_root(root) for root in roots)
+  # Convert string to int index.
+  return [word_to_index.get(root, 0) for root in cleaned]
+
+
+def pad_video(video_feature, dimension, padded_feature=None):
+  """Fits a video feature to a fixed number of frames, padding on the left.
+
+  video = [pad,..., pad, frm1, frm2, ..., frmN]
+
+  Args:
+    video_feature: array whose first axis is the frame axis.
+    dimension: target shape; dimension[0] is the target number of frames.
+    padded_feature: optional output buffer. When None, a zero-filled float32
+      array of shape `dimension` is allocated. When supplied and the video
+      is shorter than the target, only the trailing rows are overwritten;
+      the leading rows keep whatever the buffer already held.
+
+  Returns:
+    The filled buffer. Videos longer than the target are subsampled at
+    evenly spaced frame indices.
+  """
+  if padded_feature is None:
+    padded_feature = np.zeros(dimension, dtype=np.float32)
+
+  target_length = dimension[0]
+  source_length = video_feature.shape[0]
+  missing = target_length - source_length
+
+  if missing > 0:
+    # Shorter than the target: place the frames at the end of the buffer.
+    # (original author's timing note: about 0.7 sec)
+    padded_feature[missing:] = video_feature
+  elif missing < 0:
+    # Longer than the target: keep evenly spaced frames.
+    picks = np.linspace(0, source_length, num=target_length, endpoint=False, dtype=np.int32)
+    padded_feature[:] = video_feature[picks]
+  else:
+    padded_feature[:] = video_feature
+
+  return padded_feature
+
+def repeat_pad_video(video_feature, dimension):
+  """Fits a video feature to a fixed number of frames by repeating it.
+
+  A video shorter than the target is tiled as many whole times as fit. If
+  the target length is not a multiple of the video length, the remaining
+  leading slots are filled with the last frames of the video, so the output
+  ends with complete repetitions. A longer video is subsampled at evenly
+  spaced frame indices.
+
+  Args:
+    video_feature: array whose first axis is the frame axis.
+    dimension: target shape; dimension[0] is the target number of frames.
+
+  Returns:
+    A new float64 array of shape `dimension`.
+  """
+  # np.float (an alias of the builtin float) was removed from NumPy; float64
+  # is the dtype it used to resolve to.
+  padded_feature = np.zeros(dimension, dtype=np.float64)
+  max_length = dimension[0]
+  current_length = video_feature.shape[0]
+
+  if current_length == max_length:
+    padded_feature[:] = video_feature
+
+  elif current_length < max_length:
+    tile_num, remainder = divmod(max_length, current_length)
+    # Repeat along the frame axis only; all other axes are tiled once.
+    to_tile = np.ones(len(dimension), dtype=np.int32)
+    to_tile[0] = tile_num
+    tiled_vid = np.tile(video_feature, to_tile)
+    if remainder > 0:
+      padded_feature[0:remainder] = video_feature[-remainder:]
+    padded_feature[remainder:] = tiled_vid
+
+  else:
+    steps = np.linspace(0, current_length, num=max_length, endpoint=False, dtype=np.int32)
+    padded_feature[:] = video_feature[steps]
+  return padded_feature
+
+def stretch_pad_video(video_feature, dimension):
+  """Fits a video feature to a fixed number of frames by stretching it.
+
+  A video shorter than the target has each frame repeated in place and is
+  then sampled at evenly spaced indices, so frames keep their order and are
+  spread over the whole output. A longer video is subsampled at evenly
+  spaced frame indices.
+
+  Args:
+    video_feature: array whose first axis is the frame axis.
+    dimension: target shape; dimension[0] is the target number of frames.
+
+  Returns:
+    A new float64 array of shape `dimension`.
+  """
+  # np.float (an alias of the builtin float) was removed from NumPy; float64
+  # is the dtype it used to resolve to.
+  padded_feature = np.zeros(dimension, dtype=np.float64)
+  max_length = dimension[0]
+  current_length = video_feature.shape[0]
+
+  if current_length == max_length:
+    padded_feature[:] = video_feature
+  elif current_length < max_length:
+    # Smallest per-frame repeat count that yields at least max_length frames.
+    repeat_num = (max_length - 1) // current_length + 1
+    tiled_vid = np.repeat(video_feature, repeat_num, 0)
+    steps = np.linspace(0, repeat_num * current_length, num=max_length, endpoint=False, dtype=np.int32)
+    padded_feature[:] = tiled_vid[steps]
+  else:
+    steps = np.linspace(0, current_length, num=max_length, endpoint=False, dtype=np.int32)
+    padded_feature[:] = video_feature[steps]
+  return padded_feature
+
+
+def fill_mask(max_length, current_length, zero_location='LEFT'):
+  """Builds a 0/1 mask marking which of max_length slots hold real data.
+
+  Args:
+    max_length: total number of slots in the mask.
+    current_length: number of slots that hold real data.
+    zero_location: 'LEFT' puts the zeros (padding) first, 'RIGHT' puts them
+      last.
+
+  Returns:
+    A float numpy array of length max_length with ones on real slots and
+    zeros on padded slots. When current_length >= max_length, all ones.
+
+  Raises:
+    ValueError: if padding is needed and zero_location is neither 'LEFT'
+      nor 'RIGHT'.
+  """
+  num_padding = max_length - current_length
+  if num_padding <= 0:
+    return np.ones(max_length)
+
+  if zero_location == 'LEFT':
+    mask = np.ones(max_length)
+    mask[:num_padding] = 0
+  elif zero_location == 'RIGHT':
+    mask = np.zeros(max_length)
+    # Clamp so a negative length marks nothing instead of slicing from the end.
+    mask[:max(current_length, 0)] = 1
+  else:
+    # Fail loudly instead of returning an unbound `mask`.
+    raise ValueError("Invalid zero_location. Specify LEFT or RIGHT.")
+
+  return mask
diff --git a/models/jsfusion/model.py b/models/jsfusion/model.py
@@ -0,0 +1,121 @@
+from models.jsfusion.module import ResNetFeatureExtractor
+from models.jsfusion.module import MCModel
+from models.jsfusion.sampler import FixedSampler
+
+from runner_model import RunnerModel
+from video_path_provider import VideoPathIterator
+from itertools import cycle
+from torchvision import transforms
+import torch
+import nvvl
+import os
+
+class JsFusionVideoPathIterator(VideoPathIterator):
+  """Endlessly cycles over the video files under $LSMDC_PATH/mp4s.
+
+  Every directory entry is used as-is (no filtering by extension), in the
+  order returned by os.listdir().
+  """
+  def __init__(self):
+    super(JsFusionVideoPathIterator, self).__init__()
+
+    video_dir = os.path.join(os.environ['LSMDC_PATH'], 'mp4s')
+    videos = [os.path.join(video_dir, name) for name in os.listdir(video_dir)]
+
+    if not videos:
+      raise Exception('No video available.')
+
+    # A single shared cycle: every iter() call continues from where the
+    # previous consumer stopped.
+    self.videos_iter = cycle(videos)
+
+  def __iter__(self):
+    return self.videos_iter
+
+class JsFusionLoader(RunnerModel):
+  """Impl of loading video frames using NVVL, for the JSFusion model.
+
+  Each call loads one video, keeps the last batch of frames produced by the
+  NVVL loader, and returns the frames scaled to [0, 1] and normalized with
+  the mean/std constants below, together with the video's file name.
+  """
+  def __init__(self, device):
+    # Initialize the base class the same way the other runners in this file do.
+    super(JsFusionLoader, self).__init__(device)
+    self.loader = nvvl.RnBLoader(width=224, height=224,
+                                 consecutive_frames=1, device_id=device.index,
+                                 sampler=FixedSampler(num_frames=40))
+
+    # Per-channel normalization applied to every frame.
+    self.transform = transforms.Normalize(mean=[0.485, 0.456, 0.406],
+                                          std=[0.229, 0.224, 0.225])
+
+    samples = [
+        os.path.join(os.environ['LSMDC_PATH'], 'mp4s/1004_Juno_00.00.32.849-00.00.35.458.mp4'),
+        os.path.join(os.environ['LSMDC_PATH'], 'mp4s/1004_Juno_00.00.35.642-00.00.45.231.mp4'),
+        os.path.join(os.environ['LSMDC_PATH'], 'mp4s/1004_Juno_00.00.49.801-00.00.59.450.mp4')]
+
+    # warm up the loader by decoding a few sample videos
+    for sample in samples:
+      self.loader.loadfile(sample)
+    for frames in self.loader:
+      pass
+    self.loader.flush()
+
+  def __call__(self, input):
+    """Loads the frames of one video.
+
+    Args:
+      input: a pair whose second element is the path of the video file; the
+        first element is ignored.
+
+    Returns:
+      A (frames, filename) tuple, where filename is the base name of the path.
+
+    Raises:
+      RuntimeError: if the loader produced no frames for the file.
+    """
+    _, file_path = input
+    self.loader.loadfile(file_path)
+    frames = None
+    for frames in self.loader:
+      pass
+    self.loader.flush()
+
+    if frames is None:
+      # Report the offending file instead of failing on an unbound `frames`.
+      raise RuntimeError('No frames were loaded from %s' % file_path)
+
+    # frames: (40, 3, 1, 224, 224) according to the original author
+    frames = frames.float()
+    frames = frames.permute(0, 2, 1, 3, 4)
+
+    frames_tmp = []
+    for frame in frames:
+      frame = torch.squeeze(frame)
+      # Out-of-place division, so the loader's own buffer is never modified
+      # even when float() returned the same tensor.
+      frame = frame / 255
+      frame = self.transform(frame)
+      frames_tmp.append(frame)
+    frames = torch.stack(frames_tmp)
+    # frames: (40, 3, 224, 224)
+
+    filename = os.path.basename(file_path)
+    out = (frames, filename)
+    return out
+
+  def __del__(self):
+    # __init__ may have failed before the loader was created.
+    loader = getattr(self, 'loader', None)
+    if loader is not None:
+      loader.close()
+
+  def input_shape(self):
+    return None
+
+  @staticmethod
+  def output_shape():
+    return ((40, 3, 224, 224),)
+
+
+class ResNetRunner(RunnerModel):
+  """Pipeline stage that runs ResNetFeatureExtractor in eval mode.
+
+  Declared shapes: input ((40, 3, 224, 224),), output ((1, 40, 2048),).
+  """
+  def __init__(self, device, num_frames=40):
+    super(ResNetRunner, self).__init__(device)
+    extractor = ResNetFeatureExtractor(num_frames).to(device)
+    extractor.float()
+    extractor.eval()
+    self.model = extractor
+
+  def __call__(self, input):
+    """Feeds `input` to the wrapped model and returns its output."""
+    return self.model(input)
+
+  def input_shape(self):
+    return ((40, 3, 224, 224),)
+
+  @staticmethod
+  def output_shape():
+    return ((1, 40, 2048),)
+
+
+class MCModelRunner(RunnerModel):
+  """Pipeline stage that runs MCModel in eval mode.
+
+  Declared shapes: input ((1, 40, 2048),), output ((1,),).
+  """
+  def __init__(self, device, num_frames=40):
+    # num_frames is accepted but not used by this runner.
+    super(MCModelRunner, self).__init__(device)
+    mc_model = MCModel(device).to(device)
+    mc_model.float()
+    mc_model.eval()
+    self.model = mc_model
+
+  def __call__(self, input):
+    """Feeds `input` to the wrapped model and returns its output."""
+    return self.model(input)
+
+  def input_shape(self):
+    return ((1, 40, 2048),)
+
+  @staticmethod
+  def output_shape():
+    return ((1,),)
+