# model.py
import torch
import torch.nn as nn
import torchvision.models
from torch.nn.utils.rnn import pack_padded_sequence
from torch.nn.utils.rnn import pad_packed_sequence

MODEL_LIST = ["resnet18"]


class CNN(nn.Module):
    """
    Class which implements the backbone CNN model
    Arguments:
        - model_name (str): CNN model architecture (currently only resnet18 is available)
        - num_class (int): should be None when the CNN is used as the backbone network for either LRCN or ODCNN
        - dropout (float): a probability for the last dropout layer
        - pretrained (bool): specifies whether to use the pretrained model or not
    """
    def __init__(self, model_name, num_class=None, dropout=0.0, pretrained=True):
        super(CNN, self).__init__()
        self.model_name = model_name
        if model_name == "resnet18":
            input_size = (3, 224, 224)
            ft_extractor = torchvision.models.resnet18(pretrained=pretrained)  # spatial feature extraction layers of the CNN
            ft_size = ft_extractor.fc.in_features
            ft_extractor.fc = nn.Identity()  # Nullify the last linear layer
        else:
            raise Exception("The model '%s' is not supported. Choose a model from: %s" % (model_name, ', '.join(MODEL_LIST)))
        self.input_size = input_size
        self.ft_extractor = ft_extractor
        self.ft_size = ft_size
        if dropout > 0.0:
            self.dropout = nn.Dropout(p=dropout)  # The last dropout layer
        else:
            self.dropout = nn.Identity()
        self.output_size = num_class if num_class is not None else ft_size
        if num_class is not None:
            # When num_class is not None, this CNN model is used as a stand-alone model to classify each frame separately
            self.classifier = nn.Linear(ft_size, num_class)  # Construct the last linear layer
            self.ex_ft_mode = False  # It is not just the feature extraction model
        else:
            # This CNN model is used as a backbone network for LRCN or ODCNN
            self.classifier = None
            self.ex_ft_mode = True  # It is just the feature extraction model
        print(self.model_name, "used")

    def forward(self, inputs):  # inputs: T(B, *input_size) --> outputs: T(B, output_size)
        # Return the final output of the model; if ex_ft_mode is True, this method is equivalent to extract_feature()
        outputs = self.ft_extractor(inputs)
        outputs = self.dropout(outputs)
        if not self.ex_ft_mode:
            outputs = self.classifier(outputs)
        return outputs

    def extract_feature(self, inputs):  # inputs: T(B, *input_size) --> outputs: T(B, ft_size)
        # Return only spatial features
        return self.dropout(self.ft_extractor(inputs))

    def extract_feature_mode(self, bool_):
        # Change the feature extraction mode flag to bool_ and return the flag that the model previously had
        org_mode = self.ex_ft_mode
        if self.classifier is not None:
            self.ex_ft_mode = bool_
        return org_mode
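

# --- Illustrative usage sketch (not part of the original file) ---
# A minimal smoke test of the CNN backbone, assuming resnet18's 512-dim feature
# output and 3x224x224 inputs; pretrained=False here only to avoid a weight download.
def _demo_cnn_backbone():
    cnn = CNN("resnet18", num_class=None, dropout=0.5, pretrained=False)
    frames = torch.randn(2, 3, 224, 224)   # a batch of 2 RGB frames
    feats = cnn(frames)                     # ex_ft_mode is True, so this returns T(2, ft_size)
    assert feats.shape == (2, cnn.ft_size)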
class TempCNN(nn.Module):
    """
    Temporal 1-dimensional CNN kernel for the model, ODCNN
    Arguments:
        - in_feature_size (int)
        - out_feature_size (int)
        - kernel_size (int): 1D kernel size
    """
    def __init__(self, in_feature_size, out_feature_size, kernel_size):
        super(TempCNN, self).__init__()
        self.in_ftrs = in_feature_size
        self.out_ftrs = out_feature_size
        self.ks = kernel_size
        self.tconv = nn.Conv1d(in_channels=in_feature_size, out_channels=out_feature_size, kernel_size=kernel_size)  # 1D convolutional layer
        self.relu = nn.ReLU(True)

    def forward(self, inputs):  # inputs: T(B, in_feature_size, n_frame)
        outputs = self.relu(self.tconv(inputs))  # T(B, out_channels, n_frame - kernel_size + 1)
        outputs, _ = torch.max(outputs, dim=-1)  # T(B, out_channels), max pooling over time
        return outputs
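

# --- Illustrative usage sketch (not part of the original file) ---
# Shows the shape contract of TempCNN: a 1D convolution over the time axis
# followed by max pooling over time. The 512/256/16 sizes are assumptions.
def _demo_temp_cnn():
    tcnn = TempCNN(in_feature_size=512, out_feature_size=256, kernel_size=4)
    feats = torch.randn(2, 512, 16)   # T(B, in_feature_size, n_frame)
    pooled = tcnn(feats)              # T(B, out_feature_size) after max pooling over time
    assert pooled.shape == (2, 256)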
class LRCN(nn.Module):
    """
    LRCN model
    Arguments:
        - hidden_size (int): the hidden dimension for the LSTM
        - action_size (int): the number of action categories
        - seq_len (int): unit video clip length
        - cnn_model (str): model architecture for the backbone CNN model
        - ft_size (int): the dimension of the spatial features generated from the backbone network
        - lstm_dropout (float): a probability for the dropout layer applied to the output of the LSTM
        - cnn_dropout (float): a probability for the last dropout layer of the backbone CNN model
    """
    def __init__(self, hidden_size, action_size, seq_len=16, cnn_model=None, ft_size=None, lstm_dropout=0.0, cnn_dropout=0.0):
        super(LRCN, self).__init__()
        self.action_size = action_size
        self.seq_len = seq_len
        self.hidden_size = hidden_size
        cnn = None
        input_size = -1
        if cnn_model is not None:
            cnn = CNN(cnn_model, dropout=cnn_dropout)  # Construct the backbone CNN model
        else:
            raise Exception("CNN model should be specified")
        ft_size = cnn.ft_size
        input_size = cnn.input_size
        self.cnn = cnn
        self.lstm = nn.LSTM(input_size=ft_size, hidden_size=hidden_size, batch_first=True)
        if lstm_dropout > 0.0:
            self.dropout = nn.Dropout(p=lstm_dropout)
        else:
            self.dropout = nn.Identity()
        self.linear_out = nn.Linear(hidden_size, action_size, bias=True)
        self.softmax = nn.Softmax(dim=2)
        self.input_size = input_size
        self.ft_size = ft_size

    def forward(self, inputs):  # inputs: T(B, seq_len, *input_size) --> outputs: T(B, seq_len, action_size)
        '''
        Take video clips and return logits for each frame in the video clips
        '''
        inputs = inputs.view(-1, *self.cnn.input_size)
        inputs = self.cnn.extract_feature(inputs)  # Extract spatial features
        inputs = inputs.view(-1, self.seq_len, self.ft_size)
        return self.forward_lstm(inputs)  # Process through the LSTM

    def forward_lstm(self, inputs):  # inputs: T(B, seq_len, ft_size) --> outputs: T(B, seq_len, action_size)
        '''
        Take spatial features and return logits for each frame in the video clips
        '''
        self.lstm.flatten_parameters()
        lstm_outputs, _ = self.lstm(inputs)
        lstm_outputs = self.dropout(lstm_outputs)
        outputs = self.linear_out(lstm_outputs)
        return outputs
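

# --- Illustrative usage sketch (not part of the original file) ---
# Per-frame classification with LRCN: frames pass through the CNN backbone, then
# through the LSTM. hidden_size/action_size/batch size below are assumptions;
# constructing LRCN this way downloads pretrained resnet18 weights.
def _demo_lrcn():
    model = LRCN(hidden_size=256, action_size=10, seq_len=16, cnn_model="resnet18")
    clips = torch.randn(2, 16, 3, 224, 224)   # T(B, seq_len, *input_size)
    logits = model(clips)                      # T(B, seq_len, action_size), one logit vector per frame
    assert logits.shape == (2, 16, 10)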
class ODCNN(nn.Module):
    """
    1D CNN model
    Arguments:
        - hidden_size (int): the hidden dimension for temporal features
        - action_size (int): the number of action categories
        - seq_len (int): unit video clip length
        - cnn_model (str): model architecture for the backbone CNN model
        - dropout2 (float): a probability for the dropout layer applied to the output of the 1D CNN
        - dropout1 (float): a probability for the last dropout layer of the backbone CNN model
        - t_kernel_size (int): the temporal kernel size
    """
    def __init__(self, hidden_size, action_size, seq_len=16, cnn_model=None, dropout2=0.0, dropout1=0.0, t_kernel_size=4):
        super(ODCNN, self).__init__()
        self.action_size = action_size
        self.seq_len = seq_len
        self.hidden_size = hidden_size
        self.temp_kernel = t_kernel_size
        cnn = None
        input_size = -1
        if cnn_model is not None:
            cnn = CNN(cnn_model, dropout=dropout1)  # Construct the backbone CNN model
        else:
            raise Exception("CNN model should be specified")
        ft_size = cnn.ft_size
        input_size = cnn.input_size
        self.cnn = cnn
        self.tcnn = TempCNN(ft_size, hidden_size, t_kernel_size)  # Construct the temporal convolutional layer
        self.dropout = nn.Dropout(p=dropout2) if dropout2 > 0.0 else nn.Identity()
        self.linear_out = nn.Linear(hidden_size, action_size, bias=True)
        self.input_size = input_size
        self.ft_size = ft_size

    def forward(self, inputs):  # inputs: T(B, seq_len, *input_size) --> outputs: T(B, action_size)
        '''
        Take video clips and return one set of logits per video clip
        '''
        inputs = inputs.view(-1, *self.cnn.input_size)  # T(B * seq_len, *IMG_SIZE)
        inputs = self.cnn.extract_feature(inputs)  # T(B * seq_len, ft_size), extract spatial features
        return self.process_feature(inputs)  # Process through the temporal kernel

    def process_feature(self, inputs):  # inputs: T(B * seq_len, ft_size) --> outputs: T(B, action_size)
        '''
        Take spatial features and return clip-level logits
        '''
        inputs = torch.transpose(inputs.view(-1, self.seq_len, self.ft_size), 1, 2)  # T(B, ft_size, seq_len)
        inputs = self.tcnn(inputs)  # T(B, hidden_size)
        return self.linear_out(self.dropout(inputs))  # T(B, action_size)
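

# --- Illustrative usage sketch (not part of the original file) ---
# Clip-level classification with ODCNN: per-frame spatial features, a temporal 1D
# convolution, max pooling over time, then a single logit vector per clip. The
# chosen sizes and batch size are assumptions; the backbone weights are downloaded.
def _demo_odcnn():
    model = ODCNN(hidden_size=256, action_size=10, seq_len=16, cnn_model="resnet18", t_kernel_size=4)
    clips = torch.randn(2, 16, 3, 224, 224)   # T(B, seq_len, *input_size)
    logits = model(clips)                      # T(B, action_size), one logit vector per clip
    assert logits.shape == (2, 10)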