@article{LeNet,
author={Lecun, Y. and Bottou, L. and Bengio, Y. and Haffner, P.},
journal={Proceedings of the IEEE},
title={Gradient-based learning applied to document recognition},
year={1998},
volume={86},
number={11},
pages={2278--2324},
keywords={Neural networks;Pattern recognition;Machine learning;Optical character recognition software;Character recognition;Feature extraction;Multi-layer neural network;Optical computing;Hidden Markov models;Principal component analysis},
doi={10.1109/5.726791}}
@article{AlexNet,
author = {Krizhevsky, Alex and Sutskever, Ilya and Hinton, Geoffrey E.},
title = {ImageNet classification with deep convolutional neural networks},
year = {2017},
issue_date = {June 2017},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
volume = {60},
number = {6},
issn = {0001-0782},
url = {https://doi.org/10.1145/3065386},
doi = {10.1145/3065386},
abstract = {We trained a large, deep convolutional neural network to classify the 1.2 million high-resolution images in the ImageNet LSVRC-2010 contest into the 1000 different classes. On the test data, we achieved top-1 and top-5 error rates of 37.5\% and 17.0\%, respectively, which is considerably better than the previous state-of-the-art. The neural network, which has 60 million parameters and 650,000 neurons, consists of five convolutional layers, some of which are followed by max-pooling layers, and three fully connected layers with a final 1000-way softmax. To make training faster, we used non-saturating neurons and a very efficient GPU implementation of the convolution operation. To reduce overfitting in the fully connected layers we employed a recently developed regularization method called "dropout" that proved to be very effective. We also entered a variant of this model in the ILSVRC-2012 competition and achieved a winning top-5 test error rate of 15.3\%, compared to 26.2\% achieved by the second-best entry.},
journal = {Commun. ACM},
month = may,
pages = {84--90},
numpages = {7}
}
@article{ILSVRC,
author = {Olga Russakovsky and Jia Deng and Hao Su and Jonathan Krause and Sanjeev Satheesh and Sean Ma and Zhiheng Huang and Andrej Karpathy and Aditya Khosla and Michael Bernstein and Alexander C. Berg and Li Fei-Fei},
title = {{ImageNet Large Scale Visual Recognition Challenge}},
year = {2015},
journal = {International Journal of Computer Vision (IJCV)},
doi = {10.1007/s11263-015-0816-y},
volume={115},
number={3},
pages={211--252}
}
@inproceedings{KTH,
author={Schuldt, C. and Laptev, I. and Caputo, B.},
booktitle={Proceedings of the 17th International Conference on Pattern Recognition, 2004. ICPR 2004.},
title={Recognizing human actions: a local SVM approach},
year={2004},
volume={3},
pages={32--36},
keywords={Humans;Support vector machines;Computer vision;Pattern recognition;Support vector machine classification;Cameras;Frequency;Spatial databases;Performance evaluation;Image recognition},
doi={10.1109/ICPR.2004.1334462}
}
@inproceedings{Weizmann,
author={Blank, M. and Gorelick, L. and Shechtman, E. and Irani, M. and Basri, R.},
booktitle={Tenth IEEE International Conference on Computer Vision (ICCV'05) Volume 1},
title={Actions as space-time shapes},
year={2005},
volume={2},
pages={1395--1402},
keywords={Shape;Humans;Video sequences;Torso;Poisson equations;Computer vision;Information analysis;Optical computing;Image motion analysis;Motion analysis},
doi={10.1109/ICCV.2005.28}
}
@inproceedings{IXMAS,
author={Weinland, Daniel and Boyer, Edmond and Ronfard, Remi},
booktitle={2007 IEEE 11th International Conference on Computer Vision},
title={Action Recognition from Arbitrary Views using 3D Exemplars},
year={2007},
pages={1--7},
keywords={Cameras;Hidden Markov models;Humans;Image recognition;Image reconstruction;Solid modeling;Layout;Parametric statistics;Kinematics;Image motion analysis},
doi={10.1109/ICCV.2007.4408849}
}
@inproceedings{HollyWood,
author={Laptev, Ivan and Marszalek, Marcin and Schmid, Cordelia and Rozenfeld, Benjamin},
booktitle={2008 IEEE Conference on Computer Vision and Pattern Recognition},
title={Learning realistic human actions from movies},
year={2008},
pages={1--8},
keywords={Humans;Motion pictures;Image recognition;Video sharing;Layout;Text categorization;Object recognition;Robustness;Clothing;Cameras},
doi={10.1109/CVPR.2008.4587756}
}
@inproceedings{UCFSports,
author={Rodriguez, Mikel D. and Ahmed, Javed and Shah, Mubarak},
booktitle={2008 IEEE Conference on Computer Vision and Pattern Recognition},
title={Action MACH a spatio-temporal Maximum Average Correlation Height filter for action recognition},
year={2008},
pages={1--8},
keywords={Humans;Image motion analysis;Data analysis;Fourier transforms;Optical films;Optical filters;Computer vision;Spatiotemporal phenomena;Frequency domain analysis;Computational efficiency},
doi={10.1109/CVPR.2008.4587727}
}
@inproceedings{Hollywood2,
author={Marszalek, Marcin and Laptev, Ivan and Schmid, Cordelia},
booktitle={2009 IEEE Conference on Computer Vision and Pattern Recognition},
title={Actions in context},
year={2009},
pages={2929--2936},
keywords={Layout;Motion pictures;Humans;Roads;Surveillance;Text mining;Testing;Support vector machines;Support vector machine classification;Scalability},
doi={10.1109/CVPR.2009.5206557}
}
@inproceedings{UCFYouTube,
author={Liu, Jingen and Luo, Jiebo and Shah, Mubarak},
booktitle={2009 IEEE Conference on Computer Vision and Pattern Recognition},
title={Recognizing realistic actions from videos “in the wild”},
year={2009},
pages={1996--2003},
keywords={Videos;Cameras;YouTube;Humans;Feature extraction;Motion pictures;Shape;Spatiotemporal phenomena;Computer vision;Vocabulary},
doi={10.1109/CVPR.2009.5206744}
}
@inproceedings{Olympic,
author={Niebles, Juan Carlos and Chen, Chih-Wei and Fei-Fei, Li},
editor={Daniilidis, Kostas and Maragos, Petros and Paragios, Nikos},
title={Modeling Temporal Structure of Decomposable Motion Segments for Activity Classification},
booktitle={Computer Vision -- ECCV 2010},
year={2010},
publisher={Springer Berlin Heidelberg},
address={Berlin, Heidelberg},
pages={392--405},
abstract={Much recent research in human activity recognition has focused on the problem of recognizing simple repetitive (walking, running, waving) and punctual actions (sitting up, opening a door, hugging). However, many interesting human activities are characterized by a complex temporal composition of simple actions. Automatic recognition of such complex actions can benefit from a good understanding of the temporal structures. We present in this paper a framework for modeling motion by exploiting the temporal structure of the human activities. In our framework, we represent activities as temporal compositions of motion segments. We train a discriminative model that encodes a temporal decomposition of video sequences, and appearance models for each motion segment. In recognition, a query video is matched to the model according to the learned appearances and motion segment decomposition. Classification is made based on the quality of matching between the motion segment classifiers and the temporal segments in the query sequence. To validate our approach, we introduce a new dataset of complex Olympic Sports activities. We show that our algorithm performs better than other state of the art methods.},
isbn={978-3-642-15552-9}
}
@inproceedings{HMDB51,
author={Kuehne, H. and Jhuang, H. and Garrote, E. and Poggio, T. and Serre, T.},
booktitle={2011 International Conference on Computer Vision},
title={HMDB: A large video database for human motion recognition},
year={2011},
pages={2556--2563},
keywords={Cameras;YouTube;Databases;Training;Visualization;Humans;Motion pictures},
doi={10.1109/ICCV.2011.6126543}
}
@misc{UCF101,
title={UCF101: A Dataset of 101 Human Actions Classes From Videos in The Wild},
author={Khurram Soomro and Amir Roshan Zamir and Mubarak Shah},
year={2012},
eprint={1212.0402},
archivePrefix={arXiv},
primaryClass={cs.CV}
}
@article{PASCALVOC,
author = {Everingham, Mark and Van Gool, Luc and Williams, Christopher and Winn, John and Zisserman, Andrew},
year = {2010},
month = jun,
pages = {303--338},
title = {The Pascal Visual Object Classes (VOC) challenge},
volume = {88},
journal = {International Journal of Computer Vision},
doi = {10.1007/s11263-009-0275-4}
}
@misc{faster-rcnn,
title={Faster R-CNN: Towards Real-Time Object Detection with Region Proposal Networks},
author={Shaoqing Ren and Kaiming He and Ross Girshick and Jian Sun},
year={2016},
eprint={1506.01497},
archivePrefix={arXiv},
primaryClass={cs.CV}
}
@misc{personembedding,
title={PersonNet: Person Re-identification with Deep Convolutional Neural Networks},
author={Lin Wu and Chunhua Shen and Anton van den Hengel},
year={2016},
eprint={1601.07255},
archivePrefix={arXiv},
primaryClass={cs.CV}
}
@article{TheHungarian,
title={The Hungarian method for the assignment problem},
author={Kuhn, Harold W.},
journal={Naval Research Logistics Quarterly},
year={1955},
volume={2},
number={1--2},
pages={83--97},
url={https://api.semanticscholar.org/CorpusID:9426884}
}
@misc{AVA,
title={AVA: A Video Dataset of Spatio-temporally Localized Atomic Visual Actions},
author={Chunhui Gu and Chen Sun and David A. Ross and Carl Vondrick and Caroline Pantofaru and Yeqing Li and Sudheendra Vijayanarasimhan and George Toderici and Susanna Ricco and Rahul Sukthankar and Cordelia Schmid and Jitendra Malik},
year={2018},
eprint={1705.08421},
archivePrefix={arXiv},
primaryClass={cs.CV}
}
@inproceedings{Sports1M,
title = {Large-scale Video Classification with Convolutional Neural Networks},
author = {Andrej Karpathy and George Toderici and Sanketh Shetty and Thomas Leung and Rahul Sukthankar and Li Fei-Fei},
year = {2014},
booktitle = {2014 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)}
}
@misc{YouTube8M,
title={YouTube-8M: A Large-Scale Video Classification Benchmark},
author={Sami Abu-El-Haija and Nisarg Kothari and Joonseok Lee and Paul Natsev and George Toderici and Balakrishnan Varadarajan and Sudheendra Vijayanarasimhan},
year={2016},
eprint={1609.08675},
archivePrefix={arXiv},
primaryClass={cs.CV}
}
@misc{SomethingSomething,
title={The "something something" video database for learning and evaluating visual common sense},
author={Raghav Goyal and Samira Ebrahimi Kahou and Vincent Michalski and Joanna Materzyńska and Susanne Westphal and Heuna Kim and Valentin Haenel and Ingo Fruend and Peter Yianilos and Moritz Mueller-Freitag and Florian Hoppe and Christian Thurau and Ingo Bax and Roland Memisevic},
year={2017},
eprint={1706.04261},
archivePrefix={arXiv},
primaryClass={cs.CV}
}
@misc{MomentsinTime,
title={Moments in Time Dataset: one million videos for event understanding},
author={Mathew Monfort and Alex Andonian and Bolei Zhou and Kandan Ramakrishnan and Sarah Adel Bargal and Tom Yan and Lisa Brown and Quanfu Fan and Dan Gutfruend and Carl Vondrick and Aude Oliva},
year={2019},
eprint={1801.03150},
archivePrefix={arXiv},
primaryClass={cs.CV}
}
@inproceedings{ActivityNet,
author={Heilbron, Fabian Caba and Escorcia, Victor and Ghanem, Bernard and Niebles, Juan Carlos},
booktitle={2015 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
title={ActivityNet: A large-scale video benchmark for human activity understanding},
year={2015},
pages={961--970},
keywords={Benchmark testing;Taxonomy;Cleaning;Semantics;Organizations;Complexity theory;YouTube},
doi={10.1109/CVPR.2015.7298698}
}
@article{THUMOS,
title={The THUMOS challenge on action recognition for videos “in the wild”},
volume={155},
issn={1077-3142},
url={http://dx.doi.org/10.1016/j.cviu.2016.10.018},
doi={10.1016/j.cviu.2016.10.018},
journal={Computer Vision and Image Understanding},
publisher={Elsevier BV},
author={Idrees, Haroon and Zamir, Amir R. and Jiang, Yu-Gang and Gorban, Alex and Laptev, Ivan and Sukthankar, Rahul and Shah, Mubarak},
year={2017},
month=feb,
pages={1--23}
}
@article{Multi-THUMOS,
title={Every Moment Counts: Dense Detailed Labeling of Actions in Complex Videos},
author={Yeung, Serena and Russakovsky, Olga and Jin, Ning and Andriluka, Mykhaylo and Mori, Greg and Fei-Fei, Li},
journal={International Journal of Computer Vision},
year={2017}
}
@misc{Charades,
title={Hollywood in Homes: Crowdsourcing Data Collection for Activity Understanding},
author={Gunnar A. Sigurdsson and Gül Varol and Xiaolong Wang and Ali Farhadi and Ivan Laptev and Abhinav Gupta},
year={2016},
eprint={1604.01753},
archivePrefix={arXiv},
primaryClass={cs.CV}
}
@misc{HACS,
title={HACS: Human Action Clips and Segments Dataset for Recognition and Temporal Localization},
author={Hang Zhao and Antonio Torralba and Lorenzo Torresani and Zhicheng Yan},
year={2019},
eprint={1712.09374},
archivePrefix={arXiv},
primaryClass={cs.CV}
}
@misc{HVU,
title={Large Scale Holistic Video Understanding},
author={Ali Diba and Mohsen Fayyaz and Vivek Sharma and Manohar Paluri and Jurgen Gall and Rainer Stiefelhagen and Luc Van Gool},
year={2020},
eprint={1904.11451},
archivePrefix={arXiv},
primaryClass={cs.CV}
}
@misc{NTURGB+D,
title={NTU RGB+D: A Large Scale Dataset for 3D Human Activity Analysis},
author={Amir Shahroudy and Jun Liu and Tian-Tsong Ng and Gang Wang},
year={2016},
eprint={1604.02808},
archivePrefix={arXiv},
primaryClass={cs.CV}
}
@misc{AViD,
title={AViD Dataset: Anonymized Videos from Diverse Countries},
author={AJ Piergiovanni and Michael S. Ryoo},
year={2020},
eprint={2007.05515},
archivePrefix={arXiv},
primaryClass={cs.CV}
}
@article{FineAction,
title={FineAction: A fine-grained video dataset for temporal action localization},
author={Liu, Yi and Wang, Limin and Wang, Yali and Ma, Xiao and Qiao, Yu},
journal={IEEE Transactions on Image Processing},
year={2022},
publisher={IEEE}
}
@misc{EPICKITCHENS100,
doi = {10.5523/BRIS.2G1N6QDYDWA9U22SHPXQZP0T8M},
url = {https://data.bris.ac.uk/data/dataset/2g1n6qdydwa9u22shpxqzp0t8m/},
author = {Damen, Dima and Moltisanti, Davide and Kazakos, Evangelos and Doughty, Hazel and Munro, Jonathan and Price, William and Wray, Michael and Perrett, Tobias and Ma, Jian},
language = {en},
title = {EPIC-KITCHENS-100},
publisher = {University of Bristol},
year = {2020}
}
@inproceedings{ToyotaSmarthome,
title={Toyota Smarthome: Real-world activities of daily living},
author={Das, Srijan and Dai, Rui and Koperski, Michal and Minciullo, Luca and Garattoni, Lorenzo and Bremond, Francois and Francesca, Gianpiero},
booktitle={Proceedings of the IEEE/CVF International Conference on Computer Vision},
pages={833--842},
year={2019}
}
@article{Kinetics-400,
title={The Kinetics human action video dataset},
author={Kay, Will and Carreira, Joao and Simonyan, Karen and Zhang, Brian and Hillier, Chloe and Vijayanarasimhan, Sudheendra and Viola, Fabio and Green, Tim and Back, Trevor and Natsev, Paul and others},
journal={arXiv preprint arXiv:1705.06950},
year={2017}
}
@article{Kinetics-600,
title={A short note about Kinetics-600},
author={Carreira, Joao and Noland, Eric and Banki-Horvath, Andras and Hillier, Chloe and Zisserman, Andrew},
journal={arXiv preprint arXiv:1808.01340},
year={2018}
}
@article{Kinetics-700,
title={A short note on the Kinetics-700 human action dataset},
author={Carreira, Joao and Noland, Eric and Hillier, Chloe and Zisserman, Andrew},
journal={arXiv preprint arXiv:1907.06987},
year={2019}
}
@article{Kinetics-700-2020,
title={A short note on the Kinetics-700-2020 human action dataset},
author={Smaira, Lucas and Carreira, Jo{\~a}o and Noland, Eric and Clancy, Ellen and Wu, Amy and Zisserman, Andrew},
journal={arXiv preprint arXiv:2010.10864},
year={2020}
}
@misc{VideoMAEV2g,
title={VideoMAE V2: Scaling Video Masked Autoencoders with Dual Masking},
author={Limin Wang and Bingkun Huang and Zhiyu Zhao and Zhan Tong and Yinan He and Yi Wang and Yali Wang and Yu Qiao},
year={2023},
eprint={2303.16727},
archivePrefix={arXiv},
primaryClass={cs.CV}
}
@misc{ip-CSN-152,
title={Video Classification with Channel-Separated Convolutional Networks},
author={Du Tran and Heng Wang and Lorenzo Torresani and Matt Feiszli},
year={2019},
eprint={1904.02811},
archivePrefix={arXiv},
primaryClass={cs.CV}
}
@misc{TriDet,
title={Temporal Action Localization with Enhanced Instant Discriminability},
author={Dingfeng Shi and Qiong Cao and Yujie Zhong and Shan An and Jian Cheng and Haogang Zhu and Dacheng Tao},
year={2023},
eprint={2309.05590},
archivePrefix={arXiv},
primaryClass={cs.CV}
}
@misc{AdaTAD,
title={End-to-End Temporal Action Detection with 1B Parameters Across 1000 Frames},
author={Shuming Liu and Chen-Lin Zhang and Chen Zhao and Bernard Ghanem},
year={2023},
eprint={2311.17241},
archivePrefix={arXiv},
primaryClass={cs.CV}
}
@misc{Text4Vis,
title={Revisiting Classifier: Transferring Vision-Language Models for Video Recognition},
author={Wenhao Wu and Zhun Sun and Wanli Ouyang},
year={2023},
eprint={2207.01297},
archivePrefix={arXiv},
primaryClass={cs.CV}
}
@misc{TTM,
title={Token Turing Machines},
author={Michael S. Ryoo and Keerthana Gopalakrishnan and Kumara Kahatapitiya and Ted Xiao and Kanishka Rao and Austin Stone and Yao Lu and Julian Ibarz and Anurag Arnab},
year={2023},
eprint={2211.09119},
archivePrefix={arXiv},
primaryClass={cs.LG}
}
@misc{TokenLearner,
title={TokenLearner: What Can 8 Learned Tokens Do for Images and Videos?},
author={Michael S. Ryoo and AJ Piergiovanni and Anurag Arnab and Mostafa Dehghani and Anelia Angelova},
year={2022},
eprint={2106.11297},
archivePrefix={arXiv},
primaryClass={cs.CV}
}
@misc{InternVideo,
title={InternVideo: General Video Foundation Models via Generative and Discriminative Learning},
author={Yi Wang and Kunchang Li and Yizhuo Li and Yinan He and Bingkun Huang and Zhiyu Zhao and Hongjie Zhang and Jilan Xu and Yi Liu and Zun Wang and Sen Xing and Guo Chen and Junting Pan and Jiashuo Yu and Yali Wang and Limin Wang and Yu Qiao},
year={2022},
eprint={2212.03191},
archivePrefix={arXiv},
primaryClass={cs.CV}
}
@misc{TubeVit-H,
title={Rethinking Video ViTs: Sparse Video Tubes for Joint Image and Video Learning},
author={AJ Piergiovanni and Weicheng Kuo and Anelia Angelova},
year={2022},
eprint={2212.03229},
archivePrefix={arXiv},
primaryClass={cs.CV}
}
@inbook{DCGN,
title={Hierarchical Video Frame Sequence Representation with Deep Convolutional Graph Network},
isbn={9783030110185},
issn={1611-3349},
url={http://dx.doi.org/10.1007/978-3-030-11018-5_24},
doi={10.1007/978-3-030-11018-5_24},
booktitle={Computer Vision -- ECCV 2018 Workshops},
publisher={Springer International Publishing},
author={Mao, Feng and Wu, Xiang and Xue, Hui and Zhang, Rong},
year={2019},
pages={262--270}
}
@misc{MVD,
title={Masked Video Distillation: Rethinking Masked Feature Modeling for Self-supervised Video Representation Learning},
author={Rui Wang and Dongdong Chen and Zuxuan Wu and Yinpeng Chen and Xiyang Dai and Mengchen Liu and Lu Yuan and Yu-Gang Jiang},
year={2023},
eprint={2212.04500},
archivePrefix={arXiv},
primaryClass={cs.CV}
}
@misc{LART,
title={On the Benefits of 3D Pose and Tracking for Human Action Recognition},
author={Jathushan Rajasegaran and Georgios Pavlakos and Angjoo Kanazawa and Christoph Feichtenhofer and Jitendra Malik},
year={2023},
eprint={2304.01199},
archivePrefix={arXiv},
primaryClass={cs.CV}
}
@misc{UMT-L,
title={OmniVec: Learning robust representations with cross modal sharing},
author={Siddharth Srivastava and Gaurav Sharma},
year={2023},
eprint={2311.05709},
archivePrefix={arXiv},
primaryClass={cs.CV}
}
@misc{UniFormerV2-L,
title={UniFormerV2: Spatiotemporal Learning by Arming Image ViTs with Video UniFormer},
author={Kunchang Li and Yali Wang and Yinan He and Yizhuo Li and Yi Wang and Limin Wang and Yu Qiao},
year={2022},
eprint={2211.09552},
archivePrefix={arXiv},
primaryClass={cs.CV}
}
@misc{MS-TCT,
title={MS-TCT: Multi-Scale Temporal ConvTransformer for Action Detection},
author={Rui Dai and Srijan Das and Kumara Kahatapitiya and Michael S. Ryoo and Francois Bremond},
year={2021},
eprint={2112.03902},
archivePrefix={arXiv},
primaryClass={cs.CV}
}
@misc{PAAT,
title={Seeing the Pose in the Pixels: Learning Pose-Aware Representations in Vision Transformers},
author={Dominick Reilly and Aman Chadha and Srijan Das},
year={2023},
eprint={2306.09331},
archivePrefix={arXiv},
primaryClass={cs.CV}
}
@misc{Avion,
title={Training a Large Video Model on a Single Machine in a Day},
author={Yue Zhao and Philipp Krähenbühl},
year={2023},
eprint={2309.16669},
archivePrefix={arXiv},
primaryClass={cs.CV}
}