@@ -44,10 +44,11 @@ and supports:
conda install pytorch=1.11.0 torchvision=0.12.0 cudatoolkit=11.3 -c pytorch
```

- - Install pytorchvideo from main branch:
+ - Install pytorchvideo and transformers from main branch:

```bash
pip install git+https://github.com/facebookresearch/pytorchvideo.git
+ pip install git+https://github.com/huggingface/transformers.git
```

- Install `video-transformers`:
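Once the install steps above are complete (including the `video-transformers` install, whose command is elided by this hunk), a quick import check confirms the environment is wired up. An optional sanity check covering the packages installed above:

```python
# Optional sanity check: the core dependencies should all import cleanly.
import torch
import pytorchvideo
import transformers

import video_transformers

print(torch.__version__, transformers.__version__)
```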
@@ -83,7 +84,48 @@ val_root
...
```

- - Fine-tune CVT (from HuggingFace) + Transformer based video classifier:
+ - Fine-tune Timesformer (from HuggingFace) video classifier:
+
+ ```python
+ from torch.optim import AdamW
+ from video_transformers import VideoModel
+ from video_transformers.backbones.transformers import TransformersBackbone
+ from video_transformers.data import VideoDataModule
+ from video_transformers.heads import LinearHead
+ from video_transformers.trainer import trainer_factory
+ from video_transformers.utils.file import download_ucf6
+
+ backbone = TransformersBackbone("facebook/timesformer-base-finetuned-k400", num_unfrozen_stages=1)
+
+ download_ucf6("./")
+ datamodule = VideoDataModule(
+     train_root="ucf6/train",
+     val_root="ucf6/val",
+     batch_size=4,
+     num_workers=4,
+     num_timesteps=8,
+     preprocess_input_size=224,
+     preprocess_clip_duration=1,
+     preprocess_means=backbone.mean,
+     preprocess_stds=backbone.std,
+     preprocess_min_short_side=256,
+     preprocess_max_short_side=320,
+     preprocess_horizontal_flip_p=0.5,
+ )
+
+ head = LinearHead(hidden_size=backbone.num_features, num_classes=datamodule.num_classes)
+ model = VideoModel(backbone, head)
+
+ optimizer = AdamW(model.parameters(), lr=1e-4)
+
+ Trainer = trainer_factory("single_label_classification")
+ trainer = Trainer(datamodule, model, optimizer=optimizer, max_epochs=8)
+
+ trainer.fit()
+
+ ```
+
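Since `num_unfrozen_stages=1` leaves most of the backbone frozen, it can be worth confirming how much of the model will actually be trained before calling `trainer.fit()`. A minimal check on the `model` built above, using plain PyTorch rather than any video-transformers API:

```python
# Count trainable vs. total parameters after freezing backbone stages.
trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
total = sum(p.numel() for p in model.parameters())
print(f"Trainable parameters: {trainable:,} of {total:,}")
```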
+ - Fine-tune ConvNeXT (from HuggingFace) + Transformer-based video classifier:

```python
from torch.optim import AdamW
@@ -95,7 +137,7 @@ from video_transformers.necks import TransformerNeck
from video_transformers.trainer import trainer_factory
from video_transformers.utils.file import download_ucf6

- backbone = TimeDistributed(TransformersBackbone("microsoft/cvt-13", num_unfrozen_stages=0))
+ backbone = TimeDistributed(TransformersBackbone("facebook/convnext-small-224", num_unfrozen_stages=1))
neck = TransformerNeck(
    num_features=backbone.num_features,
    num_timesteps=8,
@@ -137,18 +179,18 @@ trainer.fit()

```

- - Fine-tune MobileViT (from Timm) + GRU based video classifier:
+ - Fine-tune ResNet18 (from HuggingFace) + GRU-based video classifier:

```python
from video_transformers import TimeDistributed, VideoModel
- from video_transformers.backbones.timm import TimmBackbone
+ from video_transformers.backbones.transformers import TransformersBackbone
from video_transformers.data import VideoDataModule
from video_transformers.heads import LinearHead
from video_transformers.necks import GRUNeck
from video_transformers.trainer import trainer_factory
from video_transformers.utils.file import download_ucf6

- backbone = TimeDistributed(TimmBackbone("mobilevitv2_100", num_unfrozen_stages=0))
+ backbone = TimeDistributed(TransformersBackbone("microsoft/resnet-18", num_unfrozen_stages=1))
neck = GRUNeck(num_features=backbone.num_features, hidden_size=128, num_layers=2, return_last=True)

download_ucf6("./")
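For intuition on the pattern above: `TimeDistributed` applies an image backbone to every frame independently, and the GRU (or Transformer) neck then aggregates the per-frame features over time. A rough self-contained sketch of the idea in plain PyTorch (a simplification with a hypothetical class name, assuming `(batch, time, channels, height, width)` input; the library's internals may differ):

```python
import torch
import torch.nn as nn

class TimeDistributedSketch(nn.Module):
    """Run a per-frame module over a clip by folding time into the batch dim."""

    def __init__(self, module: nn.Module):
        super().__init__()
        self.module = module

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        b, t = x.shape[:2]
        y = self.module(x.flatten(0, 1))   # (b*t, ...) -> per-frame features
        return y.unflatten(0, (b, t))      # back to (b, t, ...)

clip = torch.randn(2, 8, 3, 224, 224)                        # (batch, time, C, H, W)
per_frame = nn.Sequential(nn.Flatten(), nn.LazyLinear(64))   # stand-in for a real backbone
print(TimeDistributedSketch(per_frame)(clip).shape)          # torch.Size([2, 8, 64])
```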
@@ -188,7 +230,7 @@ from video_transformers import VideoModel
model = VideoModel.from_pretrained(model_name_or_path)

- model.predict(video_path="video.mp4")
+ model.predict(video_or_folder_path="video.mp4")
>> [{'filename': "video.mp4", 'predictions': {'class1': 0.98, 'class2': 0.02}}]
```
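The renamed `video_or_folder_path` argument implies that a directory of videos can be passed as well, with one entry per file in the returned list. A hypothetical usage along those lines (the folder path is illustrative, not from the docs):

```python
# Hypothetical: run prediction over a folder of videos, print the top class per file.
results = model.predict(video_or_folder_path="videos/")
for result in results:
    top = max(result["predictions"], key=result["predictions"].get)
    print(result["filename"], top, result["predictions"][top])
```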
@@ -277,3 +319,20 @@ from video_transformers import VideoModel
model = VideoModel.from_pretrained("runs/exp/checkpoint")
model.to_gradio(examples=['video.mp4'], export_dir="runs/exports/", export_filename="app.py")
```
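For context, the exported `app.py` presumably wraps `model.predict` in a Gradio interface roughly like the sketch below (illustrative only, not the file `to_gradio` actually generates):

```python
import gradio as gr
from video_transformers import VideoModel

model = VideoModel.from_pretrained("runs/exp/checkpoint")

def classify(video_path: str) -> dict:
    # model.predict returns [{'filename': ..., 'predictions': {label: prob, ...}}]
    return model.predict(video_or_folder_path=video_path)[0]["predictions"]

demo = gr.Interface(fn=classify, inputs=gr.Video(), outputs=gr.Label(), examples=["video.mp4"])
demo.launch()
```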
+
+
+ ## Contributing
+
+ Before opening a PR:
+
+ - Install required development packages:
+
+ ```bash
+ pip install -e ".[dev]"
+ ```
+
+ - Reformat with black and isort:
+
+ ```bash
+ python -m tests.run_code_style format
+ ```