
Commit 8b85f91

refactor video data loaders, fix some bugs (#22)
* ignore mp4, avi, zip files
* update dependency versions
* ignore onnx files
* increase package version
* fix a typo
* refactor dataset loading
* update code snippets in readme
* reformat with isort
* update workflows
* ignore export and examples folders
* clean code
1 parent 84e8e7d commit 8b85f91


13 files changed: +405 -128 lines changed

.github/workflows/ci.yml (+3)

@@ -64,6 +64,9 @@ jobs:
       if: matrix.operating-system == 'macos-latest'
       run: pip install torch==${{ matrix.torch-version }}

+    - name: Install Pytorchvideo from main branch
+      run: pip install git+https://github.com/facebookresearch/pytorchvideo.git
+
     - name: Lint with flake8, black and isort
       run: |
         pip install .[dev]

.github/workflows/package_testing.yml (+3)

@@ -63,6 +63,9 @@ jobs:
       if: matrix.operating-system == 'macos-latest'
      run: pip install torch==${{ matrix.torch-version }}

+    - name: Install Pytorchvideo from main branch
+      run: pip install git+https://github.com/facebookresearch/pytorchvideo.git
+
     - name: Install latest video-transformers package
       run: >
         pip install --upgrade --force-reinstall video-transformers[test]
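Both workflows now install pytorchvideo from its main branch instead of the PyPI release (the `pytorchvideo` pin is also dropped from requirements.txt below). For reference, a quick standard-library check of which version actually landed in the CI environment; this snippet is illustrative and not part of the commit:

```python
from importlib.metadata import version

# Report the installed pytorchvideo version; a git+https install shows
# whatever version string the package's main branch declares.
print(version("pytorchvideo"))
```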

.gitignore (+7 -1)

@@ -131,4 +131,10 @@ dmypy.json
 # extra
 .vscode
 .neptune
-runs/
+runs/
+*.mp4
+*.avi
+*.zip
+*.onnx
+exports/
+examples/

README.md (+20 -12)

@@ -44,6 +44,12 @@ and supports:
 conda install pytorch=1.11.0 torchvision=0.12.0 cudatoolkit=11.3 -c pytorch
 ```

+- Install pytorchvideo from main branch:
+
+```bash
+pip install git+https://github.com/facebookresearch/pytorchvideo.git
+```
+
 - Install `video-transformers`:

 ```bash
@@ -87,6 +93,7 @@ from video_transformers.data import VideoDataModule
 from video_transformers.heads import LinearHead
 from video_transformers.necks import TransformerNeck
 from video_transformers.trainer import trainer_factory
+from video_transformers.utils.file import download_ucf6

 backbone = TimeDistributed(TransformersBackbone("microsoft/cvt-13", num_unfrozen_stages=0))
 neck = TransformerNeck(
@@ -96,28 +103,28 @@ neck = TransformerNeck(
     transformer_enc_num_layers=2,
     dropout_p=0.1,
 )
-optimizer = AdamW(model.parameters(), lr=1e-4)

+download_ucf6("./")
 datamodule = VideoDataModule(
-    train_root=".../ucf6/train",
-    val_root=".../ucf6/val",
-    clip_duration=2,
-    train_dataset_multiplier=1,
+    train_root="ucf6/train",
+    val_root="ucf6/val",
     batch_size=4,
     num_workers=4,
     num_timesteps=8,
     preprocess_input_size=224,
     preprocess_clip_duration=1,
     preprocess_means=backbone.mean,
     preprocess_stds=backbone.std,
-    preprocess_min_short_side_scale=256,
-    preprocess_max_short_side_scale=320,
+    preprocess_min_short_side=256,
+    preprocess_max_short_side=320,
     preprocess_horizontal_flip_p=0.5,
 )

 head = LinearHead(hidden_size=neck.num_features, num_classes=datamodule.num_classes)
 model = VideoModel(backbone, head, neck)

+optimizer = AdamW(model.parameters(), lr=1e-4)
+
 Trainer = trainer_factory("single_label_classification")
 trainer = Trainer(
     datamodule,
@@ -139,23 +146,24 @@ from video_transformers.data import VideoDataModule
 from video_transformers.heads import LinearHead
 from video_transformers.necks import GRUNeck
 from video_transformers.trainer import trainer_factory
+from video_transformers.utils.file import download_ucf6

 backbone = TimeDistributed(TimmBackbone("mobilevitv2_100", num_unfrozen_stages=0))
 neck = GRUNeck(num_features=backbone.num_features, hidden_size=128, num_layers=2, return_last=True)

+download_ucf6("./")
 datamodule = VideoDataModule(
-    train_root=".../ucf6/train",
-    val_root=".../ucf6/val",
-    train_dataset_multiplier=1,
+    train_root="ucf6/train",
+    val_root="ucf6/val",
     batch_size=4,
     num_workers=4,
     num_timesteps=8,
     preprocess_input_size=224,
     preprocess_clip_duration=1,
     preprocess_means=backbone.mean,
     preprocess_stds=backbone.std,
-    preprocess_min_short_side_scale=256,
-    preprocess_max_short_side_scale=320,
+    preprocess_min_short_side=256,
+    preprocess_max_short_side=320,
     preprocess_horizontal_flip_p=0.5,
 )
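The README snippets now call `download_ucf6` from `video_transformers.utils.file`, which (judging by the name and the resulting `ucf6/train` and `ucf6/val` paths) fetches a small 6-class UCF subset into the working directory; that is why the `train_root`/`val_root` arguments become plain relative paths. The helper's implementation is not part of this diff; a minimal sketch of such a download-and-extract helper, with a placeholder URL, might look like:

```python
import zipfile
from pathlib import Path
from urllib.request import urlretrieve

# Placeholder URL: the real location is defined inside
# video_transformers/utils/file.py and is not shown in this commit.
UCF6_URL = "https://example.com/ucf6.zip"

def download_ucf6(download_dir: str) -> str:
    """Download and extract the 6-class UCF subset into download_dir."""
    root = Path(download_dir)
    root.mkdir(parents=True, exist_ok=True)
    archive = root / "ucf6.zip"
    if not archive.exists():
        urlretrieve(UCF6_URL, archive)  # fetch the archive only once
    with zipfile.ZipFile(archive) as zf:
        zf.extractall(root)  # yields the ucf6/train and ucf6/val folders
    return str(root / "ucf6")
```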

requirements.txt (+5 -6)

@@ -1,13 +1,12 @@
-accelerate>=0.12.0
-evaluate>=0.2.2
-transformers>=4.23.1
-timm>=0.6.7
+accelerate>=0.14.0,<0.15.0
+evaluate>=0.3.0,<0.4.0
+transformers>=4.24.0,<4.25.0
+timm>=0.6.12,<0.7.0
 click==8.0.4
-pytorchvideo
 balanced-loss
 scikit-learn
 tensorboard
 opencv-python
 gradio>=3.1.6
-huggingface-hub>=0.10.1
+huggingface-hub>=0.11.0,<0.12.0
 importlib-metadata>=1.1.0,<4.3;python_version<'3.8'

video_transformers/__init__.py (+1 -1)

@@ -3,4 +3,4 @@
 from video_transformers.auto.neck import AutoNeck
 from video_transformers.modeling import TimeDistributed, VideoModel

-__version__ = "0.0.6"
+__version__ = "0.0.7"

video_transformers/data.py (+20 -33)

@@ -14,6 +14,7 @@
 from torch.utils.data import DataLoader
 from torchvision.transforms import CenterCrop, Compose, Lambda, RandomCrop, RandomHorizontalFlip

+from video_transformers.pytorchvideo_wrapper.data.labeled_video_dataset import labeled_video_dataset
 from video_transformers.pytorchvideo_wrapper.data.labeled_video_paths import LabeledVideoDataset, LabeledVideoPaths
 from video_transformers.utils.extra import class_to_config

@@ -53,8 +54,8 @@ def __init__(
             input_size: model input size
             means: mean of the video clip
             stds: standard deviation of the video clip
-            min_short_side_scale: minimum short side of the video clip
-            max_short_side_scale: maximum short side of the video clip
+            min_short_side: minimum short side of the video clip
+            max_short_side: maximum short side of the video clip
             horizontal_flip_p: probability of horizontal flip
             clip_duration: duration of each video clip

@@ -77,10 +78,13 @@ def __init__(
         self.clip_duration = clip_duration

         # Transforms applied to train dataset.
+        def normalize_func(x):
+            return x / 255.0
+
         self.train_video_transform = Compose(
             [
                 UniformTemporalSubsample(self.num_timesteps),
-                Lambda(lambda x: x / 255.0),
+                Lambda(normalize_func),
                 Normalize(self.means, self.stds),
                 RandomShortSideScale(
                     min_size=self.min_short_side,
@@ -97,7 +101,7 @@ def __init__(
         self.val_video_transform = Compose(
             [
                 UniformTemporalSubsample(self.num_timesteps),
-                Lambda(lambda x: x / 255.0),
+                Lambda(normalize_func),
                 Normalize(self.means, self.stds),
                 ShortSideScale(self.min_short_side),
                 CenterCrop(self.input_size),
@@ -112,7 +116,6 @@ def __init__(
         train_root: str,
         val_root: str,
         test_root: str = None,
-        train_dataset_multiplier: int = 1,
         batch_size: int = 4,
         num_workers: int = 4,
         num_timesteps: int = 8,
@@ -158,8 +161,6 @@ def __init__(
                 Path to kinetics formatted train folder.
             clip_duration: float
                 Duration of sampled clip for each video.
-            train_dataset_multiplier: int
-                Multiplier for the number of random training data samples.
             batch_size: int
                 Batch size for training and validation.
             num_workers: int
@@ -196,7 +197,6 @@ def __init__(
         self.train_root = train_root
         self.val_root = val_root
         self.test_root = test_root if test_root is not None else val_root
-        self.train_dataset_multiplier = train_dataset_multiplier
         self.labels = None

         self.train_dataloader = self._get_train_dataloader()
@@ -212,18 +212,13 @@ def config(self) -> Dict:
         return class_to_config(self, ignored_attrs=("config", "train_root", "val_root", "test_root"))

     def _get_train_dataloader(self):
-        labeled_video_paths = LabeledVideoPaths.from_path(self.train_root)
-        labeled_video_paths.path_prefix = ""
-        video_sampler = torch.utils.data.RandomSampler
         clip_sampler = pytorchvideo.data.make_clip_sampler("random", self.preprocessor_config["clip_duration"])
-        dataset = LabeledVideoDataset(
-            labeled_video_paths,
-            clip_sampler,
-            video_sampler,
-            self.preprocessor.train_transform,
+        dataset = labeled_video_dataset(
+            data_path=self.train_root,
+            clip_sampler=clip_sampler,
+            transform=self.preprocessor.train_transform,
             decode_audio=False,
             decoder="pyav",
-            dataset_multiplier=self.train_dataset_multiplier,
         )
         self.labels = dataset.labels
         return DataLoader(
@@ -234,18 +229,14 @@ def _get_train_dataloader(self):
         )

     def _get_val_dataloader(self):
-        labeled_video_paths = LabeledVideoPaths.from_path(self.val_root)
-        labeled_video_paths.path_prefix = ""
-        video_sampler = torch.utils.data.SequentialSampler
         clip_sampler = pytorchvideo.data.clip_sampling.UniformClipSamplerTruncateFromStart(
             clip_duration=self.preprocessor_config["clip_duration"],
             truncation_duration=self.preprocessor_config["clip_duration"],
         )
-        dataset = LabeledVideoDataset(
-            labeled_video_paths,
-            clip_sampler,
-            video_sampler,
-            self.preprocessor.val_transform,
+        dataset = labeled_video_dataset(
+            data_path=self.val_root,
+            clip_sampler=clip_sampler,
+            transform=self.preprocessor.val_transform,
             decode_audio=False,
             decoder="pyav",
         )
@@ -257,18 +248,14 @@ def _get_val_dataloader(self):
         )

     def _get_test_dataloader(self):
-        labeled_video_paths = LabeledVideoPaths.from_path(self.test_root)
-        labeled_video_paths.path_prefix = ""
-        video_sampler = torch.utils.data.SequentialSampler
         clip_sampler = pytorchvideo.data.clip_sampling.UniformClipSamplerTruncateFromStart(
             clip_duration=self.preprocessor_config["clip_duration"],
             truncation_duration=self.preprocessor_config["clip_duration"],
         )
-        dataset = LabeledVideoDataset(
-            labeled_video_paths,
-            clip_sampler,
-            video_sampler,
-            self.preprocessor.val_transform,
+        dataset = labeled_video_dataset(
+            data_path=self.test_root,
+            clip_sampler=clip_sampler,
+            transform=self.preprocessor.val_transform,
             decode_audio=False,
             decoder="pyav",
         )
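The loader refactor replaces the hand-built `LabeledVideoPaths` + `LabeledVideoDataset` wiring (and the `train_dataset_multiplier` option) with a `labeled_video_dataset` factory imported from the project's pytorchvideo wrapper, and the normalization lambda becomes a named `normalize_func`. The wrapper appears to mirror the factory of the same name in upstream pytorchvideo; for orientation, a minimal standalone sketch against the upstream API (not the wrapper; the path and clip duration are illustrative):

```python
import pytorchvideo.data
from torch.utils.data import DataLoader

# Sample random 1-second clips from a Kinetics-style folder tree
# (one subfolder per class), video stream only, decoded with PyAV.
clip_sampler = pytorchvideo.data.make_clip_sampler("random", 1.0)
dataset = pytorchvideo.data.labeled_video_dataset(
    data_path="ucf6/train",  # illustrative path from the README example
    clip_sampler=clip_sampler,
    decode_audio=False,
    decoder="pyav",
)

# The returned dataset is iterable, so no sampler is passed to the loader.
loader = DataLoader(dataset, batch_size=4)
batch = next(iter(loader))
video = batch["video"]  # (batch, channels, time, height, width) tensor
```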

video_transformers/pytorchvideo_wrapper/__init__.py

Whitespace-only changes.

video_transformers/pytorchvideo_wrapper/data/__init__.py

Whitespace-only changes.
