
Commit 7fb4341 ("initial commit", 1 parent: bdc5d98)

16 files changed: +1883 -2 lines

Pipfile (+31 lines)

[[source]]
url = "https://pypi.org/simple"
verify_ssl = true
name = "pypi"

[packages]
av = "*"
pandas = "*"
torch = "==1.7.1"
torchvision = "==0.8.2"
hydra-core = "*"
tensorboard = "==2.3.0"
logzero = "*"
coloredlogs = "*"
hydra-colorlog = "*"
tqdm = "*"
scikit-video = "*"
hydra = "*"
fvcore = "*"

[dev-packages]
isort = "*"
ipdb = "*"
black = "*"
vulture = "*"

[requires]
python_version = "3.7"

[pipenv]
allow_prereleases = true

Pipfile.lock (+923 lines)

Generated file; contents not rendered.

README.md (+55 -2 lines)

Removed (the original two-line stub):

# Action-Recognition-CNN-LSTM
Action recognition tutorial using UCF-101 dataset.

Added:

# Action Recognition in Video

This repo serves as a playground where I investigate different approaches to the problem of action recognition in video.

I will mainly use the [UCF-101 dataset](https://www.crcv.ucf.edu/data/UCF101.php).

<p align="center">
    <img src="assets/crawling.gif" width="400"/>
</p>

## Setup

```
$ cd data/
$ bash download_ucf101.sh     # Downloads the UCF-101 dataset (~7.2 GB)
$ unrar x UCF101.rar          # Unrars the dataset
$ unzip ucfTrainTestlist.zip  # Unzips the train / test split
$ python3 extract_frames.py   # Extracts frames from the videos (~26.2 GB; go grab a coffee)
```

## ConvLSTM

The only approach investigated so far. It performs action recognition in video with a bi-directional LSTM operating on frame embeddings extracted by a ResNet-152 pre-trained on ImageNet.

The model is composed of two parts, sketched in the snippet below:
* A convolutional feature extractor (ResNet-152) which provides a latent representation of each video frame
* A bi-directional LSTM classifier which predicts the depicted activity from the sequence of latent representations
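
For orientation, here is a minimal PyTorch sketch of that architecture. The class and attribute names are placeholders rather than the repo's actual module; it assumes torchvision's ImageNet-pretrained ResNet-152 and classifies from the final LSTM step (the attention option mentioned in the configs is omitted).

```
import torch.nn as nn
from torchvision import models


class ConvLSTM(nn.Module):
    # Hypothetical sketch; dimensions mirror configs/default.yaml
    def __init__(self, num_classes=101, latent_dim=512, hidden_dim=1024,
                 lstm_layers=1, bidirectional=True):
        super().__init__()
        resnet = models.resnet152(pretrained=True)
        # Drop the final fc layer; keep the pooled 2048-d features
        self.encoder = nn.Sequential(*list(resnet.children())[:-1])
        self.embed = nn.Linear(resnet.fc.in_features, latent_dim)
        self.lstm = nn.LSTM(latent_dim, hidden_dim, lstm_layers,
                            batch_first=True, bidirectional=bidirectional)
        self.classifier = nn.Linear(hidden_dim * (2 if bidirectional else 1),
                                    num_classes)

    def forward(self, x):  # x: (batch, seq_len, 3, H, W)
        b, t = x.shape[:2]
        feats = self.encoder(x.reshape(b * t, *x.shape[2:]))  # (b*t, 2048, 1, 1)
        feats = self.embed(feats.flatten(1)).view(b, t, -1)   # (b, t, latent_dim)
        out, _ = self.lstm(feats)                             # (b, t, dirs*hidden)
        return self.classifier(out[:, -1])                    # class logits
```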

I have made a trained model available [here](https://drive.google.com/open?id=1GlpN0m9uLbI9dg1ARbW9hDEf-VWe4Asl).

### Train

```
$ python3 train.py --dataset_path data/UCF-101-frames/ \
                   --split_path data/ucfTrainTestlist \
                   --num_epochs 200 \
                   --sequence_length 40 \
                   --img_dim 112 \
                   --latent_dim 512
```

### Test on Video

```
$ python3 test_on_video.py --video_path data/UCF-101/SoccerPenalty/v_SoccerPenalty_g01_c01.avi \
                           --checkpoint_model model_checkpoints/ConvLSTM_150.pth
```

<p align="center">
    <img src="assets/penalty.gif" width="400"/>
</p>

### Results

The model reaches a classification accuracy of **91.27%** on a randomly sampled test set composed of 20% of the video sequences in UCF-101. I will re-train the model on the official train / test splits and post the results as soon as I have time.
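
For concreteness, the random split presumably looks something like the following sketch, splitting at the level of extracted frame folders (an assumption; the repo's actual split code may differ):

```
import glob
import random

# Assumed 80/20 random split over extracted sequences, not the repo's exact code
random.seed(0)
sequences = glob.glob("data/UCF-101-frames/*/*")
random.shuffle(sequences)
split = int(0.8 * len(sequences))
train_seqs, test_seqs = sequences[:split], sequences[split:]
```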

configs/debug/train_debug.yaml (+9 lines)

# @package _group_

# train
train:
  num_epochs: 2
  batch_size: 1
  sequence_length: 2
  img_dim: 112
  num_workers: 0

configs/default.yaml (+55 lines)

defaults:
  - hydra/job_logging: colorlog
  - hydra/hydra_logging: colorlog

hydra:
  run:
    dir: ./outputs
  output_subdir: ./configs/${now:%Y-%m-%d}/${now:%H-%M-%S}
  job:
    name: log_${now:%Y-%m-%d}_${now:%H-%M-%S}

# datasets
dataset:
  root: '/mnt/nfs/kuroyanagi/clones/Action-Recognition-CNN-LSTM/data'
  name: 'UCF-101'
  frames: 'UCF-101-frames'
  split_file: 'ucfTrainTestlist'
  split_number: 1

# train
train:
  num_epochs: 100
  batch_size: 16
  sequence_length: 40
  image_height: 224
  image_width: 224
  channels: 3
  latent_dim: 512
  lstm_layers: 1
  hidden_dim: 1024
  bidirectional: True
  attention: True
  num_workers: 4
  checkpoint_model: ''
  checkpoint_interval: 5
  checkpoints_dir: 'checkpoints'
  tensorboard_dir: 'logs'
  resume: True

# test or test_on_video
test:
  num_classes: 101
  batch_size: 16
  sequence_length: 40
  image_height: 224
  image_width: 224
  channels: 3
  latent_dim: 512
  lstm_layers: 1
  hidden_dim: 1024
  bidirectional: True
  attention: True
  num_workers: 4
  checkpoint_model: '/mnt/nfs/kuroyanagi/clones/Action-Recognition-CNN-LSTM/experiments/exp01/model_checkpoints/ConvLSTM_45.pth'
  video_name: 'BabyCrawling/v_BabyCrawling_g01_c01.avi'
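
Given the colorlog defaults and the ${now:...} interpolations above, the training entry point presumably loads this file through hydra.main. A minimal sketch (the function body and the commented-out call are assumptions, not the repo's code):

```
import hydra
from omegaconf import DictConfig, OmegaConf


@hydra.main(config_path="configs", config_name="default")
def main(cfg: DictConfig) -> None:
    # Print the composed config (defaults plus any command-line overrides)
    print(OmegaConf.to_yaml(cfg))
    # train_model(cfg)  # hypothetical call into the repo's training code


if __name__ == "__main__":
    main()
```

Overrides then compose on the command line, e.g. `python3 train.py train.batch_size=8` (the exact script name is whatever consumes this config).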

configs/experiments/test_exp01.yaml (+5 lines)

# @package _group_

# test
test:
  checkpoint_model: '/mnt/nfs/kuroyanagi/clones/Action-Recognition-CNN-LSTM/experiments/exp01/model_checkpoints/ConvLSTM_45.pth'

configs/experiments/(file name not rendered in this view) (+5 lines)

# @package _group_

# input video
test:
  video_name: 'ApplyEyeMakeup/v_ApplyEyeMakeup_g25_c07.avi'

configs/experiments/train_exp01.yaml (+9 lines)

# @package _group_

# train
train:
  batch_size: 8
  num_epochs: 10
  sequence_length: 20
  checkpoint_interval: 5
  num_workers: 4
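
These experiment files carry only the keys they override; everything else is inherited from configs/default.yaml. Hydra performs the composition itself according to the `# @package _group_` directive, but the effect can be illustrated with a plain OmegaConf merge (a sketch, assuming it is run from the repo root; a direct merge works here because both files nest their keys under `train:`):

```
from omegaconf import OmegaConf

base = OmegaConf.load("configs/default.yaml")
exp = OmegaConf.load("configs/experiments/train_exp01.yaml")
cfg = OmegaConf.merge(base, exp)

print(cfg.train.batch_size)  # 8   (overridden by the experiment)
print(cfg.train.latent_dim)  # 512 (inherited from default.yaml)
```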

data/check_extract_frames.py (+16 lines)

import argparse
import glob
import os

parser = argparse.ArgumentParser()
parser.add_argument("--dataset_frames_path", type=str, default="UCF-101-frames",
                    help="Directory containing the extracted frame folders")
opt = parser.parse_args()

# Each extracted video is a directory of frames: <frames>/<class>/<sequence>/
video_frame_paths = glob.glob(os.path.join(opt.dataset_frames_path, "*", "*"))

# Report sequences for which frame extraction produced no images
for i, video_frame_path in enumerate(video_frame_paths):
    num_frames = len(glob.glob(os.path.join(video_frame_path, "*")))
    if num_frames == 0:
        print(i, video_frame_path)

# Example output: 49 UCF-101-frames/PlayingGuitar/v_PlayingGuitar_g21_c02

data/download_ucf101.sh (+8 lines)

#!/bin/bash

# Download the UCF-101 dataset and the train / test splits
wget --no-check-certificate https://www.crcv.ucf.edu/data/UCF101/UCF101.rar
wget --no-check-certificate https://www.crcv.ucf.edu/data/UCF101/UCF101TrainTestSplits-RecognitionTask.zip

# Extract the archives
unrar x UCF101.rar
unzip UCF101TrainTestSplits-RecognitionTask.zip

data/extract_frames.py (+51 lines)

"""
Helper script for extracting frames from the UCF-101 dataset
"""

import argparse
import datetime
import glob
import os
import time

import av
import tqdm


def extract_frames(video_path):
    """Yield the frames of a video as PIL images."""
    video = av.open(video_path)
    for frame in video.decode(video=0):
        yield frame.to_image()


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--dataset_path", type=str, default="UCF-101", help="Path to UCF-101 dataset")
    opt = parser.parse_args()
    print(opt)

    time_left = 0
    prev_time = time.time()
    video_paths = glob.glob(os.path.join(opt.dataset_path, "*", "*.avi"))
    for i, video_path in enumerate(video_paths):
        sequence_type, sequence_name = video_path.split(".avi")[0].split("/")[-2:]
        sequence_path = os.path.join(f"{opt.dataset_path}-frames", sequence_type, sequence_name)

        # Skip videos whose frames have already been extracted
        if os.path.exists(sequence_path):
            continue

        os.makedirs(sequence_path, exist_ok=True)

        # Extract and save frames
        for j, frame in enumerate(
            tqdm.tqdm(
                extract_frames(video_path),
                desc=f"[{i}/{len(video_paths)}] {sequence_name} : ETA {time_left}",
            )
        ):
            frame.save(os.path.join(sequence_path, f"{j}.jpg"))

        # Estimate the time left from the duration of the last video
        videos_left = len(video_paths) - (i + 1)
        time_left = datetime.timedelta(seconds=videos_left * (time.time() - prev_time))
        prev_time = time.time()
