5
5
from pydantic import PositiveInt
6
6
7
7
from data_juicer .utils .constant import Fields
8
- from data_juicer .utils .file_utils import create_directory_if_not_exists
8
+ from data_juicer .utils .file_utils import dict_to_hash
9
9
from data_juicer .utils .mm_utils import (
10
10
SpecialTokens , close_video , extract_key_frames ,
11
11
extract_key_frames_by_seconds , extract_video_frames_uniformly ,
@@ -24,17 +24,11 @@ class VideoExtractFramesMapper(Mapper):
24
24
"""Mapper to extract frames from video files according to specified methods.
25
25
Extracted Frames Data Format:
26
26
The data format for the extracted frames is a dictionary mapping
27
- video keys to lists of file paths where the extracted frames are saved.
28
- The dictionary follows the structure:
27
+ each video key to the directory where its extracted
28
+ frames are saved. The dictionary follows the structure:
29
29
{
30
- "video_key_1": [
31
- "/${frame_dir}/video_key_1_filename/frame_1.jpg",
32
- "/${frame_dir}/video_key_1_filename/frame_2.jpg",
33
- ...],
34
- "video_key_2": [
35
- "/${frame_dir}/video_key_2_filename/frame_1.jpg",
36
- "/${frame_dir}/video_key_2_filename/frame_2.jpg",
37
- ...],
30
+ "video_key_1": "/${frame_dir}/video_key_1_filename/",
31
+ "video_key_2": "/${frame_dir}/video_key_2_filename/",
38
32
...
39
33
}
40
34
"""
@@ -99,11 +93,13 @@ def _get_default_frame_dir(self, original_filepath):
99
93
dir_token = f'/{ Fields .multimodal_data_output_dir } /'
100
94
if dir_token in original_dir :
101
95
original_dir = original_dir .split (dir_token )[0 ]
102
- new_dir = os .path .join (
96
+ saved_dir = os .path .join (
103
97
original_dir , f'{ Fields .multimodal_data_output_dir } /{ OP_NAME } ' )
104
- create_directory_if_not_exists (new_dir )
105
- return osp .join (new_dir ,
106
- osp .splitext (osp .basename (original_filepath ))[0 ])
98
+ original_filename = osp .splitext (osp .basename (original_filepath ))[0 ]
99
+ hash_val = dict_to_hash (self ._init_parameters )
100
+
101
+ return osp .join (saved_dir ,
102
+ f'{ original_filename } __dj_hash_#{ hash_val } #' )
107
103
108
104
def process_single (self , sample , context = False ):
109
105
# check if it's generated already
0 commit comments