Cathy0908
diff --git a/‎README.md
Lines changed: 16 additions & 0 deletions b/‎README.md
Lines changed: 16 additions & 0 deletions
diff --git a/‎README_ZH.md
Lines changed: 15 additions & 0 deletions b/‎README_ZH.md
Lines changed: 15 additions & 0 deletions
diff --git a/‎configs/config_all.yaml
Lines changed: 5 additions & 0 deletions b/‎configs/config_all.yaml
Lines changed: 5 additions & 0 deletions
diff --git a/‎data_juicer/core/ray_data.py
Lines changed: 45 additions & 29 deletions b/‎data_juicer/core/ray_data.py
Lines changed: 45 additions & 29 deletions
diff --git a/‎data_juicer/ops/base_op.py
Lines changed: 13 additions & 2 deletions b/‎data_juicer/ops/base_op.py
Lines changed: 13 additions & 2 deletions
diff --git a/‎data_juicer/ops/filter/flagged_words_filter.py
Lines changed: 56 additions & 48 deletions b/‎data_juicer/ops/filter/flagged_words_filter.py
Lines changed: 56 additions & 48 deletions
@@ -197,6 +197,22 @@ The dependency options are listed below:
 | `.[tools]`       | Install dependencies for dedicated tools, such as quality classifiers.                       |
 | `.[sandbox]`     | Install all dependencies for sandbox.                                                        |
 
+- Install dependencies for specific OPs
+
+As the number of OPs grows, the dependencies of all OPs become very heavy. Instead of using the command `pip install -v -e .[sci]` to install all dependencies,
+we provide two alternative, lighter options:
+
+  - Automatic Minimal Dependency Installation: During the execution of Data-Juicer, the minimal dependencies will be installed automatically. This allows for immediate execution, but may lead to dependency conflicts.
+
+  - Manual Minimal Dependency Installation: To manually install minimal dependencies tailored to a specific execution configuration, run the following command:
+    ```shell
+    # only for installation from source
+    python tools/dj_install.py --config path_to_your_data-juicer_config_file
+
+    # use command line tool
+    dj-install --config path_to_your_data-juicer_config_file
+    ```
+
 ### Using pip
 
 - Run the following command to install the latest released `data_juicer` using `pip`:
 
@@ -178,6 +178,21 @@ pip install -v -e .[tools] # 安装部分工具库的依赖
 | `.[tools]`       | 安装专用工具库（如质量分类器）所需的依赖项        |
 | `.[sandbox]`     | 安装沙盒实验室的基础依赖                 |
 
+* 只安装部分算子依赖
+
+随着OP数量的增长，所有OP的依赖变得很重。为此，我们提供了两个替代的、更轻量的选项，作为使用命令`pip install -v -e .[sci]`安装所有依赖的替代：
+
+  * 自动最小依赖安装：在执行Data-Juicer的过程中，将自动安装最小依赖。也就是说你可以直接执行，但这种方式可能会导致一些依赖冲突。
+
+  * 手动最小依赖安装：可以通过如下指令手动安装适合特定执行配置的最小依赖：
+    ```shell
+    # 适用于从源码安装
+    python tools/dj_install.py --config path_to_your_data-juicer_config_file
+    
+    # 使用命令行工具
+    dj-install --config path_to_your_data-juicer_config_file
+    ```
+
 ### 使用 pip 安装
 
 * 运行以下命令用 `pip` 安装 `data_juicer` 的最新发布版本：
 
@@ -341,6 +341,11 @@ process:
       horizontal_flip: false                                  # flip frame image horizontally (left to right).
       vertical_flip: false                                    # flip frame image vertically (top to bottom).
       mem_required: '20GB'                                    # This operation (Op) utilizes deep neural network models that consume a significant amount of memory for computation, hence the system's available memory might constrain the maximum number of processes that can be launched
+  - video_extract_frames_mapper:                            # extract frames from video files according to specified methods
+      frame_sampling_method: 'all_keyframes'                  # sampling method of extracting frame images from the videos. Should be one of ["all_keyframes", "uniform"]. The former one extracts all key frames and the latter one extracts a specified number of frames uniformly from the video. Default: "all_keyframes".
+      frame_num: 3                                            # the number of frames to be extracted uniformly from the video. Only works when frame_sampling_method is "uniform". If it's 1, only the middle frame will be extracted. If it's 2, only the first and the last frames will be extracted. If it's larger than 2, in addition to the first and the last frames, other frames will be extracted uniformly within the video duration.
+      duration: 0                                             # The duration of each segment in seconds. If 0, frames are extracted from the entire video. If duration > 0, the video is segmented into multiple segments based on duration, and frames are extracted from each segment.
+      frame_dir: None                                         # Output directory to save extracted frames. If None, a default directory based on the video file path is used.
   - video_face_blur_mapper:                                 # blur faces detected in videos
       cv_classifier: ''                                       # OpenCV classifier path for face detection. By default, we will use 'haarcascade_frontalface_alt.xml'.
       blur_type: 'gaussian'                                   # type of blur kernel, including ['mean', 'box', 'gaussian']
 
@@ -1,4 +1,5 @@
 import os
+from functools import partial
 
 import pyarrow as pa
 from loguru import logger
@@ -13,28 +14,26 @@
 rd = LazyLoader('rd', 'ray.data')
 
 
-def is_valid_path(item, dataset_dir):
-    full_path = os.path.abspath(os.path.join(dataset_dir, item))
-    return os.path.exists(full_path)
+def get_abs_path(path, dataset_dir):
+    full_path = os.path.abspath(os.path.join(dataset_dir, path))
+    if os.path.exists(full_path):
+        return full_path
+    else:
+        return path
 
 
-def convert_to_absolute_paths(dict_with_paths, dataset_dir, path_keys):
+def convert_to_absolute_paths(samples, dataset_dir, path_keys):
+    samples = samples.to_pydict()
     for key in path_keys:
-        if key not in dict_with_paths:
-            continue
-        if isinstance(dict_with_paths[key], list):
-            dict_with_paths[key] = [
-                os.path.abspath(os.path.join(dataset_dir, item))
-                if isinstance(item, str) and is_valid_path(dataset_dir, item)
-                else item for item in dict_with_paths[key]
-            ]
-        elif isinstance(dict_with_paths[key], str):
-            dict_with_paths[key] = os.path.abspath(
-                os.path.join(dataset_dir,
-                             dict_with_paths[key])) if is_valid_path(
-                                 dict_with_paths[key],
-                                 dataset_dir) else dict_with_paths[key]
-    return dict_with_paths
+        for idx in range(len(samples[key])):
+            paths = samples[key][idx]
+            if isinstance(paths, str):
+                samples[key][idx] = get_abs_path(paths, dataset_dir)
+            elif isinstance(paths, list):
+                samples[key][idx] = [
+                    get_abs_path(item, dataset_dir) for item in paths
+                ]
+    return pa.Table.from_pydict(samples)
 
 
 # TODO: check path for nestdataset
@@ -43,22 +42,26 @@ def set_dataset_to_absolute_path(dataset, dataset_path, cfg):
     Set all the path in input data to absolute path.
     Checks dataset_dir and project_dir for valid paths.
     """
-    if not (cfg.video_key in dataset.columns() or cfg.image_key
-            in dataset.columns() or cfg.audio_key in dataset.columns()):
-        return dataset
-    dataset_dir = os.path.dirname(dataset_path)
-    dataset = dataset.map(lambda item: convert_to_absolute_paths(
-        item, dataset_dir, [cfg.video_key, cfg.image_key, cfg.audio_key]))
-    logger.info(f"transfer {dataset.count()} sample's paths")
+    path_keys = []
+    columns = dataset.columns()
+    for key in [cfg.video_key, cfg.image_key, cfg.audio_key]:
+        if key in columns:
+            path_keys.append(key)
+    if len(path_keys) > 0:
+        dataset_dir = os.path.dirname(dataset_path)
+        dataset = dataset.map_batches(partial(convert_to_absolute_paths,
+                                              dataset_dir=dataset_dir,
+                                              path_keys=path_keys),
+                                      batch_format='pyarrow',
+                                      zero_copy_batch=True)
     return dataset
 
 
 def preprocess_dataset(dataset: rd.Dataset, dataset_path, cfg) -> rd.Dataset:
+    columns = dataset.columns()
     if dataset_path:
         dataset = set_dataset_to_absolute_path(dataset, dataset_path, cfg)
-    columns = dataset.columns()
     if Fields.stats not in columns:
-        logger.info(f'columns {columns}')
 
         def process_batch_arrow(table: pa.Table) -> pa.Table:
             new_column_data = [{} for _ in range(len(table))]
@@ -77,6 +80,11 @@ def get_num_gpus(op, op_proc):
     return 1.0 / proc_per_gpu
 
 
+def filter_batch(batch, filter_func):
+    mask = pa.array(filter_func(batch.to_pydict()))
+    return batch.filter(mask)
+
+
 class RayDataset(DJDataset):
 
     def __init__(self,
@@ -162,7 +170,15 @@ def _run_single_op(self, op):
                 if op.stats_export_path is not None:
                     self.data.write_json(op.stats_export_path,
                                          force_ascii=False)
-                self.data = self.data.filter(op.process)
+                if op.is_batched_op():
+                    self.data = self.data.map_batches(partial(
+                        filter_batch, filter_func=op.process),
+                                                      batch_format='pyarrow',
+                                                      batch_size=batch_size,
+                                                      num_gpus=num_gpus,
+                                                      zero_copy_batch=True)
+                else:
+                    self.data = self.data.filter(op.process)
             else:
                 logger.error(
                     'Ray executor only support Filter and Mapper OPs for now')
 
@@ -267,11 +267,22 @@ def process_batched(self, samples, *args, **kwargs):
         keys = samples.keys()
         first_key = next(iter(keys))
         num_samples = len(samples[first_key])
+
+        new_keys = {}
         for i in range(num_samples):
             this_sample = {key: samples[key][i] for key in keys}
             res_sample = self.process_single(this_sample, *args, **kwargs)
-            for key in keys:
-                samples[key][i] = res_sample[key]
+            res_keys = res_sample.keys()
+            for key in res_keys:
+                if key not in keys:
+                    if key not in new_keys:
+                        new_keys.update({key: []})
+                    new_keys[key].append(res_sample[key])
+                else:
+                    samples[key][i] = res_sample[key]
+
+        for k, v in new_keys.items():
+            samples[k] = v
 
         return samples
 
 
@@ -24,6 +24,8 @@ class FlaggedWordFilter(Filter):
     """Filter to keep samples with flagged-word ratio less than a specific max
     value."""
 
+    _batched_op = True
+
     def __init__(self,
                  lang: str = 'en',
                  tokenization: bool = False,
@@ -72,53 +74,59 @@ def __init__(self,
             self.model_key = prepare_model(model_type='sentencepiece',
                                            lang=lang)
 
-    def compute_stats_single(self, sample, context=False):
+    def compute_stats_batched(self, samples, context=False):
         # check if it's computed already
-        if StatsKeys.flagged_words_ratio in sample[Fields.stats]:
-            return sample
-
-        # try to get words from context
+        samples_list = samples[self.text_key]
+        samples_stats = samples[Fields.stats]
         words_key = f'{InterVars.words}-{self.model_key}'
-        if context and words_key in sample[Fields.context]:
-            words = sample[Fields.context][words_key]
-        else:
-            tokenizer = get_model(self.model_key)
-            words = get_words_from_document(
-                sample[self.text_key],
-                token_func=tokenizer.encode_as_pieces if tokenizer else None)
-            if context:
-                sample[Fields.context][words_key] = words
-
-        # try to get refined words from context
-        refined_words_key = f'{InterVars.refined_words}-True-SPECIAL_CHARS-' \
-                            f'{self.use_words_aug}-' \
-                            f'{self.words_aug_group_sizes}-' \
-                            f'{self.words_aug_join_char}'
-        if context and refined_words_key in sample[Fields.context]:
-            words = sample[Fields.context][refined_words_key]
-        else:
-            words = words_refinement(
-                words,
-                lower_case=True,
-                strip_chars=SPECIAL_CHARACTERS,
-                use_words_aug=self.use_words_aug,
-                words_aug_group_sizes=self.words_aug_group_sizes,
-                words_aug_join_char=self.words_aug_join_char)
-            if context:
-                sample[Fields.context][refined_words_key] = words
-
-        flagged_words_ratio = (len(
-            [word
-             for word in words if word in self.FLAGGED_WORDS[self.lang]]) /
-                               len(words)) if len(words) != 0 else 0.0
-
-        if flagged_words_ratio > 1.0:
-            flagged_words_ratio = 1.0
-
-        sample[Fields.stats][
-            StatsKeys.flagged_words_ratio] = flagged_words_ratio
-        return sample
-
-    def process_single(self, sample):
-        return sample[Fields.stats][
-            StatsKeys.flagged_words_ratio] <= self.max_ratio
+        tokenizer = get_model(self.model_key)
+        for idx, stat in enumerate(samples_stats):
+            if StatsKeys.flagged_words_ratio in stat:
+                continue
+            if context and words_key in samples[Fields.context][idx]:
+                words = samples[Fields.context][idx][words_key]
+            else:
+                words = get_words_from_document(
+                    samples_list[idx],
+                    token_func=tokenizer.encode_as_pieces
+                    if tokenizer else None)
+                if context:
+                    samples[Fields.context][idx][words_key] = words
+            # try to get refined words from context
+            refined_words_key = f'{InterVars.refined_words}' \
+                                '-True-SPECIAL_CHARS-' \
+                                f'{self.use_words_aug}-' \
+                                f'{self.words_aug_group_sizes}-' \
+                                f'{self.words_aug_join_char}'
+            if context and refined_words_key in samples[Fields.context][idx]:
+                words = samples[Fields.context][idx][refined_words_key]
+            else:
+                words = words_refinement(
+                    words,
+                    lower_case=True,
+                    strip_chars=SPECIAL_CHARACTERS,
+                    use_words_aug=self.use_words_aug,
+                    words_aug_group_sizes=self.words_aug_group_sizes,
+                    words_aug_join_char=self.words_aug_join_char)
+                if context:
+                    samples[Fields.context][idx][refined_words_key] = words
+
+            flagged_words_ratio = (len([
+                word for word in words if word in self.FLAGGED_WORDS[self.lang]
+            ]) / len(words)) if len(words) != 0 else 0.0
+
+            if flagged_words_ratio > 1.0:
+                flagged_words_ratio = 1.0
+
+            samples_stats[idx][
+                StatsKeys.flagged_words_ratio] = flagged_words_ratio
+
+        return samples
+
+    def process_batched(self, samples):
+        return list(
+            map(
+                lambda stat: stat[StatsKeys.flagged_words_ratio] <= self.
+                max_ratio,
+                samples[Fields.stats],
+            ))