Support Ray actor mode for Mapper and Filter ops

Cathy0908 · Cathy0908 · commit f27e8c7be3b3 · 2024-12-11T20:38:03.000+08:00
diff --git a/data_juicer/core/ray_data.py b/data_juicer/core/ray_data.py
@@ -110,15 +110,55 @@ def _run_single_op(self, op):
             batch_size = getattr(op, 'batch_size',
                                  1) if op.is_batched_op() else 1
             if isinstance(op, Mapper):
-                self.data = self.data.map_batches(op.process,
-                                                  batch_size=batch_size,
-                                                  batch_format='pyarrow',
-                                                  num_gpus=num_gpus)
+                if op.use_ray_actor():
+                    # TODO: auto calculate concurrency
+                    concurrency = getattr(op, 'concurrency', 1)
+
+                    init_params = op._init_parameters
+                    op_args = init_params.pop('args', ())
+                    op_kwargs = init_params.pop('kwargs', {})
+                    op_kwargs.update(init_params)
+
+                    self.data = self.data.map_batches(
+                        op.__class__,
+                        fn_args=None,
+                        fn_kwargs=None,
+                        fn_constructor_args=op_args,
+                        fn_constructor_kwargs=op_kwargs,
+                        batch_size=batch_size,
+                        num_gpus=num_gpus,
+                        concurrency=concurrency,
+                        batch_format='pyarrow')
+                else:
+                    self.data = self.data.map_batches(op.process,
+                                                      batch_size=batch_size,
+                                                      batch_format='pyarrow',
+                                                      num_gpus=num_gpus)
             elif isinstance(op, Filter):
-                self.data = self.data.map_batches(op.compute_stats,
-                                                  batch_size=batch_size,
-                                                  batch_format='pyarrow',
-                                                  num_gpus=num_gpus)
+                if op.use_ray_actor():
+                    # TODO: auto calculate concurrency
+                    concurrency = getattr(op, 'concurrency', 1)
+
+                    init_params = op._init_parameters
+                    op_args = init_params.pop('args', ())
+                    op_kwargs = init_params.pop('kwargs', {})
+                    op_kwargs.update(init_params)
+
+                    self.data = self.data.map_batches(
+                        op.__class__,
+                        fn_args=None,
+                        fn_kwargs=None,
+                        fn_constructor_args=op_args,
+                        fn_constructor_kwargs=op_kwargs,
+                        batch_size=batch_size,
+                        num_gpus=num_gpus,
+                        concurrency=concurrency,
+                        batch_format='pyarrow')
+                else:
+                    self.data = self.data.map_batches(op.compute_stats,
+                                                      batch_size=batch_size,
+                                                      batch_format='pyarrow',
+                                                      num_gpus=num_gpus)
                 if op.stats_export_path is not None:
                     self.data.write_json(op.stats_export_path,
                                          force_ascii=False)
diff --git a/data_juicer/ops/base_op.py b/data_juicer/ops/base_op.py
@@ -118,6 +118,7 @@ class OP:
 
     _accelerator = 'cpu'
     _batched_op = False
+    _ray_mode = 'task'
 
     def __init__(self, *args, **kwargs):
         """
@@ -143,6 +144,7 @@ def __init__(self, *args, **kwargs):
         self.history_key = kwargs.get('history_key', 'history')
 
         self.batch_size = kwargs.get('batch_size', 1000)
+        self.concurrency = kwargs.get('concurrency', 1)
 
         # whether the model can be accelerated using cuda
         _accelerator = kwargs.get('accelerator', None)
@@ -172,6 +174,9 @@ def __init__(self, *args, **kwargs):
     def is_batched_op(self):
         return self._batched_op
 
+    def use_ray_actor(self):
+        return self._ray_mode == 'actor'
+
     def process(self, *args, **kwargs):
         raise NotImplementedError
 
@@ -255,6 +260,9 @@ def __init_subclass__(cls, **kwargs):
                     f'{cls.__name__}. Please implement {method_name}_single '
                     f'or {method_name}_batched.')
 
+    def __call__(self, *args, **kwargs):
+        return self.process(*args, **kwargs)
+
     def process_batched(self, samples, *args, **kwargs):
         keys = samples.keys()
         first_key = next(iter(keys))
@@ -330,6 +338,9 @@ def __init_subclass__(cls, **kwargs):
                     f'{cls.__name__}. Please implement {method_name}_single '
                     f'or {method_name}_batched.')
 
+    def __call__(self, *args, **kwargs):
+        return self.compute_stats(*args, **kwargs)
+
     def compute_stats_batched(self, samples, *args, **kwargs):
         keys = samples.keys()
         num_samples = len(samples[Fields.stats])
diff --git a/data_juicer/ops/filter/image_nsfw_filter.py b/data_juicer/ops/filter/image_nsfw_filter.py
@@ -19,6 +19,7 @@ class ImageNSFWFilter(Filter):
     """Filter to keep samples whose images have low nsfw scores."""
 
     _accelerator = 'cuda'
+    _ray_mode = 'actor'
 
     def __init__(self,
                  hf_nsfw_model: str = 'Falconsai/nsfw_image_detection',
@@ -42,6 +43,8 @@ def __init__(self,
         :param kwargs: extra args
         """
         super().__init__(*args, **kwargs)
+        self._init_parameters = self.remove_extra_parameters(locals())
+
         self.score_threshold = score_threshold
         if any_or_all not in ['any', 'all']:
             raise ValueError(f'Keep strategy [{any_or_all}] is not supported. '
diff --git a/data_juicer/ops/mapper/image_captioning_mapper.py b/data_juicer/ops/mapper/image_captioning_mapper.py
@@ -30,6 +30,7 @@ class ImageCaptioningMapper(Mapper):
 
     _accelerator = 'cuda'
     _batched_op = True
+    _ray_mode = 'actor'
 
     def __init__(self,
                  hf_img2seq: str = 'Salesforce/blip2-opt-2.7b',
@@ -82,6 +83,7 @@ def __init__(self,
         :param kwargs: extra args
         """
         super().__init__(*args, **kwargs)
+        self._init_parameters = self.remove_extra_parameters(locals())
 
         if keep_candidate_mode not in [
                 'random_any', 'similar_one_simhash', 'all'
@@ -282,6 +284,11 @@ def process_batched(self, samples, rank=None):
         :param samples:
         :return:
         """
+        import pyarrow as pa
+
+        if isinstance(samples, pa.Table):
+            samples = samples.to_pydict()
+
         # reconstruct samples from "dict of lists" to "list of dicts"
         reconstructed_samples = []
         for i in range(len(samples[self.text_key])):