Remove the `_ray_mode` attribute and optimize the API

Cathy0908 · Cathy0908 · commit 3c10162781af · 2024-12-20T14:20:19.000+08:00
diff --git a/data_juicer/core/ray_data.py b/data_juicer/core/ray_data.py
@@ -118,10 +118,7 @@ def _run_single_op(self, op):
             batch_size = getattr(op, 'batch_size',
                                  1) if op.is_batched_op() else 1
             if isinstance(op, Mapper):
-                if op.use_ray_actor():
-                    # TODO: auto calculate concurrency
-                    concurrency = getattr(op, 'concurrency', 1)
-
+                if op.use_cuda():
                     init_params = op._init_parameters
                     op_args = init_params.pop('args', ())
                     op_kwargs = init_params.pop('kwargs', {})
@@ -135,18 +132,15 @@ def _run_single_op(self, op):
                         fn_constructor_kwargs=op_kwargs,
                         batch_size=batch_size,
                         num_gpus=num_gpus,
-                        concurrency=concurrency,
+                        concurrency=op_proc,
                         batch_format='pyarrow')
                 else:
                     self.data = self.data.map_batches(op.process,
                                                       batch_size=batch_size,
                                                       batch_format='pyarrow',
                                                       num_gpus=num_gpus)
             elif isinstance(op, Filter):
-                if op.use_ray_actor():
-                    # TODO: auto calculate concurrency
-                    concurrency = getattr(op, 'concurrency', 1)
-
+                if op.use_cuda():
                     init_params = op._init_parameters
                     op_args = init_params.pop('args', ())
                     op_kwargs = init_params.pop('kwargs', {})
@@ -160,7 +154,7 @@ def _run_single_op(self, op):
                         fn_constructor_kwargs=op_kwargs,
                         batch_size=batch_size,
                         num_gpus=num_gpus,
-                        concurrency=concurrency,
+                        concurrency=op_proc,
                         batch_format='pyarrow')
                 else:
                     self.data = self.data.map_batches(op.compute_stats,
diff --git a/data_juicer/ops/base_op.py b/data_juicer/ops/base_op.py
@@ -118,7 +118,6 @@ class OP:
 
     _accelerator = 'cpu'
     _batched_op = False
-    _ray_mode = 'task'
 
     def __init__(self, *args, **kwargs):
         """
@@ -174,9 +173,6 @@ def __init__(self, *args, **kwargs):
     def is_batched_op(self):
         return self._batched_op
 
-    def use_ray_actor(self):
-        return self._ray_mode == 'actor'
-
     def process(self, *args, **kwargs):
         raise NotImplementedError
 
diff --git a/data_juicer/ops/filter/image_nsfw_filter.py b/data_juicer/ops/filter/image_nsfw_filter.py
@@ -19,7 +19,6 @@ class ImageNSFWFilter(Filter):
     """Filter to keep samples whose images have low nsfw scores."""
 
     _accelerator = 'cuda'
-    _ray_mode = 'actor'
 
     def __init__(self,
                  hf_nsfw_model: str = 'Falconsai/nsfw_image_detection',
diff --git a/data_juicer/ops/mapper/image_captioning_mapper.py b/data_juicer/ops/mapper/image_captioning_mapper.py
@@ -30,7 +30,6 @@ class ImageCaptioningMapper(Mapper):
 
     _accelerator = 'cuda'
     _batched_op = True
-    _ray_mode = 'actor'
 
     def __init__(self,
                  hf_img2seq: str = 'Salesforce/blip2-opt-2.7b',
diff --git a/data_juicer/utils/process_utils.py b/data_juicer/utils/process_utils.py
@@ -57,32 +57,50 @@ def calculate_np(name,
     """Calculate the optimum number of processes for the given OP"""
     eps = 1e-9  # about 1 byte
 
-    if num_proc is None:
-        num_proc = psutil.cpu_count()
-
     if use_cuda:
+        auto_num_proc = None
         cuda_mem_available = get_min_cuda_memory() / 1024
-        op_proc = min(
-            num_proc,
-            math.floor(cuda_mem_available / (mem_required + eps)) *
-            cuda_device_count())
-        if use_cuda and mem_required == 0:
+        if mem_required == 0:
             logger.warning(f'The required cuda memory of Op[{name}] '
                            f'has not been specified. '
                            f'Please specify the mem_required field in the '
                            f'config file, or you might encounter CUDA '
                            f'out of memory error. You can reference '
                            f'the mem_required field in the '
                            f'config_all.yaml file.')
-        if op_proc < 1.0:
-            logger.warning(f'The required cuda memory:{mem_required}GB might '
-                           f'be more than the available cuda memory:'
-                           f'{cuda_mem_available}GB.'
-                           f'This Op[{name}] might '
-                           f'require more resource to run.')
+        else:
+            auto_num_proc = math.floor(
+                cuda_mem_available / mem_required) * cuda_device_count()
+            if cuda_mem_available / mem_required < 1.0:
+                logger.warning(
+                    f'The required cuda memory:{mem_required}GB might '
+                    f'be more than the available cuda memory:'
+                    f'{cuda_mem_available}GB.'
+                    f'This Op[{name}] might '
+                    f'require more resource to run.')
+
+        if auto_num_proc and num_proc:
+            op_proc = min(auto_num_proc, num_proc)
+            if num_proc > auto_num_proc:
+                logger.warning(
+                    f'The given num_proc: {num_proc} is greater than '
+                    f'the value {auto_num_proc} auto calculated based '
+                    f'on the mem_required of Op[{name}]. '
+                    f'Set the `num_proc` to {auto_num_proc}.')
+        elif not auto_num_proc and not num_proc:
+            op_proc = cuda_device_count()
+            logger.warning(
+                f'Both mem_required and num_proc of Op[{name}] are not set.'
+                f'Set the `num_proc` to {op_proc}.')
+        else:
+            op_proc = auto_num_proc if auto_num_proc else num_proc
+
         op_proc = max(op_proc, 1)
         return op_proc
     else:
+        if num_proc is None:
+            num_proc = psutil.cpu_count()
+
         op_proc = num_proc
         cpu_available = psutil.cpu_count()
         mem_available = psutil.virtual_memory().available