Skip to content

Commit f030f47

Browse files
committed
Fix missing meta field for tagging ops in Ray mode
1 parent 8679da7 commit f030f47

File tree

1 file changed

+13
-0
lines changed

1 file changed

+13
-0
lines changed

data_juicer/core/ray_data.py

+13
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
from data_juicer import cuda_device_count
1111
from data_juicer.core.data import DJDataset
1212
from data_juicer.ops import Deduplicator, Filter, Mapper
13+
from data_juicer.ops.base_op import TAGGING_OPS
1314
from data_juicer.utils.constant import Fields
1415
from data_juicer.utils.lazy_loader import LazyLoader
1516
from data_juicer.utils.process_utils import calculate_np
@@ -108,6 +109,18 @@ def _run_single_op(self, op):
108109
op_proc = calculate_np(op._name, op.mem_required, op.cpu_required,
109110
self.num_proc, op.use_cuda())
110111
num_gpus = get_num_gpus(op, op_proc)
112+
113+
if op._name in TAGGING_OPS.modules and Fields.meta not in self.data.columns(
114+
):
115+
116+
# Batch-level helper that adds the meta column to a pyarrow table.
#
# It is defined only when the current op is registered in TAGGING_OPS and
# the dataset has no `Fields.meta` column yet, and is then applied through
# `self.data.map_batches(..., batch_format='pyarrow')`.
#
# Args:
#     table: the batch to extend, as a pyarrow.Table.
#
# Returns:
#     The table returned by `table.append_column`, carrying an extra column
#     named `Fields.meta` with one empty dict per row of `table`.
#
# NOTE(review): `new_talbe` is a misspelling of `new_table`; the name is
# local to this helper, so it has no effect outside it.
def process_batch_arrow(table: pyarrow.Table):
117+
new_column_data = [{} for _ in range(len(table))]  # one empty dict per row
118+
new_talbe = table.append_column(Fields.meta, [new_column_data])  # the list is wrapped in an outer list
119+
return new_talbe
120+
121+
self.data = self.data.map_batches(process_batch_arrow,
122+
batch_format='pyarrow')
123+
111124
try:
112125
batch_size = getattr(op, 'batch_size',
113126
1) if op.is_batched_op() else 1

0 commit comments

Comments
 (0)