configs/config_all.yaml (+10)
@@ -15,6 +15,7 @@ text_keys: 'text' # the key name of fi
 suffixes: []  # the suffix of files that will be read. For example: '.txt', 'txt' or ['txt', '.pdf', 'docx']
 use_cache: true  # whether to use the cache management of Hugging Face datasets. It might take up lots of disk space when using cache
 ds_cache_dir: null  # cache dir for Hugging Face datasets. In default, it's the same as the environment variable `HF_DATASETS_CACHE`, whose default value is usually "~/.cache/huggingface/datasets". If this argument is set to a valid path by users, it will override the default cache dir
+open_monitor: true  # whether to open the monitor to trace resource utilization for each OP during data processing. It's True in default.
 use_checkpoint: false  # whether to use the checkpoint management to save the latest version of dataset to work dir when processing. Rerun the same config will reload the checkpoint and skip ops before it. Cache will be disabled when using checkpoint. If args of ops before the checkpoint are changed, all ops will be rerun from the beginning.
 temp_dir: null  # the path to the temp directory to store intermediate caches when cache is disabled, these cache files will be removed on-the-fly. In default, it's None, so the temp dir will be specified by system. NOTICE: you should be caution when setting this argument because it might cause unexpected program behaviors when this path is set to an unsafe directory.
 open_tracer: false  # whether to open the tracer to trace the changes during process. It might take more time when opening tracer
@@ -211,6 +212,7 @@ process:
       radius: 2  # radius of blur kernel
   - image_tagging_mapper:  # Mapper to generate image tags.
       tag_field_name: '__dj__image_tags__'  # the field name to store the tags. It's "__dj__image_tags__" in default.
+      mem_required: '9GB'
   - nlpaug_en_mapper:  # simply augment texts in English based on the nlpaug library
       sequential: false  # whether combine all augmentation methods to a sequence. If it's True, a sample will be augmented by all opened augmentation methods sequentially. If it's False, each opened augmentation method would generate its augmented samples independently.
       aug_num: 1  # number of augmented samples to be generated. If `sequential` is True, there will be total aug_num augmented samples generated. If it's False, there will be (aug_num * #opened_aug_method) augmented samples generated.
@@ -257,6 +259,12 @@ process:
       model_params: {}  # Parameters for initializing the API model.
       sampling_params: {}  # Extra parameters passed to the API call.
   - punctuation_normalization_mapper:  # normalize unicode punctuations to English punctuations.
+  - python_file_mapper:  # executing Python function defined in a file.
+      file_path: ''  # The path to the Python file containing the function to be executed.
+      function_name: 'process_single'  # The name of the function defined in the file to be executed.
+  - python_lambda_mapper:  # executing Python lambda function on data samples.
+      lambda_str: ''  # A string representation of the lambda function to be executed on data samples. If empty, the identity function is used.
+      batched: False  # A boolean indicating whether to process input data in batches.
   - remove_bibliography_mapper:  # remove bibliography from Latex text.
   - remove_comments_mapper:  # remove comments from Latex text, code, etc.
       doc_type: tex  # comment type you want to remove. Only support 'tex' for now.
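A minimal sketch of the kind of user code the two new ops above would execute, for illustration only: the file name `my_ops.py` and the whitespace-stripping logic are assumptions, not part of this change; only the parameter names (`file_path`, `function_name`, `lambda_str`, `batched`) and the default function name `process_single` come from the config. The sketch assumes each sample is handed to the function as a dict and the (possibly modified) dict is returned.

# my_ops.py -- hypothetical target file for python_file_mapper, configured e.g. with
#   file_path: 'my_ops.py'
#   function_name: 'process_single'
# Assumption: the mapper passes one sample in as a dict and uses the returned dict.
def process_single(sample):
    # illustrative transformation: strip surrounding whitespace from the text field
    sample['text'] = sample['text'].strip()
    return sample

# python_lambda_mapper would express the same idea inline as a string, e.g.
#   lambda_str: "lambda sample: {**sample, 'text': sample['text'].strip()}"
#   batched: False
# An empty lambda_str falls back to the identity function, as documented above.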
@@ -375,6 +383,7 @@ process:
       frame_sampling_method: 'all_keyframes'  # sampling method of extracting frame images from the videos. Should be one of ["all_keyframes", "uniform"]. The former one extracts all key frames and the latter one extract specified number of frames uniformly from the video. Default: "all_keyframes".
       frame_num: 3  # the number of frames to be extracted uniformly from the video. Only works when frame_sampling_method is "uniform". If it's 1, only the middle frame will be extracted. If it's 2, only the first and the last frames will be extracted. If it's larger than 2, in addition to the first and the last frames, other frames will be extracted uniformly within the video duration.
       tag_field_name: '__dj__video_frame_tags__'  # the field name to store the tags. It's "__dj__video_frame_tags__" in default.
+      mem_required: '9GB'
   - whitespace_normalization_mapper:  # normalize different kinds of whitespaces to English whitespace.

   # Filter ops
@@ -607,6 +616,7 @@ process:
       frame_num: 3  # the number of frames to be extracted uniformly from the video. Only works when frame_sampling_method is "uniform". If it's 1, only the middle frame will be extracted. If it's 2, only the first and the last frames will be extracted. If it's larger than 2, in addition to the first and the last frames, other frames will be extracted uniformly within the video duration.
       tag_field_name: '__dj__video_frame_tags__'  # the field name to store the tags. It's "__dj__video_frame_tags__" in default.
       any_or_all: any  # keep this sample when any/all videos meet the filter condition
+      mem_required: '9GB'
   - words_num_filter:  # filter text with number of words out of specific range
       lang: en  # sample in which language
       tokenization: false  # whether to use model to tokenize documents
0 commit comments