You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
- **Note:** Analyzer only computes stats of Filter ops. So extra Mapper or Deduplicator ops will be ignored in the analysis process.
343
+
- **Note:** Analyzer only computes stats for Filters that produce stats or other OPs that produce tags/categories in meta. So other OPs will be ignored in the analysis process. We use the following registries to decorate OPs:
344
+
- `NON_STATS_FILTERS`: decorate Filters that **DO NOT** produce any stats.
345
+
- `TAGGING_OPS`: decorate OPs that **DO** produce tags/categories in meta field.
- expand_macro_mapper: # expand macro definitions in Latex text.
81
81
- extract_entity_attribute_mapper: # Extract attributes for given entities from the text.
82
+
api_model: 'gpt-4o'# API model name.
82
83
query_entities: ["孙悟空", "猪八戒"] # Entity list to be queried.
83
84
query_attributes: ["人物性格"] # Attribute list to be queried.
84
-
api_model: 'gpt-4o'# API model name.
85
85
entity_key: '__dj__entity__'# The field name to store the given main entity for attribute extraction.
86
86
entity_attribute_key: '__dj__attribute__'# The field name to store the given attribute to be extracted.
87
87
attribute_desc_key: '__dj__attribute_description__'# The field name to store the extracted attribute description.
@@ -153,6 +153,18 @@ process:
153
153
drop_text: false # If drop the text in the output.
154
154
model_params: {} # Parameters for initializing the API model.
155
155
sampling_params: {} # Extra parameters passed to the API call. e.g {'temperature': 0.9, 'top_p': 0.95}
156
+
- extract_support_text_mapper: # extract support sub text for a summary.
157
+
api_model: 'gpt-4o'# API model name.
158
+
summary_key: '__dj__event_description__'# The field name to store the input summary. Support for nested keys such as "__dj__stats__.text_len".
159
+
support_text_key: '__dj__support_text__'# The field name to store the output support text for the summary.
160
+
api_endpoint: null # URL endpoint for the API.
161
+
response_path: null # Path to extract content from the API response. Defaults to 'choices.0.message.content'.
162
+
system_prompt: null # System prompt for the task.
163
+
input_template: null # Template for building the model input.
164
+
try_num: 3# The number of retry attempts when there is an API call error or output parsing error.
165
+
drop_text: false # If drop the text in the output.
166
+
model_params: {} # Parameters for initializing the API model.
167
+
sampling_params: {} # Extra parameters passed to the API call. e.g {'temperature': 0.9, 'top_p': 0.95}
156
168
- fix_unicode_mapper: # fix unicode errors in text.
157
169
- generate_qa_from_examples_mapper: # mapper to generate question and answer pairs from examples.
158
170
hf_model: 'Qwen/Qwen2.5-7B-Instruct'# Model name on huggingface to generate question and answer pairs.
@@ -259,12 +271,27 @@ process:
259
271
model_params: {} # Parameters for initializing the API model.
260
272
sampling_params: {} # Extra parameters passed to the API call.
261
273
- punctuation_normalization_mapper: # normalize unicode punctuations to English punctuations.
262
-
- python_python_mapper: # executing Python lambda function defined in a file.
274
+
- python_file_mapper: # executing Python lambda function defined in a file.
263
275
file_path: ''# The path to the Python file containing the function to be executed.
264
276
function_name: 'process_single'# The name of the function defined in the file to be executed.
265
277
- python_lambda_mapper: # executing Python lambda function on data samples.
266
278
lambda_str: ''# A string representation of the lambda function to be executed on data samples. If empty, the identity function is used.
267
279
batched: False # A boolean indicating whether to process input data in batches.
280
+
- relation_identity_mapper: # identify the relation between two entities in the text.
281
+
api_model: 'gpt-4o'# API model name.
282
+
source_entity: '孙悟空'# The source entity of the relation to be identified.
283
+
target_entity: '猪八戒'# The target entity of the relation to be identified.
284
+
input_key: null # The input field key in the samples. Support for nested keys such as "__dj__stats__.text_len". It is text_key in default.
285
+
output_key: null # The output field key in the samples. Support for nested keys such as "__dj__stats__.text_len". It is input_key in default.
286
+
api_endpoint: null # URL endpoint for the API.
287
+
response_path: null # Path to extract content from the API response. Defaults to 'choices.0.message.content'.
288
+
system_prompt_template: null # System prompt template for the task. Need to specify by entity1 and entity2.
289
+
input_template: null # Template for building the model input.
290
+
output_pattern_template: null # Regular expression template for parsing model output.
291
+
try_num: 3# The number of retry attempts when there is an API call error or output parsing error.
292
+
drop_text: false # If drop the text in the output.
293
+
model_params: {} # Parameters for initializing the API model.
294
+
sampling_params: {} # Extra parameters passed to the API call. e.g {'temperature': 0.9, 'top_p': 0.95}
268
295
- remove_bibliography_mapper: # remove bibliography from Latex text.
269
296
- remove_comments_mapper: # remove comments from Latex text, code, etc.
270
297
doc_type: tex # comment type you want to remove. Only support 'tex' for now.
@@ -567,7 +594,7 @@ process:
567
594
vertical_flip: false # flip frame image vertically (top to bottom).
568
595
reduce_mode: avg # reduce mode when one text corresponds to multiple videos in a chunk, must be one of ['avg','max', 'min'].
569
596
any_or_all: any # keep this sample when any/all videos meet the filter condition
570
-
mem_required: '1GB'# This operation (Op) utilizes deep neural network models that consume a significant amount of memory for computation, hence the system's available memory might constrain the maximum number of processes that can be launched
597
+
mem_required: '1500MB'# This operation (Op) utilizes deep neural network models that consume a significant amount of memory for computation, hence the system's available memory might constrain the maximum number of processes that can be launched
571
598
- video_motion_score_filter: # Keep samples with video motion scores within a specific range.
572
599
min_score: 0.25# the minimum motion score to keep samples
573
600
max_score: 10000.0# the maximum motion score to keep samples
@@ -693,3 +720,55 @@ process:
693
720
top_ratio: # ratio of selected top samples
694
721
topk: # number of selected top samples
695
722
reverse: True # determine the sorting rule, if reverse=True, then sort in descending order
723
+
724
+
# Grouper ops.
725
+
- naive_grouper: # Group all samples to one batched sample.
726
+
- key_value_grouper: # Group samples to batched samples according to values in given keys.
727
+
group_by_keys: null # Group samples according to values in the keys. Support for nested keys such as "__dj__stats__.text_len". It is [self.text_key] in default.
728
+
729
+
# Aggregator ops.
730
+
- entity_attribute_aggregator: # Return conclusion of the given entity's attribute from some docs.
731
+
api_model: 'gpt-4o'# API model name.
732
+
entity: '孙悟空'# The given entity.
733
+
attribute: '人物经历'# The given attribute.
734
+
input_key: null # The input field key in the samples. Support for nested keys such as "__dj__stats__.text_len". It is text_key in default.
735
+
output_key: null # The output field key in the samples. Support for nested keys such as "__dj__stats__.text_len". It is same as the input_key in default.
736
+
word_limit: 100# Prompt the output length.
737
+
max_token_num: null # The max token num of the total tokens of the sub documents. Without limitation if it is None.
738
+
api_endpoint: null # URL endpoint for the API.
739
+
response_path: null # Path to extract content from the API response. Defaults to 'choices.0.message.content'.
740
+
system_prompt_template: null # System prompt template for the task. Need to be specified by given entity and attribute.
741
+
example_prompt: null # The example part in the system prompt.
742
+
input_template: null # The input template.
743
+
output_pattern_template: null # The output template.
744
+
try_num: 3# The number of retry attempts when there is an API call error or output parsing error.
745
+
model_params: {} # Parameters for initializing the API model.
746
+
sampling_params: {} # Extra parameters passed to the API call. e.g {'temperature': 0.9, 'top_p': 0.95}
747
+
- most_relavant_entities_aggregator: # Extract entities closely related to a given entity from some texts, and sort them in descending order of importance.
748
+
api_model: 'gpt-4o'# API model name.
749
+
entity: '孙悟空'# The given entity.
750
+
query_entity_type: '人物'# The type of queried relevant entities.
751
+
input_key: null # The input field key in the samples. Support for nested keys such as "__dj__stats__.text_len". It is text_key in default.
752
+
output_key: null # The output field key in the samples. Support for nested keys such as "__dj__stats__.text_len". It is same as the input_key in default.
753
+
max_token_num: null # The max token num of the total tokens of the sub documents. Without limitation if it is None.
754
+
api_endpoint: null # URL endpoint for the API.
755
+
response_path: null # Path to extract content from the API response. Defaults to 'choices.0.message.content'.
756
+
system_prompt_template: null # System prompt template for the task. Need to be specified by given entity and entity_type.
757
+
input_template: null # The input template.
758
+
output_pattern: null # The output pattern.
759
+
try_num: 3# The number of retry attempts when there is an API call error or output parsing error.
760
+
model_params: {} # Parameters for initializing the API model.
761
+
sampling_params: {} # Extra parameters passed to the API call. e.g {'temperature': 0.9, 'top_p': 0.95}
762
+
- nested_aggregator: # Considering the limitation of input length, nested aggregate contents for each given number of samples.
763
+
api_model: 'gpt-4o'# API model name.
764
+
input_key: null # The input field key in the samples. Support for nested keys such as "__dj__stats__.text_len". It is text_key in default.
765
+
output_key: null # The output field key in the samples. Support for nested keys such as "__dj__stats__.text_len". It is same as the input_key in default.
766
+
max_token_num: null # The max token num of the total tokens of the sub documents. Without limitation if it is None.
767
+
api_endpoint: null # URL endpoint for the API.
768
+
response_path: null # Path to extract content from the API response. Defaults to 'choices.0.message.content'.
769
+
system_prompt: null # The system prompt.
770
+
sub_doc_template: null # The template for input text in each sample.
771
+
input_template: null # The input template.
772
+
try_num: 3# The number of retry attempts when there is an API call error or output parsing error.
773
+
model_params: {} # Parameters for initializing the API model.
774
+
sampling_params: {} # Extra parameters passed to the API call. e.g {'temperature': 0.9, 'top_p': 0.95}
0 commit comments