You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
Copy file name to clipboardexpand all lines: configs/config_all.yaml
+7-7
Original file line number
Diff line number
Diff line change
@@ -782,15 +782,15 @@ process:
782
782
- video_deduplicator: # deduplicator to deduplicate samples at document-level using exact matching of videos between documents.
783
783
consider_text: false # whether to consider text hash together with video hash when applying deduplication.
784
784
- ray_video_deduplicator: # the simple video deduplicator that can run on multi-nodes using md5 hashing exact matching method
785
-
redis_host: 'redis_host' # the host of the redis instance
786
-
redis_port: 6380 # the port of redis instance, please note that the default port of redis is 6379 which is the same as default port for ray, so we need to modify the default redis config to use it in other port
785
+
backend: 'ray_actor' # the backend for dedup, either 'ray_actor' or 'redis'
786
+
redis_address: 'redis://localhost:6379' # the address of redis server
787
787
- ray_image_deduplicator: # the simple image deduplicator that can deduplicate samples at document-level using exact matching of images between documents.
788
-
redis_host: 'redis_host' # the host of the redis instance
789
-
redis_port: 6380 # the port of redis instance, please note that the default port of redis is 6379 which is the same as default port for ray, so we need to modify the default redis config to use it in other port
790
-
method: phash # hash method for image. One of [phash, dhash, whash, ahash]
788
+
backend: 'ray_actor' # the backend for dedup, either 'ray_actor' or 'redis'
789
+
redis_address: 'redis://localhost:6379' # the address of redis server
790
+
method: phash # hash method for image. One of [phash, dhash, whash, ahash]
791
791
- ray_document_deduplicator: # the simple document deduplicator that can run on multi-nodes using md5 hashing exact matching method
792
-
redis_host: 'redis_host' # the host of the redis instance
793
-
redis_port: 6380 # the port of redis instance, please note that the default port of redis is 6379 which is the same as default port for ray, so we need to modify the default redis config to use it in other port
792
+
backend: 'ray_actor' # the backend for dedup, either 'ray_actor' or 'redis'
793
+
redis_address: 'redis://localhost:6379' # the address of redis server
794
794
lowercase: false # whether to convert text to lower case
795
795
ignore_non_character: false # whether to ignore non-alphabet characters, including whitespaces, digits, and punctuations
796
796
- ray_bts_minhash_deduplicator: # the document deduplicator that can run on multi-nodes using minhashLSH algorithm
0 commit comments