modelscope
diff --git a/data_juicer/utils/asset_utils.py b/data_juicer/utils/asset_utils.py
+2
diff --git a/data_juicer/utils/compress.py b/data_juicer/utils/compress.py
+3-3
diff --git a/data_juicer/utils/mm_utils.py b/data_juicer/utils/mm_utils.py
+3-3
diff --git a/data_juicer/utils/registry.py b/data_juicer/utils/registry.py
+1-4
diff --git a/tests/run.py b/tests/run.py
+1
diff --git a/tests/utils/test_asset_utils.py b/tests/utils/test_asset_utils.py
+57
diff --git a/tests/utils/test_auto_install_mapping.py b/tests/utils/test_auto_install_mapping.py
+12
diff --git a/tests/utils/test_auto_install_utils.py b/tests/utils/test_auto_install_utils.py
+24
diff --git a/tests/utils/test_availablility_utils.py b/tests/utils/test_availablility_utils.py
+23
diff --git a/tests/utils/test_cache_utils.py b/tests/utils/test_cache_utils.py
+36
diff --git a/tests/utils/test_ckpt_utils.py b/tests/utils/test_ckpt_utils.py
+81
diff --git a/tests/utils/test_common_utils.py b/tests/utils/test_common_utils.py
+57
@@ -48,6 +48,8 @@ def load_words_asset(words_dir: str, words_type: str):
         logger.info(f'Specified {words_dir} does not contain '
                     f'any {words_type} files in json format, now '
                     'download the one cached by data_juicer team')
+        if words_type not in ASSET_LINKS:
+            raise ValueError(f'{words_type} is not in remote server.')
         response = requests.get(ASSET_LINKS[words_type])
         words_dict = response.json()
         # cache the asset file locally
 
@@ -23,8 +23,8 @@ class FileLock(HF_FileLock):
     def _release(self):
         super()._release()
         try:
-            # logger.debug(f'Remove {self._lock_file}')
-            os.remove(self._lock_file)
+            # logger.debug(f'Remove {self.lock_file}')
+            os.remove(self.lock_file)
         # The file is already deleted and that's what we want.
         except OSError:
             pass
@@ -497,4 +497,4 @@ def decompress(ds, fingerprints=None, num_proc=1):
 
 
 def cleanup_compressed_cache_files(ds):
-    CacheCompressManager().cleanup_cache_files(ds)
+    CacheCompressManager(cache_utils.CACHE_COMPRESS).cleanup_cache_files(ds)
@@ -160,9 +160,9 @@ def iou(box1, box2):
     ix_max = min(x1_max, x2_max)
     iy_min = max(y1_min, y2_min)
     iy_max = min(y1_max, y2_max)
-    intersection = max(0, (ix_max - ix_min) * (iy_max - iy_min))
+    intersection = max(0, max(0, ix_max - ix_min) * max(0, iy_max - iy_min))
     union = area1 + area2 - intersection
-    return 1.0 * intersection / union
+    return 1.0 * intersection / union if union != 0 else 0.0
 
 
 def calculate_resized_dimensions(
@@ -207,7 +207,7 @@ def calculate_resized_dimensions(
 
     # Determine final dimensions based on original orientation
     resized_dimensions = ((new_short_edge,
-                           new_long_edge) if width <= height else
+                           new_long_edge) if width >= height else
                           (new_long_edge, new_short_edge))
 
     # Ensure final dimensions are divisible by the specified value
 
@@ -17,8 +17,6 @@
 #  https://github.com/modelscope/modelscope/blob/master/modelscope/utils/registry.py
 # --------------------------------------------------------
 
-from loguru import logger
-
 
 class Registry(object):
     """This class is used to register some modules to registry by a repo
@@ -53,8 +51,7 @@ def modules(self):
 
     def list(self):
         """Logging the list of module in current registry."""
-        for m in self._modules.keys():
-            logger.info(f'{self._name}\t{m}')
+        return list(self._modules.keys())
 
     def get(self, module_key):
         """
 
@@ -97,6 +97,7 @@ def main():
     runner = unittest.TextTestRunner()
     test_suite = gather_test_cases(os.path.abspath(args.test_dir),
                                    args.pattern, args.tag, args.mode)
+    logger.info(f'There are {len(test_suite._tests)} test cases to run.')
     res = runner.run(test_suite)
 
     cov.stop()
 
@@ -0,0 +1,57 @@
+import os
+import json
+import unittest
+
+from data_juicer.utils.asset_utils import load_words_asset
+
+from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase
+
+class LoadWordsAssetTest(DataJuicerTestCaseBase):
+
+    def setUp(self) -> None:
+        self.temp_output_path = 'tmp/test_asset_utils/'
+
+    def tearDown(self):
+        if os.path.exists(self.temp_output_path):
+            os.system(f'rm -rf {self.temp_output_path}')
+
+    def test_basic_func(self):
+        # download assets from the remote server
+        words_dict = load_words_asset(self.temp_output_path, 'stopwords')
+        self.assertTrue(len(words_dict) > 0)
+        self.assertTrue(os.path.exists(os.path.join(self.temp_output_path, 'stopwords.json')))
+
+        words_dict = load_words_asset(self.temp_output_path, 'flagged_words')
+        self.assertTrue(len(words_dict) > 0)
+        self.assertTrue(os.path.exists(os.path.join(self.temp_output_path, 'flagged_words.json')))
+
+        # non-existing asset
+        with self.assertRaises(ValueError):
+            load_words_asset(self.temp_output_path, 'non_existing_asset')
+
+    def test_load_from_existing_file(self):
+        os.makedirs(self.temp_output_path, exist_ok=True)
+        temp_asset = os.path.join(self.temp_output_path, 'temp_asset.json')
+        with open(temp_asset, 'w') as fout:
+            json.dump({'test_key': ['test_val']}, fout)
+
+        words_list = load_words_asset(self.temp_output_path, 'temp_asset')
+        self.assertEqual(len(words_list), 1)
+        self.assertEqual(len(words_list['test_key']), 1)
+
+    def test_load_from_serial_files(self):
+        os.makedirs(self.temp_output_path, exist_ok=True)
+        temp_asset = os.path.join(self.temp_output_path, 'temp_asset_v1.json')
+        with open(temp_asset, 'w') as fout:
+            json.dump({'test_key': ['test_val_1']}, fout)
+        temp_asset = os.path.join(self.temp_output_path, 'temp_asset_v2.json')
+        with open(temp_asset, 'w') as fout:
+            json.dump({'test_key': ['test_val_2']}, fout)
+
+        words_list = load_words_asset(self.temp_output_path, 'temp_asset')
+        self.assertEqual(len(words_list), 1)
+        self.assertEqual(len(words_list['test_key']), 2)
+
+
+if __name__ == '__main__':
+    unittest.main()
@@ -0,0 +1,12 @@
+import unittest
+
+from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase
+
+class AutoInstallMappingTest(DataJuicerTestCaseBase):
+
+    def test_placeholder(self):
+        pass
+
+
+if __name__ == '__main__':
+    unittest.main()
@@ -0,0 +1,24 @@
+import unittest
+
+from data_juicer.utils.auto_install_utils import _is_module_installed, _is_package_installed
+
+from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase
+
+class IsXXXInstalledFuncsTest(DataJuicerTestCaseBase):
+
+    def test_is_module_installed(self):
+        self.assertTrue(_is_module_installed('datasets'))
+        self.assertTrue(_is_module_installed('simhash'))
+
+        self.assertFalse(_is_module_installed('non_existent_module'))
+
+    def test_is_package_installed(self):
+        self.assertTrue(_is_package_installed('datasets'))
+        self.assertTrue(_is_package_installed('ram@git+https://github.com/xinyu1205/recognize-anything.git'))
+        self.assertTrue(_is_package_installed('scenedetect[opencv]'))
+
+        self.assertFalse(_is_package_installed('non_existent_package'))
+
+
+if __name__ == '__main__':
+    unittest.main()
@@ -0,0 +1,23 @@
+import unittest
+
+from data_juicer.utils.availability_utils import _is_package_available
+from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase
+
+class AvailabilityUtilsTest(DataJuicerTestCaseBase):
+
+    def test_is_package_available(self):
+        exist = _is_package_available('fsspec')
+        self.assertTrue(exist)
+        exist, version = _is_package_available('fsspec', return_version=True)
+        self.assertTrue(exist)
+        self.assertEqual(version, '2023.5.0')
+
+        exist = _is_package_available('non_existing_package')
+        self.assertFalse(exist)
+        exist, version = _is_package_available('non_existing_package', return_version=True)
+        self.assertFalse(exist)
+        self.assertEqual(version, 'N/A')
+
+
+if __name__ == '__main__':
+    unittest.main()
@@ -0,0 +1,36 @@
+import unittest
+
+import datasets
+
+from data_juicer.utils.cache_utils import DatasetCacheControl, dataset_cache_control
+
+from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase
+
+class DatasetCacheControlTest(DataJuicerTestCaseBase):
+
+    def test_basic_func(self):
+        self.assertTrue(datasets.is_caching_enabled())
+        with DatasetCacheControl(on=False):
+            self.assertFalse(datasets.is_caching_enabled())
+        self.assertTrue(datasets.is_caching_enabled())
+
+        with DatasetCacheControl(on=False):
+            self.assertFalse(datasets.is_caching_enabled())
+            with DatasetCacheControl(on=True):
+                self.assertTrue(datasets.is_caching_enabled())
+            self.assertFalse(datasets.is_caching_enabled())
+        self.assertTrue(datasets.is_caching_enabled())
+
+    def test_decorator(self):
+
+        @dataset_cache_control(on=False)
+        def check():
+            return datasets.is_caching_enabled()
+
+        self.assertTrue(datasets.is_caching_enabled())
+        self.assertFalse(check())
+        self.assertTrue(datasets.is_caching_enabled())
+
+
+if __name__ == '__main__':
+    unittest.main()
@@ -0,0 +1,81 @@
+import os
+import unittest
+import json
+
+from data_juicer.core.data import NestedDataset
+from data_juicer.utils.ckpt_utils import CheckpointManager
+from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase
+
+class CkptUtilsTest(DataJuicerTestCaseBase):
+
+    def setUp(self) -> None:
+        self.temp_output_path = 'tmp/test_ckpt_utils/'
+
+    def tearDown(self):
+        if os.path.exists(self.temp_output_path):
+            os.system(f'rm -rf {self.temp_output_path}')
+
+    def test_basic_func(self):
+        ckpt_path = os.path.join(self.temp_output_path, 'ckpt_1')
+        manager = CheckpointManager(ckpt_path, original_process_list=[
+            {'test_op_1': {'test_key': 'test_value_1'}},
+            {'test_op_2': {'test_key': 'test_value_2'}},
+        ])
+        self.assertEqual(manager.get_left_process_list(), [
+            {'test_op_1': {'test_key': 'test_value_1'}},
+            {'test_op_2': {'test_key': 'test_value_2'}},
+        ])
+        self.assertFalse(manager.ckpt_available)
+
+        self.assertFalse(manager.check_ckpt())
+        os.makedirs(ckpt_path, exist_ok=True)
+        os.makedirs(os.path.join(ckpt_path, 'latest'), exist_ok=True)
+        with open(os.path.join(ckpt_path, 'ckpt_op.json'), 'w') as fout:
+            json.dump([
+                {'test_op_1': {'test_key': 'test_value_1'}},
+            ], fout)
+        self.assertTrue(manager.check_ops_to_skip())
+
+        manager = CheckpointManager(ckpt_path, original_process_list=[
+            {'test_op_1': {'test_key': 'test_value_1'}},
+            {'test_op_2': {'test_key': 'test_value_2'}},
+        ])
+        with open(os.path.join(ckpt_path, 'ckpt_op.json'), 'w') as fout:
+            json.dump([
+                {'test_op_1': {'test_key': 'test_value_1'}},
+                {'test_op_2': {'test_key': 'test_value_2'}},
+            ], fout)
+        self.assertFalse(manager.check_ops_to_skip())
+
+    def test_different_ops(self):
+        ckpt_path = os.path.join(self.temp_output_path, 'ckpt_2')
+        os.makedirs(ckpt_path, exist_ok=True)
+        os.makedirs(os.path.join(ckpt_path, 'latest'), exist_ok=True)
+        with open(os.path.join(ckpt_path, 'ckpt_op.json'), 'w') as fout:
+            json.dump([
+                {'test_op_2': {'test_key': 'test_value_2'}},
+            ], fout)
+        manager = CheckpointManager(ckpt_path, original_process_list=[
+            {'test_op_1': {'test_key': 'test_value_1'}},
+            {'test_op_2': {'test_key': 'test_value_2'}},
+        ])
+        self.assertFalse(manager.ckpt_available)
+
+    def test_save_and_load_ckpt(self):
+        ckpt_path = os.path.join(self.temp_output_path, 'ckpt_3')
+        test_data = {
+            'text': ['text1', 'text2', 'text3'],
+        }
+        dataset = NestedDataset.from_dict(test_data)
+        manager = CheckpointManager(ckpt_path, original_process_list=[])
+        self.assertFalse(os.path.exists(os.path.join(manager.ckpt_ds_dir, 'dataset_info.json')))
+        manager.record({'test_op_1': {'test_key': 'test_value_1'}})
+        manager.save_ckpt(dataset)
+        self.assertTrue(os.path.exists(os.path.join(manager.ckpt_ds_dir, 'dataset_info.json')))
+        self.assertTrue(os.path.exists(manager.ckpt_op_record))
+        loaded_ckpt = manager.load_ckpt()
+        self.assertDatasetEqual(dataset, loaded_ckpt)
+
+
+if __name__ == '__main__':
+    unittest.main()
@@ -0,0 +1,57 @@
+import unittest
+import sys
+
+from data_juicer.utils.common_utils import (
+    stats_to_number, dict_to_hash, nested_access, is_string_list,
+    avg_split_string_list_under_limit, is_float
+)
+
+from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase
+
+class CommonUtilsTest(DataJuicerTestCaseBase):
+
+    def test_stats_to_number(self):
+        self.assertEqual(stats_to_number('1.0'), 1.0)
+        self.assertEqual(stats_to_number([1.0, 2.0, 3.0]), 2.0)
+
+        self.assertEqual(stats_to_number([]), -sys.maxsize)
+        self.assertEqual(stats_to_number(None), -sys.maxsize)
+        self.assertEqual(stats_to_number([], reverse=False), sys.maxsize)
+        self.assertEqual(stats_to_number(None, reverse=False), sys.maxsize)
+
+    def test_dict_to_hash(self):
+        self.assertEqual(len(dict_to_hash({'a': 1, 'b': 2})), 64)
+        self.assertEqual(len(dict_to_hash({'a': 1, 'b': 2}, hash_length=32)), 32)
+
+    def test_nested_access(self):
+        self.assertEqual(nested_access({'a': {'b': 1}}, 'a.b'), 1)
+        self.assertEqual(nested_access({'a': [{'b': 1}]}, 'a.0.b', digit_allowed=True), 1)
+        self.assertEqual(nested_access({'a': [{'b': 1}]}, 'a.0.b', digit_allowed=False), None)
+
+    def test_is_string_list(self):
+        self.assertTrue(is_string_list(['a', 'b', 'c']))
+        self.assertFalse(is_string_list([1, 2, 3]))
+        self.assertFalse(is_string_list(['a', 2, 'c']))
+
+    def test_is_float(self):
+        self.assertTrue(is_float('1.0'))
+        self.assertTrue(is_float(1.0))
+        self.assertTrue(is_float('1e-4'))
+        self.assertFalse(is_float('a'))
+
+    def test_avg_split_string_list_under_limit(self):
+        test_data = [
+            (['a', 'b', 'c'], [1, 2, 3], None, [['a', 'b', 'c']]),
+            (['a', 'b', 'c'], [1, 2, 3], 3, [['a', 'b'], ['c']]),
+            (['a', 'b', 'c'], [1, 2, 3], 2, [['a'], ['b'], ['c']]),
+            (['a', 'b', 'c', 'd', 'e'], [1, 2, 3, 1, 1], 3, [['a', 'b'], ['c'], ['d', 'e']]),
+            (['a', 'b', 'c'], [1, 2], 3, [['a', 'b', 'c']]),
+            (['a', 'b', 'c'], [1, 2, 3], 100, [['a', 'b', 'c']]),
+        ]
+
+        for str_list, token_nums, max_token_num, expected_result in test_data:
+            self.assertEqual(avg_split_string_list_under_limit(str_list, token_nums, max_token_num), expected_result)
+
+
+if __name__ == '__main__':
+    unittest.main()