136 changes: 136 additions & 0 deletions examples/Perkey-Keyphrases/PerkeyParser.py
@@ -0,0 +1,136 @@
import json
import os
import sys
sys.path.insert(0, r'./')
from tqdm.auto import tqdm
from dataclasses import dataclass, field
import time

from configs import Config
from translator import DataParser, VerboseCallback

PARSER_NAME = "PerkeyKeyphrases"

@dataclass
class PerkeyConfig(Config):
qas_id: str = None
title: str = None
summary: str = None
body: str = None
keyphrases: list = field(default_factory=list)
category: str = None
url: str = None

@classmethod
def get_keys(cls):
return ['qas_id', 'title', 'summary', 'body', 'keyphrases', 'category', 'url']


class PerkeyKeyphrasesParser(DataParser):
def __init__(self, file_path: str, output_path: str, target_lang: str="en",
max_example_per_thread=1000, large_chunks_threshold=100000):
super().__init__(file_path, output_path,
parser_name=PARSER_NAME,
target_config=PerkeyConfig,
target_fields=['title', 'summary', 'body', 'keyphrases', 'category'],
do_translate=True,
no_translated_code=False,
verbose=False,
source_lang="fa",
target_lang=target_lang,
max_example_per_thread=max_example_per_thread,
large_chunks_threshold=large_chunks_threshold,
parser_callbacks=[VerboseCallback])

def read(self) -> None:
super(PerkeyKeyphrasesParser, self).read()

with open(self.file_path, 'r', encoding='utf-8') as f:
self.data_read = json.load(f)

print(f"Read {len(self.data_read)} records from {self.file_path}")
return None

def convert(self) -> None:
super(PerkeyKeyphrasesParser, self).convert()

data_converted = []
for data in tqdm(self.data_read, desc="Converting data"):
data_dict = {}

data_dict['qas_id'] = self.id_generator()

data_dict['title'] = data.get('title', '')
data_dict['summary'] = data.get('summary', '')
data_dict['body'] = data.get('body', '')
data_dict['keyphrases'] = data.get('keyphrases', [])
data_dict['category'] = data.get('category', '')

data_dict['url'] = data.get('url', '')

data_converted.append(data_dict)

self.converted_data = data_converted
print(f"Converted {len(self.converted_data)} records")

return None

def post_process(self):
"""Convert the translated data back to the original dataset structure"""
if not hasattr(self, 'converted_data_translated') or not self.converted_data_translated:
print("No translated data available to save in original format")
return

original_format = []
for item in self.converted_data_translated:
entry = {
"title": item.get('title', ''),
"category": item.get('category', ''),
"keyphrases": item.get('keyphrases', []),
"body": item.get('body', ''),
"summary": item.get('summary', ''),
"url": item.get('url', '')
}
original_format.append(entry)

time_now = time.strftime("%Y%m%d-%H%M%S")

output_path = os.path.join(self.output_dir, f'{self.parser_name}_translated_original_format_{time_now}.json')
with open(output_path, 'w', encoding='utf-8') as f:
json.dump(original_format, f, ensure_ascii=False, indent=4)

print(f"Original format translated data saved to {output_path}")

if original_format:
print("\nSample of translated data in original format:")
print(json.dumps(original_format[0], ensure_ascii=False, indent=2))


def process_file(input_path: str, output_dir: str, target_lang: str):
os.makedirs(output_dir, exist_ok=True)

parser = PerkeyKeyphrasesParser(
file_path=input_path,
output_path=output_dir,
target_lang=target_lang
)
parser.read()
parser.convert()

try:
parser.save  # NOTE: `save` appears to be exposed as a property on DataParser in this repo, so bare attribute access triggers the save step
parser.post_process()
except Exception as e:
print(f"Error during saving or post-processing: {e}")
import traceback
traceback.print_exc()

if __name__ == '__main__':
# Process file - translating from Persian (fa) to English (en) - github.com/edoost/perkey/
# process_file("./data/data.dev.json", "./data", target_lang="en")
process_file("./data/data.test.json", "./data", target_lang="en")
process_file("./data/data.train.json", "./data", target_lang="en")
# process_file("./data/datasmall.dev.json", "./data", target_lang="en")


print("\nTranslation completed. Check the output files in the ./data directory.")
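For a quick end-to-end check before translating the full splits, a one-record input can be pushed through the same pipeline. This is a hypothetical smoke test, not part of the PR: it reuses process_file() from above, and the record values are invented placeholders.

import json
import os

# Write a single Perkey-style record (invented values), then run the parser on it.
sample = [{
    "title": "نمونه",
    "category": "اخبار",
    "summary": "خلاصه کوتاه",
    "body": "متن خبر",
    "keyphrases": ["کلیدواژه"],
    "url": "https://example.com/article"
}]
os.makedirs("./data", exist_ok=True)
with open("./data/data.smoke.json", "w", encoding="utf-8") as f:
    json.dump(sample, f, ensure_ascii=False)

process_file("./data/data.smoke.json", "./data", target_lang="en")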
11 changes: 11 additions & 0 deletions examples/Perkey-Keyphrases/structure.txt
@@ -0,0 +1,11 @@
[
{
"title": String,
"category": String,
"summary": String,
"body": String,
"keyphrases": Array of String,
"url": String
},
...
]
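structure.txt documents only the shape of the data, so a small standalone check can confirm a file matches it before parsing. A minimal sketch, standard library only, with the field set and types taken from the schema above:

import json

# Expected field types, per structure.txt.
EXPECTED = {"title": str, "category": str, "summary": str,
            "body": str, "keyphrases": list, "url": str}

def matches_schema(record: dict) -> bool:
    return all(isinstance(record.get(key), type_) for key, type_ in EXPECTED.items())

with open("./data/data.test.json", encoding="utf-8") as f:  # same path PerkeyParser.py reads
    data = json.load(f)
print(sum(matches_schema(r) for r in data), "of", len(data), "records match the schema")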
2 changes: 1 addition & 1 deletion install.sh
@@ -14,7 +14,7 @@ if [ -z "$CEREBRAS_API_KEY" ]; then
echo "CEREBRAS_API_KEY environment variable is not set. Please set it to your project's CEREBRAS API key. to use the CEREBRAS provider."
fi

pip install httpx==1.0.0.beta0 --force-reinstall # Must be last to avoid conflicts with other dependencies
pip install httpx==0.28.1 --force-reinstall # Must be last to avoid conflicts with other dependencies

python string_ops/build.py

85 changes: 78 additions & 7 deletions providers/google_provider.py
@@ -1,8 +1,14 @@
import sys
import asyncio
from typing import Union, List
sys.path.insert(0, r'/')
from googletrans import Translator
from googletrans.models import Translated
import platform

if platform.system() == "Windows":
print("Setting WindowsSelectorEventLoopPolicy for asyncio on Windows")
asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())

try:
from .base_provider import Provider
@@ -14,12 +20,20 @@
class GoogleProvider(Provider):
def __init__(self):
self.translator = Translator()
# Store a shared event loop for reuse
try:
self._loop = asyncio.get_event_loop()
if self._loop.is_closed():
self._loop = asyncio.new_event_loop()
asyncio.set_event_loop(self._loop)
except RuntimeError:
self._loop = asyncio.new_event_loop()
asyncio.set_event_loop(self._loop)

def extract_texts(self, obj: Union[Translated, List[Translated]]) -> Union[str, List[str]]:
def extract_texts(self, obj: Union[Translated, List[Translated], None]) -> Union[str, List[str]]:
"""
Extract .text attribute from Translated object
"""

if isinstance(obj, list):
return [self.extract_texts(item) for item in obj]
else:
@@ -47,19 +61,76 @@ def _do_translate(self, input_data: Union[str, List[str]],

Return type: list (when a list is passed) else Translated object
"""

data_type = "list" if isinstance(input_data, list) else "str"

# Skip translation if source and destination languages are the same
if src == dest:
print(f"Warning: Source language ({src}) and destination language ({dest}) are the same. Skipping translation.")
return input_data

try:
return self.extract_texts(self.translator.translate(input_data, src=src, dest=dest))
# Run the translation in the shared event loop
if data_type == "list":
async def retry_translate(text, max_attempts=2):
for attempt in range(max_attempts):
try:
return await self.translator.translate(text, src=src, dest=dest)
except Exception as e:
if attempt < max_attempts - 1:
delay = 0.5 * (2 ** attempt) # Exponential backoff: 0.5s, 1s, 2s, ...
print(f"Translation attempt {attempt+1} failed, retrying in {delay}s: {e}")
await asyncio.sleep(delay)
else:
print(f"All {max_attempts} translation attempts failed: {e}")
return None

async def translate_list():
results = []
for text in input_data:
result = await retry_translate(text)
results.append(result)
return results

if self._loop.is_running():
future = asyncio.run_coroutine_threadsafe(translate_list(), self._loop)
translated = future.result(30) # 30 second timeout
else:
translated = self._loop.run_until_complete(translate_list())

# Filter out None values and extract text
valid_results = [item for item in translated if item is not None]
if not valid_results:
return [fail_translation_code] * len(input_data)

return self.extract_texts(valid_results)
else:
async def translate_single():
return await self.translator.translate(input_data, src=src, dest=dest)

if self._loop.is_running():
future = asyncio.run_coroutine_threadsafe(translate_single(), self._loop)
translated = future.result(30) # 30 second timeout
else:
translated = self._loop.run_until_complete(translate_single())

return self.extract_texts(translated)
# TypeError likely due to gender-specific translation, which has no fix yet. Please refer to
# ssut/py-googletrans#260 for more info
except TypeError:
if data_type == "list": return [fail_translation_code, fail_translation_code]
except TypeError as e:
# print(f"Translation TypeError: {e}")
if data_type == "list":
return [fail_translation_code] * len(input_data)
return fail_translation_code

except Exception as e:
# print(f"Translation error: {e}")
if data_type == "list":
return [fail_translation_code] * len(input_data)
return fail_translation_code


if __name__ == '__main__':
test = GoogleProvider()
print(test.translate(["Hello", "How are you today ?"], src="en", dest="vi"))
print(test.translate("Hello", src="en", dest="vi"))
print(test.translate("Hello", src="en", dest="vi"))
2 changes: 1 addition & 1 deletion requirements.txt
@@ -1,4 +1,4 @@
googletrans==3.1.0a0
googletrans==4.0.2
translators
datasets
tqdm==4.66.1