From 801f804bed50a379f4f82a73c161504cc2a2f9cb Mon Sep 17 00:00:00 2001 From: "M. Fazri Nizar" Date: Sat, 6 Sep 2025 21:18:10 +0700 Subject: [PATCH 1/5] chore: update old googletrans dependency since it contradicts with httpx version required by other deps (groq) --- install.sh | 2 +- requirements.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/install.sh b/install.sh index 6202ef8..2b5beb8 100644 --- a/install.sh +++ b/install.sh @@ -14,7 +14,7 @@ if [ -z "$CEREBRAS_API_KEY" ]; then echo "CEREBRAS_API_KEY environment variable is not set. Please set it to your project's CEREBRAS API key. to use the CEREBRAS provider." fi -pip install httpx==1.0.0.beta0 --force-reinstall # Must be last to avoid conflicts with other dependencies +pip install httpx==0.28.1 --force-reinstall # Must be last to avoid conflicts with other dependencies python string_ops/build.py diff --git a/requirements.txt b/requirements.txt index 1fe763a..cd1612e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -googletrans==3.1.0a0 +googletrans==4.0.2 translators datasets tqdm==4.66.1 From ae3d062dc250266a6ff0c78b6b4fa5a924a1a275 Mon Sep 17 00:00:00 2001 From: "M. 
Fazri Nizar" Date: Sat, 6 Sep 2025 21:41:59 +0700 Subject: [PATCH 2/5] refactor: migrate GoogleProvider to googletrans 4.0.2 with async support --- providers/google_provider.py | 76 ++++++++++++++++++++++++++++++++---- 1 file changed, 69 insertions(+), 7 deletions(-) diff --git a/providers/google_provider.py b/providers/google_provider.py index 01946b7..02fb880 100644 --- a/providers/google_provider.py +++ b/providers/google_provider.py @@ -1,8 +1,14 @@ import sys +import asyncio from typing import Union, List sys.path.insert(0, r'/') from googletrans import Translator from googletrans.models import Translated +import platform + +if platform.system() == "Windows": + print("Setting WindowsSelectorEventLoopPolicy for asyncio on Windows") + asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy()) try: from .base_provider import Provider @@ -14,12 +20,20 @@ class GoogleProvider(Provider): def __init__(self): self.translator = Translator() + # Store a shared event loop for reuse + try: + self._loop = asyncio.get_event_loop() + if self._loop.is_closed(): + self._loop = asyncio.new_event_loop() + asyncio.set_event_loop(self._loop) + except RuntimeError: + self._loop = asyncio.new_event_loop() + asyncio.set_event_loop(self._loop) - def extract_texts(self, obj: Union[Translated, List[Translated]]) -> Union[str, List[str]]: + def extract_texts(self, obj: Union[Translated, List[Translated], None]) -> Union[str, List[str]]: """ Extract .text attribute from Translated object """ - if isinstance(obj, list): return [self.extract_texts(item) for item in obj] else: @@ -47,19 +61,67 @@ def _do_translate(self, input_data: Union[str, List[str]], Return type: list (when a list is passed) else Translated object """ - + data_type = "list" if isinstance(input_data, list) else "str" + + # Skip translation if source and destination languages are the same + if src == dest: + print(f"Warning: Source language ({src}) and destination language ({dest}) are the same. 
Skipping translation.") + return input_data try: - return self.extract_texts(self.translator.translate(input_data, src=src, dest=dest)) + # Run the translation in the shared event loop + if data_type == "list": + async def translate_list(): + results = [] + for text in input_data: + try: + result = await self.translator.translate(text, src=src, dest=dest) + results.append(result) + except Exception as e: + print(f"Error translating list item: {e}") + results.append(None) + return results + + if self._loop.is_running(): + future = asyncio.run_coroutine_threadsafe(translate_list(), self._loop) + translated = future.result(30) # 30 second timeout + else: + translated = self._loop.run_until_complete(translate_list()) + + # Filter out None values and extract text + valid_results = [item for item in translated if item is not None] + if not valid_results: + return [fail_translation_code] * len(input_data) + + return self.extract_texts(valid_results) + else: + async def translate_single(): + return await self.translator.translate(input_data, src=src, dest=dest) + + if self._loop.is_running(): + future = asyncio.run_coroutine_threadsafe(translate_single(), self._loop) + translated = future.result(30) # 30 second timeout + else: + translated = self._loop.run_until_complete(translate_single()) + + return self.extract_texts(translated) # TypeError likely due to gender-specific translation, which has no fix yet. 
Please refer to # ssut/py-googletrans#260 for more info - except TypeError: - if data_type == "list": return [fail_translation_code, fail_translation_code] + except TypeError as e: + print(f"Translation TypeError: {e}") + if data_type == "list": + return [fail_translation_code] * len(input_data) + return fail_translation_code + + except Exception as e: + print(f"Translation error: {e}") + if data_type == "list": + return [fail_translation_code] * len(input_data) return fail_translation_code if __name__ == '__main__': test = GoogleProvider() print(test.translate(["Hello", "How are you today ?"], src="en", dest="vi")) - print(test.translate("Hello", src="en", dest="vi")) + print(test.translate("Hello", src="en", dest="vi")) \ No newline at end of file From 07d7d64eb97c666526f7bbf45d4dfb3a4a767425 Mon Sep 17 00:00:00 2001 From: "M. Fazri Nizar" Date: Sat, 6 Sep 2025 21:49:04 +0700 Subject: [PATCH 3/5] feat: add Perkey-Keyphrases dataset translation parser example --- examples/Perkey-Keyphrases/PerkeyParser.py | 136 +++++++++++++++++++++ examples/Perkey-Keyphrases/structure.txt | 11 ++ 2 files changed, 147 insertions(+) create mode 100644 examples/Perkey-Keyphrases/PerkeyParser.py create mode 100644 examples/Perkey-Keyphrases/structure.txt diff --git a/examples/Perkey-Keyphrases/PerkeyParser.py b/examples/Perkey-Keyphrases/PerkeyParser.py new file mode 100644 index 0000000..133edff --- /dev/null +++ b/examples/Perkey-Keyphrases/PerkeyParser.py @@ -0,0 +1,136 @@ +import json +import os +import sys +sys.path.insert(0, r'./') +from tqdm.auto import tqdm +from dataclasses import dataclass, field +import time + +from configs import Config +from translator import DataParser, VerboseCallback + +PARSER_NAME = "PerkeyKeyphrases" + +@dataclass +class PerkeyConfig(Config): + qas_id: str = None + title: str = None + summary: str = None + body: str = None + keyphrases: list = field(default_factory=list) + category: str = None + url: str = None + + @classmethod + def get_keys(cls): 
+ return ['qas_id', 'title', 'summary', 'body', 'keyphrases', 'category', 'url'] + + +class PerkeyKeyphrasesParser(DataParser): + def __init__(self, file_path: str, output_path: str, target_lang: str="en", + max_example_per_thread=1000, large_chunks_threshold=100000): + super().__init__(file_path, output_path, + parser_name=PARSER_NAME, + target_config=PerkeyConfig, + target_fields=['title', 'summary', 'body', 'keyphrases', 'category'], + do_translate=True, + no_translated_code=False, + verbose=False, + source_lang="fa", + target_lang=target_lang, + max_example_per_thread=max_example_per_thread, + large_chunks_threshold=large_chunks_threshold, + parser_callbacks=[VerboseCallback]) + + def read(self) -> None: + super(PerkeyKeyphrasesParser, self).read() + + with open(self.file_path, 'r', encoding='utf-8') as f: + self.data_read = json.load(f) + + print(f"Read {len(self.data_read)} records from {self.file_path}") + return None + + def convert(self) -> None: + super(PerkeyKeyphrasesParser, self).convert() + + data_converted = [] + for data in tqdm(self.data_read, desc="Converting data"): + data_dict = {} + + data_dict['qas_id'] = self.id_generator() + + data_dict['title'] = data.get('title', '') + data_dict['summary'] = data.get('summary', '') + data_dict['body'] = data.get('body', '') + data_dict['keyphrases'] = data.get('keyphrases', []) + data_dict['category'] = data.get('category', '') + + data_dict['url'] = data.get('url', '') + + data_converted.append(data_dict) + + self.converted_data = data_converted + print(f"Converted {len(self.converted_data)} records") + + return None + + def post_process(self): + """Convert the translated data back to the original dataset structure""" + if not hasattr(self, 'converted_data_translated') or not self.converted_data_translated: + print("No translated data available to save in original format") + return + + original_format = [] + for item in self.converted_data_translated: + entry = { + "title": item.get('title', ''), + 
            "category": item.get('category', ''), + "keyphrases": item.get('keyphrases', []), + "body": item.get('body', ''), + "summary": item.get('summary', ''), + "url": item.get('url', '') + } + original_format.append(entry) + + time_now = time.strftime("%Y%m%d-%H%M%S") + + output_path = os.path.join(self.output_dir, f'{self.parser_name}_translated_original_format_{time_now}.json') + with open(output_path, 'w', encoding='utf-8') as f: + json.dump(original_format, f, ensure_ascii=False, indent=4) + + print(f"Original format translated data saved to {output_path}") + + if original_format: + print("\nSample of translated data in original format:") + print(json.dumps(original_format[0], ensure_ascii=False, indent=2)) + + +def process_file(input_path: str, output_dir: str, target_lang: str): + os.makedirs(output_dir, exist_ok=True) + + parser = PerkeyKeyphrasesParser( + file_path=input_path, + output_path=output_dir, + target_lang=target_lang + ) + parser.read() + parser.convert() + + try: + parser.save() + parser.post_process() + except Exception as e: + print(f"Error during saving or post-processing: {e}") + import traceback + traceback.print_exc() + +if __name__ == '__main__': + # Process file - translating from Persian (fa) to English (en) - github.com/edoost/perkey/ + # process_file("./data/data.dev.json", "./data", target_lang="en") + process_file("./data/data.test.json", "./data", target_lang="en") + process_file("./data/data.train.json", "./data", target_lang="en") + # process_file("./data/datasmall.dev.json", "./data", target_lang="en") + + + print("\nTranslation completed. 
Check the output files in the ./data directory.") \ No newline at end of file diff --git a/examples/Perkey-Keyphrases/structure.txt b/examples/Perkey-Keyphrases/structure.txt new file mode 100644 index 0000000..5071292 --- /dev/null +++ b/examples/Perkey-Keyphrases/structure.txt @@ -0,0 +1,11 @@ +[ + { + "title": String, + "category": String, + "summary": String, + "body": String, + "keyphrases": Array of String, + "url": String + }, + ... +] \ No newline at end of file From 36c8f60361c23d96f5d620d7e2c1b98ae222d310 Mon Sep 17 00:00:00 2001 From: "M. Fazri Nizar" Date: Sat, 6 Sep 2025 22:08:11 +0700 Subject: [PATCH 4/5] refactor: comments out translation error --- providers/google_provider.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/providers/google_provider.py b/providers/google_provider.py index 02fb880..0632a55 100644 --- a/providers/google_provider.py +++ b/providers/google_provider.py @@ -109,13 +109,13 @@ async def translate_single(): # TypeError likely due to gender-specific translation, which has no fix yet. Please refer to # ssut/py-googletrans#260 for more info except TypeError as e: - print(f"Translation TypeError: {e}") + # print(f"Translation TypeError: {e}") if data_type == "list": return [fail_translation_code] * len(input_data) return fail_translation_code except Exception as e: - print(f"Translation error: {e}") + # print(f"Translation error: {e}") if data_type == "list": return [fail_translation_code] * len(input_data) return fail_translation_code From 6d94cf5daacd7f4a442879776fa0cee004ecaa49 Mon Sep 17 00:00:00 2001 From: "M. 
Fazri Nizar" Date: Sun, 7 Sep 2025 23:21:35 +0700 Subject: [PATCH 5/5] feat: add retry mechanism for failed translation attempts in GoogleProvider --- providers/google_provider.py | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/providers/google_provider.py b/providers/google_provider.py index 0632a55..8832230 100644 --- a/providers/google_provider.py +++ b/providers/google_provider.py @@ -72,15 +72,24 @@ def _do_translate(self, input_data: Union[str, List[str]], try: # Run the translation in the shared event loop if data_type == "list": + async def retry_translate(text, max_attempts=2): + for attempt in range(max_attempts): + try: + return await self.translator.translate(text, src=src, dest=dest) + except Exception as e: + if attempt < max_attempts - 1: + delay = 0.5 * (2 ** attempt) # Exponential backoff: 0.5s, 1s, 2s, ... + print(f"Translation attempt {attempt+1} failed, retrying in {delay}s: {e}") + await asyncio.sleep(delay) + else: + print(f"All {max_attempts} translation attempts failed: {e}") + return None + async def translate_list(): results = [] for text in input_data: - try: - result = await self.translator.translate(text, src=src, dest=dest) - results.append(result) - except Exception as e: - print(f"Error translating list item: {e}") - results.append(None) + result = await retry_translate(text) + results.append(result) return results if self._loop.is_running():