136 changes: 136 additions & 0 deletions examples/Perkey-Keyphrases/PerkeyParser.py
@@ -0,0 +1,136 @@
import json
import os
import sys
sys.path.insert(0, r'./')
from tqdm.auto import tqdm
from dataclasses import dataclass, field
import time

from configs import Config
from translator import DataParser, VerboseCallback

PARSER_NAME = "PerkeyKeyphrases"

@dataclass
class PerkeyConfig(Config):
qas_id: str = None
title: str = None
summary: str = None
body: str = None
keyphrases: list = field(default_factory=list)
category: str = None
url: str = None

@classmethod
def get_keys(cls):
return ['qas_id', 'title', 'summary', 'body', 'keyphrases', 'category', 'url']


class PerkeyKeyphrasesParser(DataParser):
def __init__(self, file_path: str, output_path: str, target_lang: str="en",
max_example_per_thread=1000, large_chunks_threshold=100000):
super().__init__(file_path, output_path,
parser_name=PARSER_NAME,
target_config=PerkeyConfig,
target_fields=['title', 'summary', 'body', 'keyphrases', 'category'],
do_translate=True,
no_translated_code=False,
verbose=False,
source_lang="fa",
target_lang=target_lang,
max_example_per_thread=max_example_per_thread,
large_chunks_threshold=large_chunks_threshold,
parser_callbacks=[VerboseCallback])

def read(self) -> None:
super(PerkeyKeyphrasesParser, self).read()

with open(self.file_path, 'r', encoding='utf-8') as f:
self.data_read = json.load(f)

print(f"Read {len(self.data_read)} records from {self.file_path}")
return None

def convert(self) -> None:
super(PerkeyKeyphrasesParser, self).convert()

data_converted = []
for data in tqdm(self.data_read, desc="Converting data"):
data_dict = {}

data_dict['qas_id'] = self.id_generator()

data_dict['title'] = data.get('title', '')
data_dict['summary'] = data.get('summary', '')
data_dict['body'] = data.get('body', '')
data_dict['keyphrases'] = data.get('keyphrases', [])
data_dict['category'] = data.get('category', '')

data_dict['url'] = data.get('url', '')

data_converted.append(data_dict)

self.converted_data = data_converted
print(f"Converted {len(self.converted_data)} records")

return None

def post_process(self):
"""Convert the translated data back to the original dataset structure"""
if not hasattr(self, 'converted_data_translated') or not self.converted_data_translated:
print("No translated data available to save in original format")
return

original_format = []
for item in self.converted_data_translated:
entry = {
"title": item.get('title', ''),
"category": item.get('category', ''),
"keyphrases": item.get('keyphrases', []),
"body": item.get('body', ''),
"summary": item.get('summary', ''),
"url": item.get('url', '')
}
original_format.append(entry)

time_now = time.strftime("%Y%m%d-%H%M%S")

output_path = os.path.join(self.output_dir, f'{self.parser_name}_translated_original_format_{time_now}.json')
with open(output_path, 'w', encoding='utf-8') as f:
json.dump(original_format, f, ensure_ascii=False, indent=4)

print(f"Original format translated data saved to {output_path}")

if original_format:
print("\nSample of translated data in original format:")
print(json.dumps(original_format[0], ensure_ascii=False, indent=2))


def process_file(input_path: str, output_dir: str, target_lang: str):
os.makedirs(output_dir, exist_ok=True)

parser = PerkeyKeyphrasesParser(
file_path=input_path,
output_path=output_dir,
target_lang=target_lang
)
parser.read()
parser.convert()

try:
parser.save  # NOTE: `save` appears to be exposed as a property on DataParser in this repo, so bare attribute access triggers the save step
parser.post_process()
except Exception as e:
print(f"Error during saving or post-processing: {e}")
import traceback
traceback.print_exc()

if __name__ == '__main__':
# Process file - translating from Persian (fa) to English (en) - github.com/edoost/perkey/
# process_file("./data/data.dev.json", "./data", target_lang="en")
process_file("./data/data.test.json", "./data", target_lang="en")
process_file("./data/data.train.json", "./data", target_lang="en")
# process_file("./data/datasmall.dev.json", "./data", target_lang="en")


print("\nTranslation completed. Check the output files in the ./data directory.")
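For a quick end-to-end check before translating the full splits, a one-record input can be pushed through the same pipeline. This is a hypothetical smoke test, not part of the PR: it reuses process_file() from above, and the record values are invented placeholders.

import json
import os

# Write a single Perkey-style record (invented values), then run the parser on it.
sample = [{
    "title": "نمونه",
    "category": "اخبار",
    "summary": "خلاصه کوتاه",
    "body": "متن خبر",
    "keyphrases": ["کلیدواژه"],
    "url": "https://example.com/article"
}]
os.makedirs("./data", exist_ok=True)
with open("./data/data.smoke.json", "w", encoding="utf-8") as f:
    json.dump(sample, f, ensure_ascii=False)

process_file("./data/data.smoke.json", "./data", target_lang="en")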
11 changes: 11 additions & 0 deletions examples/Perkey-Keyphrases/structure.txt
@@ -0,0 +1,11 @@
[
{
"title": String,
"category": String,
"summary": String,
"body": String,
"keyphrases": Array of String,
"url": String
},
...
]
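structure.txt documents only the shape of the data, so a small standalone check can confirm a file matches it before parsing. A minimal sketch, standard library only, with the field set and types taken from the schema above:

import json

# Expected field types, per structure.txt.
EXPECTED = {"title": str, "category": str, "summary": str,
            "body": str, "keyphrases": list, "url": str}

def matches_schema(record: dict) -> bool:
    return all(isinstance(record.get(key), type_) for key, type_ in EXPECTED.items())

with open("./data/data.test.json", encoding="utf-8") as f:  # same path PerkeyParser.py reads
    data = json.load(f)
print(sum(matches_schema(r) for r in data), "of", len(data), "records match the schema")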
2 changes: 1 addition & 1 deletion install.sh
@@ -14,7 +14,7 @@ if [ -z "$CEREBRAS_API_KEY" ]; then
echo "CEREBRAS_API_KEY environment variable is not set. Please set it to your project's CEREBRAS API key. to use the CEREBRAS provider."
fi

pip install httpx==1.0.0.beta0 --force-reinstall # Must be last to avoid conflicts with other dependencies
pip install httpx==0.28.1 --force-reinstall # Must be last to avoid conflicts with other dependencies

python string_ops/build.py

85 changes: 78 additions & 7 deletions providers/google_provider.py
@@ -1,8 +1,14 @@
import sys
import asyncio
from typing import Union, List
sys.path.insert(0, r'/')
from googletrans import Translator
from googletrans.models import Translated
import platform

if platform.system() == "Windows":
print("Setting WindowsSelectorEventLoopPolicy for asyncio on Windows")
asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())

try:
from .base_provider import Provider
@@ -14,12 +20,20 @@
class GoogleProvider(Provider):
def __init__(self):
self.translator = Translator()
# Store a shared event loop for reuse
try:
self._loop = asyncio.get_event_loop()
if self._loop.is_closed():
self._loop = asyncio.new_event_loop()
asyncio.set_event_loop(self._loop)
except RuntimeError:
self._loop = asyncio.new_event_loop()
asyncio.set_event_loop(self._loop)

def extract_texts(self, obj: Union[Translated, List[Translated]]) -> Union[str, List[str]]:
def extract_texts(self, obj: Union[Translated, List[Translated], None]) -> Union[str, List[str]]:
"""
Extract .text attribute from Translated object
"""

if isinstance(obj, list):
return [self.extract_texts(item) for item in obj]
else:
@@ -47,19 +61,76 @@ def _do_translate(self, input_data: Union[str, List[str]],

Return type: list (when a list is passed) else Translated object
"""

data_type = "list" if isinstance(input_data, list) else "str"

# Skip translation if source and destination languages are the same
if src == dest:
print(f"Warning: Source language ({src}) and destination language ({dest}) are the same. Skipping translation.")
return input_data

try:
return self.extract_texts(self.translator.translate(input_data, src=src, dest=dest))
# Run the translation in the shared event loop
if data_type == "list":
async def retry_translate(text, max_attempts=2):
for attempt in range(max_attempts):
try:
return await self.translator.translate(text, src=src, dest=dest)
except Exception as e:
if attempt < max_attempts - 1:
delay = 0.5 * (2 ** attempt) # Exponential backoff: 0.5s, 1s, 2s, ...
print(f"Translation attempt {attempt+1} failed, retrying in {delay}s: {e}")
await asyncio.sleep(delay)
else:
print(f"All {max_attempts} translation attempts failed: {e}")
return None

async def translate_list():
results = []
for text in input_data:
result = await retry_translate(text)
results.append(result)
return results

if self._loop.is_running():
future = asyncio.run_coroutine_threadsafe(translate_list(), self._loop)
translated = future.result(30) # 30 second timeout
else:
translated = self._loop.run_until_complete(translate_list())

# Filter out None values and extract text
valid_results = [item for item in translated if item is not None]
if not valid_results:
return [fail_translation_code] * len(input_data)

return self.extract_texts(valid_results)
else:
async def translate_single():
return await self.translator.translate(input_data, src=src, dest=dest)

if self._loop.is_running():
future = asyncio.run_coroutine_threadsafe(translate_single(), self._loop)
translated = future.result(30) # 30 second timeout
else:
translated = self._loop.run_until_complete(translate_single())

return self.extract_texts(translated)
# TypeError likely due to gender-specific translation, which has no fix yet. Please refer to
# ssut/py-googletrans#260 for more info
except TypeError:
if data_type == "list": return [fail_translation_code, fail_translation_code]
except TypeError as e:
# print(f"Translation TypeError: {e}")
if data_type == "list":
return [fail_translation_code] * len(input_data)
return fail_translation_code

except Exception as e:
# print(f"Translation error: {e}")
if data_type == "list":
return [fail_translation_code] * len(input_data)
return fail_translation_code


if __name__ == '__main__':
test = GoogleProvider()
print(test.translate(["Hello", "How are you today ?"], src="en", dest="vi"))
print(test.translate("Hello", src="en", dest="vi"))
print(test.translate("Hello", src="en", dest="vi"))
2 changes: 1 addition & 1 deletion requirements.txt
@@ -1,4 +1,4 @@
googletrans==3.1.0a0
googletrans==4.0.2
translators
datasets
tqdm==4.66.1