Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@


class Command(BaseCommand):
help = "Refines machine translations using OpenAI's GPT-4 API with specified characteristics"
help = "Refines machine translations using OpenAI's GPT API with specified characteristics"

def add_arguments(self, parser):
parser.add_argument(
Expand Down
188 changes: 127 additions & 61 deletions pontoon/machinery/openai_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
from django.conf import settings
from django.core.cache import cache

from pontoon.base.models import Locale
from pontoon.machinery.utils import (
get_machinery_service_cache_key,
set_machinery_service_cache_key,
Expand All @@ -19,90 +18,157 @@ def __init__(self):
self.client = OpenAI()

def get_translation(
self, english_text, translated_text, characteristic, target_language_name
self,
english_text,
translated_text,
characteristic,
locale,
entity_id=None,
entity_comment=None,
group_comment=None,
resource_comment=None,
pinned_comments=None,
terms=None,
):
terms_cache_key = (
str(sorted((t.get("text", "") for t in terms))) if terms else ""
)
pinned_comments_cache_key = (
str(sorted(pinned_comments)) if pinned_comments else ""
)
cache_key = get_machinery_service_cache_key(
"openai_chatgpt",
english_text,
translated_text,
characteristic,
target_language_name,
locale.code,
entity_id or "",
entity_comment or "",
group_comment or "",
resource_comment or "",
pinned_comments_cache_key,
terms_cache_key,
)
cached = cache.get(cache_key)
if cached is not None:
return cached

try:
target_language = Locale.objects.get(name=target_language_name)
except Locale.DoesNotExist:
raise ValueError(
f"The target language '{target_language_name}' is not supported."
style_goals = {
"informal": f"Use simple, everyday {locale.name} ({locale.code}) — avoid jargon, technical terms, and formal constructions.",
"formal": f"Use formal {locale.name} ({locale.code}) throughout; maintain a consistent register and do not mix formal and informal modes.",
"rephrased": f"Provide an alternative wording that preserves the original meaning; adapt idioms and culturally marked expressions for {locale.name} ({locale.code}); you may restructure sentences but must not introduce new information or omit essential meaning.",
}

style_goal = style_goals.get(characteristic)
if style_goal is None:
raise ValueError(f"Unrecognized characteristic: '{characteristic}'")

# Separate the instruction from the data.
# It makes it hard for injected text to masquerade as instructions.
context_parts = []
if entity_id:
context_parts.append(f"STRING ID:\n{entity_id}")
if resource_comment:
context_parts.append(f"RESOURCE COMMENT:\n{resource_comment}")
if group_comment:
context_parts.append(f"GROUP COMMENT:\n{group_comment}")
if entity_comment:
context_parts.append(f"STRING COMMENT:\n{entity_comment}")
if pinned_comments:
pinned_block = "\n".join(f"- {c}" for c in pinned_comments)
context_parts.append(f"PINNED COMMENTS:\n{pinned_block}")
if terms:
term_lines = []
for term in terms:
text = term.get("text", "")
pos = term.get("part_of_speech", "")
translation = term.get("translation", "")
parts = [f'"{text}"']
if pos:
parts.append(f"({pos})")
if translation:
parts.append(f'→ "{translation}"')
term_lines.append(" ".join(parts))
terms_block = "\n".join(f"- {line}" for line in term_lines)
context_parts.append(
f"TERMINOLOGY:\nThese are terminology matches in the source text that you should consider:\n{terms_block}"
)
context_parts.append(f"ENGLISH SOURCE:\n{english_text}")
context_parts.append(f"MACHINE TRANSLATION (for reference):\n{translated_text}")
user_prompt = "\n\n".join(context_parts)

intro_text = f"Refine the {target_language} machine translation below to make it {characteristic}."

common_rules = textwrap.dedent(
"""Follow these rules IN ORDER OF PRIORITY:
1) ENDING PUNCTUATION — SEMANTICS, NOT LITERAL CHAR:
- Detect the English ending: none, ".", "?", "!", "…".
- The translation MUST express the same ending SEMANTIC:
• if English ends with "?" → translation ends with a question.
• if English ends with "!" → translation ends with an exclamation.
• if English ends with "…" → translation ends with an ellipsis.
• if English has NO closing punctuation → translation MUST NOT end with ".", "?", "!", or "…".
- Do not add a final period if the English has none.
- Respect orthographic and typographic rules of the target language regarding punctuation, like using non-breaking spaces in French, adding opening "¿" or "¡" in Spanish, etc.
2) Preserve all HTML tags and their order. Do not add, remove, or reorder tags."""
)
system_header = textwrap.dedent(
f"""\
You are an expert {locale.name} ({locale.code}) localization specialist.

informal = textwrap.dedent(
f"""{intro_text}
Revise the {target_language} translation to use simpler language.
{common_rules}
3) Clarity and Simplicity: keep wording straightforward and consistent.
Output only the revised translation."""
Your task: produce a {characteristic} {locale.name} ({locale.code}) translation of a UI string.
Use the provided machine translation as a reference, but you are not bound by it — rewrite freely to achieve the best result.
"""
)

formal = textwrap.dedent(
f"""{intro_text}
Revise the {target_language} translation to use a higher level of formality.
{common_rules}
3) Consistency: maintain a consistent level of formality throughout; do not mix formal and informal modes.
4) Preserve all HTML tags and their order. Do not add, remove, or reorder tags.
5) Clarity and Precision: keep wording clear and unambiguous while remaining formal.
Output only the revised translation."""
context_instructions = []
if entity_id:
context_instructions.append(
"STRING ID: use it to infer the UI context (e.g., button, menu item, page title, tooltip) and adapt length and phrasing accordingly."
)
if resource_comment:
context_instructions.append(
"RESOURCE COMMENT: general notes about the file — use it as additional context."
)
if group_comment:
context_instructions.append(
"GROUP COMMENT: notes about the group of messages this string belongs to — use it as additional context."
)
if entity_comment:
context_instructions.append(
"STRING COMMENT: treat it as authoritative translator notes — it may specify placeholders to preserve exactly, terms that must not be translated, or other constraints. STRING COMMENT requirements take precedence over all stylistic choices."
)
if pinned_comments:
context_instructions.append(
"PINNED COMMENTS: this is a comment added by a project manager — treat them as high-priority guidance from the localization team."
)
if terms:
context_instructions.append(
"TERMINOLOGY: use the given translations for those terms consistently in your output, unless you believe the existing translation to be incorrect for the context."
)
context_block = (
"\n".join(context_instructions) + "\n\n" if context_instructions else ""
)

rephrased = textwrap.dedent(
f"""{intro_text}
Provide an alternative translation that preserves the original meaning while varying the wording.
{common_rules}
3) Cultural and Idiomatic Fit: adapt idioms and culturally marked expressions appropriately for {target_language}; you may restructure sentences but must not introduce new information or omit essential meaning.
4) Clarity and Naturalness: ensure the result reads naturally and is easy to understand.
Output only the alternative translation."""
system_rules = textwrap.dedent(
f"""\
Your goal is to produce a natural, grammatically correct translation. Follow these rules strictly; if rules conflict, earlier rules take priority.
1) ENDING PUNCTUATION — PRESERVE SEMANTICS
- Determine the ending punctuation of the English source text (ignore trailing spaces and HTML tags).
- The translation MUST end with the equivalent punctuation. Both directions are hard constraints:
• English ends with "." → translation MUST end with "." (or target-language equivalent)
• English ends with "?" → translation MUST end with a question mark
• English ends with "!" → translation MUST end with an exclamation mark
• English ends with "…" → translation MUST end with an ellipsis
• English has NO ending punctuation → translation MUST NOT end with ".", "?", "!", or "…"
- Apply correct punctuation conventions for the target language (e.g. Spanish "¿ ¡", French non-breaking space before "?", "!", ":").
2) HTML TAGS
- Preserve all HTML tags exactly as in the source:
- Do not add, remove, reorder, or modify tags or attributes
- Translate only the text content between tags, or the attributes if they contain translatable text (e.g., "alt", "title").
- Keep punctuation placement consistent with the source structure (do not move punctuation across tag boundaries unless required by the target language grammar).
3) {style_goal}

Output only the translation, with no explanation."""
)

system_messages = {
"informal": informal,
"formal": formal,
"rephrased": rephrased,
}
system_message = system_header + context_block + system_rules

system_message = system_messages.get(characteristic)
if system_message is None:
raise ValueError(f"Unrecognized characteristic: '{characteristic}'")

# Separate the instruction from the data.
# It makes it hard for injected text to masquerade as instructions.
user_prompt = (
f"{intro_text}\n\n"
f"ENGLISH SOURCE:\n{english_text}\n\n"
f"MACHINE TRANSLATION TO REFINE:\n{translated_text}"
)
# TODO: remove before merge.
# Print the full prompt before sending to help with debug.
if settings.DEBUG:
print(
f"[OpenAI] system:\n{system_message}\n\n[OpenAI] user:\n{user_prompt}"
)

# Call the OpenAI API with the constructed prompt
response = self.client.chat.completions.create(
model="gpt-4.1-2025-04-14",
model=settings.OPENAI_MODEL,
messages=[
{"role": "system", "content": system_message},
{"role": "user", "content": user_prompt},
Expand Down
75 changes: 72 additions & 3 deletions pontoon/machinery/tests/test_views.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,10 @@
)
from pontoon.test.factories import (
EntityFactory,
SectionFactory,
TeamCommentFactory,
TermFactory,
TermTranslationFactory,
TranslationFactory,
TranslationMemoryFactory,
)
Expand Down Expand Up @@ -303,19 +307,84 @@ def test_view_gpt_transform_cache(member, locale_a, openai_api_key):
"english_text": "Hello",
"translated_text": "Hola",
"characteristic": "formal",
"locale": locale_a.name,
"locale": locale_a.code,
}

response1 = member.client.get(url, params)
response1 = member.client.post(url, params)
assert MockOpenAI.return_value.chat.completions.create.call_count == 1

# Second identical request should be served from cache
response2 = member.client.get(url, params)
response2 = member.client.post(url, params)
assert MockOpenAI.return_value.chat.completions.create.call_count == 1

assert json.loads(response1.content) == json.loads(response2.content)


@pytest.mark.django_db
def test_view_gpt_transform_context(member, locale_a, openai_api_key):
    """The GPT-transform view forwards every piece of entity context
    (string id, string/group/resource comments, pinned comments, and
    terminology) into the user message sent to the OpenAI API."""
    cache.clear()
    url = reverse("pontoon.gpt_transform")

    # Fake a successful completion so no real API call is made.
    fake_completion = MagicMock()
    fake_completion.choices[0].message.content = "translated"

    # Entity carrying all context kinds: key (string id), string comment,
    # group (section) comment, and resource comment.
    section = SectionFactory(key=["nav"], comment="Navigation section")
    entity = EntityFactory(
        key=["open-browser"],
        string="Open browser",
        comment="Button label",
        resource=section.resource,
        section=section,
    )
    entity.resource.comment = "Main UI file"
    entity.resource.save(update_fields=["comment"])

    # Two pinned team comments for the target locale.
    for comment_text in ("<p>Use formal register</p>", "Keep it short"):
        TeamCommentFactory(
            entity=entity,
            locale=locale_a,
            content=comment_text,
            pinned=True,
        )

    # Terminology entry matching the source string, translated for the locale.
    glossary_term = TermFactory(
        text="browser", part_of_speech="noun", definition="A web browser"
    )
    TermTranslationFactory(term=glossary_term, locale=locale_a, text="navigateur")

    with patch("pontoon.machinery.openai_service.OpenAI") as MockOpenAI:
        MockOpenAI.return_value.chat.completions.create.return_value = (
            fake_completion
        )
        member.client.post(
            url,
            {
                "english_text": "Open browser",
                "translated_text": "Ouvrir le navigateur",
                "characteristic": "formal",
                "locale": locale_a.code,
                "entity_pk": entity.pk,
            },
        )

    # The second message (index 1) is the user prompt; it must contain
    # every context section the view gathered.
    create_call = MockOpenAI.return_value.chat.completions.create.call_args
    user_message = create_call.kwargs["messages"][1]["content"]
    for expected_fragment in (
        "STRING ID:\nopen-browser",
        "STRING COMMENT:\nButton label",
        "GROUP COMMENT:\nNavigation section",
        "RESOURCE COMMENT:\nMain UI file",
        "PINNED COMMENTS:",
        "Use formal register",
        "Keep it short",
        "TERMINOLOGY:",
        '"browser" (noun) → "navigateur"',
    ):
        assert expected_fragment in user_message


@pytest.mark.django_db
def test_view_caighdean(client, entity_a):
gd = Locale.objects.get(code="gd")
Expand Down
Loading
Loading