Skip to content

Commit 5a56d9c

Browse files
committed
Use additional context in LLM prompt, update OpenAI dep
- Update OpenAI package to latest version (2.29.0)
- Improve prompt formulation
- Pass string ID, comments, and terminology matches when available
- Move OpenAI GPT model version to settings
1 parent e15ef86 commit 5a56d9c

15 files changed

Lines changed: 728 additions & 138 deletions

File tree

pontoon/machinery/management/commands/refine_translation.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44

55

66
class Command(BaseCommand):
7-
help = "Refines machine translations using OpenAI's GPT-4 API with specified characteristics"
7+
help = "Refines machine translations using OpenAI's GPT API with specified characteristics"
88

99
def add_arguments(self, parser):
1010
parser.add_argument(

pontoon/machinery/openai_service.py

Lines changed: 128 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -19,14 +19,36 @@ def __init__(self):
1919
self.client = OpenAI()
2020

2121
def get_translation(
22-
self, english_text, translated_text, characteristic, target_language_name
22+
self,
23+
english_text,
24+
translated_text,
25+
characteristic,
26+
target_language_name,
27+
string_id=None,
28+
string_comment=None,
29+
group_comment=None,
30+
resource_comment=None,
31+
pinned_comments=None,
32+
terms=None,
2333
):
34+
terms_cache_key = (
35+
str(sorted((t.get("text", "") for t in terms))) if terms else ""
36+
)
37+
pinned_comments_cache_key = (
38+
str(sorted(pinned_comments)) if pinned_comments else ""
39+
)
2440
cache_key = get_machinery_service_cache_key(
2541
"openai_chatgpt",
2642
english_text,
2743
translated_text,
2844
characteristic,
2945
target_language_name,
46+
string_id or "",
47+
string_comment or "",
48+
group_comment or "",
49+
resource_comment or "",
50+
pinned_comments_cache_key,
51+
terms_cache_key,
3052
)
3153
cached = cache.get(cache_key)
3254
if cached is not None:
@@ -39,70 +61,122 @@ def get_translation(
3961
f"The target language '{target_language_name}' is not supported."
4062
)
4163

42-
intro_text = f"Refine the {target_language} machine translation below to make it {characteristic}."
43-
44-
common_rules = textwrap.dedent(
45-
"""Follow these rules IN ORDER OF PRIORITY:
46-
1) ENDING PUNCTUATION — SEMANTICS, NOT LITERAL CHAR:
47-
- Detect the English ending: none, ".", "?", "!", "…".
48-
- The translation MUST express the same ending SEMANTIC:
49-
• if English ends with "?" → translation ends with a question.
50-
• if English ends with "!" → translation ends with an exclamation.
51-
• if English ends with "…" → translation ends with an ellipsis.
52-
• if English has NO closing punctuation → translation MUST NOT end with ".", "?", "!", or "…".
53-
- Do not add a final period if the English has none.
54-
- Respect orthographic and typographic rules of the target language regarding punctuation, like using non-breaking spaces in French, adding opening "¿" or "¡" in Spanish, etc.
55-
2) Preserve all HTML tags and their order. Do not add, remove, or reorder tags."""
56-
)
64+
style_goals = {
65+
"informal": f"Use simple, everyday {target_language_name} ({target_language.code}) — avoid jargon, technical terms, and formal constructions.",
66+
"formal": f"Use formal {target_language_name} ({target_language.code}) throughout; maintain a consistent register and do not mix formal and informal modes.",
67+
"rephrased": f"Provide an alternative wording that preserves the original meaning; adapt idioms and culturally marked expressions for {target_language_name} ({target_language.code}); you may restructure sentences but must not introduce new information or omit essential meaning.",
68+
}
5769

58-
informal = textwrap.dedent(
59-
f"""{intro_text}
60-
Revise the {target_language} translation to use simpler language.
61-
{common_rules}
62-
3) Clarity and Simplicity: keep wording straightforward and consistent.
63-
Output only the revised translation."""
64-
)
70+
style_goal = style_goals.get(characteristic)
71+
if style_goal is None:
72+
raise ValueError(f"Unrecognized characteristic: '{characteristic}'")
6573

66-
formal = textwrap.dedent(
67-
f"""{intro_text}
68-
Revise the {target_language} translation to use a higher level of formality.
69-
{common_rules}
70-
3) Consistency: maintain a consistent level of formality throughout; do not mix formal and informal modes.
71-
4) Preserve all HTML tags and their order. Do not add, remove, or reorder tags.
72-
5) Clarity and Precision: keep wording clear and unambiguous while remaining formal.
73-
Output only the revised translation."""
74+
# Separate the instruction from the data.
75+
# It makes it hard for injected text to masquerade as instructions.
76+
context_parts = []
77+
if string_id:
78+
context_parts.append(f"STRING ID:\n{string_id}")
79+
if resource_comment:
80+
context_parts.append(f"RESOURCE COMMENT:\n{resource_comment}")
81+
if group_comment:
82+
context_parts.append(f"GROUP COMMENT:\n{group_comment}")
83+
if string_comment:
84+
context_parts.append(f"STRING COMMENT:\n{string_comment}")
85+
if pinned_comments:
86+
pinned_block = "\n".join(f"- {c}" for c in pinned_comments)
87+
context_parts.append(f"PINNED COMMENTS:\n{pinned_block}")
88+
if terms:
89+
term_lines = []
90+
for term in terms:
91+
text = term.get("text", "")
92+
pos = term.get("part_of_speech", "")
93+
translation = term.get("translation", "")
94+
parts = [f'"{text}"']
95+
if pos:
96+
parts.append(f"({pos})")
97+
if translation:
98+
parts.append(f'→ "{translation}"')
99+
term_lines.append(" ".join(parts))
100+
terms_block = "\n".join(f"- {line}" for line in term_lines)
101+
context_parts.append(
102+
f"TERMINOLOGY:\nThese are terminology matches in the source text that you should consider:\n{terms_block}"
103+
)
104+
context_parts.append(f"ENGLISH SOURCE:\n{english_text}")
105+
context_parts.append(f"MACHINE TRANSLATION (for reference):\n{translated_text}")
106+
user_prompt = "\n\n".join(context_parts)
107+
108+
system_header = textwrap.dedent(
109+
f"""\
110+
You are an expert {target_language_name} ({target_language.code}) localization specialist.
111+
112+
Your task: produce a {characteristic} {target_language_name} ({target_language.code}) translation of a UI string.
113+
Use the provided machine translation as a reference, but you are not bound by it — rewrite freely to achieve the best result.
114+
"""
74115
)
75116

76-
rephrased = textwrap.dedent(
77-
f"""{intro_text}
78-
Provide an alternative translation that preserves the original meaning while varying the wording.
79-
{common_rules}
80-
3) Cultural and Idiomatic Fit: adapt idioms and culturally marked expressions appropriately for {target_language}; you may restructure sentences but must not introduce new information or omit essential meaning.
81-
4) Clarity and Naturalness: ensure the result reads naturally and is easy to understand.
82-
Output only the alternative translation."""
117+
context_instructions = []
118+
if string_id:
119+
context_instructions.append(
120+
"STRING ID: use it to infer the UI context (e.g., button, menu item, page title, tooltip) and adapt length and phrasing accordingly."
121+
)
122+
if resource_comment:
123+
context_instructions.append(
124+
"RESOURCE COMMENT: general notes about the file — use it as additional context."
125+
)
126+
if group_comment:
127+
context_instructions.append(
128+
"GROUP COMMENT: notes about the group of messages this string belongs to — use it as additional context."
129+
)
130+
if string_comment:
131+
context_instructions.append(
132+
"STRING COMMENT: treat it as authoritative translator notes — it may specify placeholders to preserve exactly, terms that must not be translated, or other constraints. STRING COMMENT requirements take precedence over all stylistic choices."
133+
)
134+
if pinned_comments:
135+
context_instructions.append(
136+
"PINNED COMMENTS: this is a comment added by a project manager — treat them as high-priority guidance from the localization team."
137+
)
138+
if terms:
139+
context_instructions.append(
140+
"TERMINOLOGY: use the given translations for those terms consistently in your output, unless you believe the existing translation to be incorrect for the context."
141+
)
142+
context_block = (
143+
"\n".join(context_instructions) + "\n\n" if context_instructions else ""
83144
)
84145

85-
system_messages = {
86-
"informal": informal,
87-
"formal": formal,
88-
"rephrased": rephrased,
89-
}
146+
system_rules = textwrap.dedent(
147+
f"""\
148+
Your goal is to produce a natural, grammatically correct translation. Follow these rules strictly; if rules conflict, earlier rules take priority.
149+
1) ENDING PUNCTUATION — PRESERVE SEMANTICS
150+
- Determine the ending punctuation of the English source text (ignore trailing spaces and HTML tags).
151+
- The translation MUST end with the equivalent punctuation. Both directions are hard constraints:
152+
• English ends with "." → translation MUST end with "." (or target-language equivalent)
153+
• English ends with "?" → translation MUST end with a question mark
154+
• English ends with "!" → translation MUST end with an exclamation mark
155+
• English ends with "…" → translation MUST end with an ellipsis
156+
• English has NO ending punctuation → translation MUST NOT end with ".", "?", "!", or "…"
157+
- Apply correct punctuation conventions for the target language (e.g. Spanish "¿ ¡", French non-breaking space before "?", "!", ":").
158+
2) HTML TAGS
159+
- Preserve all HTML tags exactly as in the source:
160+
- Do not add, remove, reorder, or modify tags or attributes
161+
- Translate only the text content between tags, or the attributes if they contain translatable text (e.g., "alt", "title").
162+
- Keep punctuation placement consistent with the source structure (do not move punctuation across tag boundaries unless required by the target language grammar).
163+
3) {style_goal}
164+
165+
Output only the translation, with no explanation."""
166+
)
90167

91-
system_message = system_messages.get(characteristic)
92-
if system_message is None:
93-
raise ValueError(f"Unrecognized characteristic: '{characteristic}'")
168+
system_message = system_header + context_block + system_rules
94169

95-
# Separate the instruction from the data.
96-
# It makes it hard for injected text to masquerade as instructions.
97-
user_prompt = (
98-
f"{intro_text}\n\n"
99-
f"ENGLISH SOURCE:\n{english_text}\n\n"
100-
f"MACHINE TRANSLATION TO REFINE:\n{translated_text}"
101-
)
170+
# TODO: remove before merge.
171+
# Print the full prompt before sending to help with debugging.
172+
if settings.DEBUG:
173+
print(
174+
f"[OpenAI] system:\n{system_message}\n\n[OpenAI] user:\n{user_prompt}"
175+
)
102176

103177
# Call the OpenAI API with the constructed prompt
104178
response = self.client.chat.completions.create(
105-
model="gpt-4.1-2025-04-14",
179+
model=settings.OPENAI_MODEL,
106180
messages=[
107181
{"role": "system", "content": system_message},
108182
{"role": "user", "content": user_prompt},

pontoon/machinery/tests/test_views.py

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -316,6 +316,49 @@ def test_view_gpt_transform_cache(member, locale_a, openai_api_key):
316316
assert json.loads(response1.content) == json.loads(response2.content)
317317

318318

319+
@pytest.mark.django_db
320+
def test_view_gpt_transform_terms(member, locale_a, openai_api_key):
321+
url = reverse("pontoon.gpt_transform")
322+
cache.clear()
323+
324+
mock_response = MagicMock()
325+
mock_response.choices[0].message.content = "translated"
326+
327+
terms = json.dumps(
328+
[{"text": "browser", "part_of_speech": "noun", "translation": "navigateur"}]
329+
)
330+
331+
with patch("pontoon.machinery.openai_service.OpenAI") as MockOpenAI:
332+
MockOpenAI.return_value.chat.completions.create.return_value = mock_response
333+
334+
params = {
335+
"english_text": "Open browser",
336+
"translated_text": "Ouvrir le navigateur",
337+
"characteristic": "formal",
338+
"locale": locale_a.name,
339+
"string_id": "open-browser",
340+
"string_comment": "Button label",
341+
"group_comment": "Navigation section",
342+
"resource_comment": "Main UI file",
343+
"pinned_comments": json.dumps(["Use formal register", "Keep it short"]),
344+
"terms": terms,
345+
}
346+
347+
member.client.get(url, params)
348+
349+
call_kwargs = MockOpenAI.return_value.chat.completions.create.call_args
350+
user_message = call_kwargs.kwargs["messages"][1]["content"]
351+
assert "STRING ID:\nopen-browser" in user_message
352+
assert "STRING COMMENT:\nButton label" in user_message
353+
assert "GROUP COMMENT:\nNavigation section" in user_message
354+
assert "RESOURCE COMMENT:\nMain UI file" in user_message
355+
assert "PINNED COMMENTS:" in user_message
356+
assert "Use formal register" in user_message
357+
assert "Keep it short" in user_message
358+
assert "TERMINOLOGY:" in user_message
359+
assert '"browser" (noun) → "navigateur"' in user_message
360+
361+
319362
@pytest.mark.django_db
320363
def test_view_caighdean(client, entity_a):
321364
gd = Locale.objects.get(code="gd")

pontoon/machinery/views.py

Lines changed: 23 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -144,22 +144,38 @@ def google_translate(request):
144144
@login_required(redirect_field_name="", login_url="/403")
145145
def gpt_transform(request):
146146
"""
147-
Transforms and returns text using GPT-4 based on specified characteristics like rephrasing or changing formality, by fetching English text, its machine translation, desired transformation characteristic, and target language from the request.
147+
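Transforms a machine translation using GPT according to the requested characteristic (e.g. rephrasing or changing formality) and returns the result. Reads the English text, its machine translation, the characteristic, and the target locale from the request, plus optional context when provided: string ID, string/group/resource comments, pinned comments (JSON), and terminology matches (JSON).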
148148
"""
149149
try:
150150
english_text = request.GET.get("english_text")
151151
translated_text = request.GET.get("translated_text")
152152
characteristic = request.GET.get("characteristic")
153153
target_language_name = request.GET.get("locale")
154+
string_id = request.GET.get("string_id")
155+
string_comment = request.GET.get("string_comment")
156+
group_comment = request.GET.get("group_comment")
157+
resource_comment = request.GET.get("resource_comment")
158+
pinned_comments_json = request.GET.get("pinned_comments")
159+
pinned_comments = (
160+
json.loads(pinned_comments_json) if pinned_comments_json else None
161+
)
162+
terms_json = request.GET.get("terms")
163+
terms = json.loads(terms_json) if terms_json else None
154164

155165
service = OpenAIService()
156-
return JsonResponse(
157-
{
158-
"translation": service.get_translation(
159-
english_text, translated_text, characteristic, target_language_name
160-
)
161-
}
166+
transformed_text = service.get_translation(
167+
english_text,
168+
translated_text,
169+
characteristic,
170+
target_language_name,
171+
string_id=string_id,
172+
string_comment=string_comment,
173+
group_comment=group_comment,
174+
resource_comment=resource_comment,
175+
pinned_comments=pinned_comments,
176+
terms=terms,
162177
)
178+
return JsonResponse({"translation": transformed_text})
163179

164180
except Exception as e:
165181
return _machinery_error_response("GPT Transform", e)

pontoon/settings/base.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -180,6 +180,7 @@ def path(*args):
180180

181181
# OpenAI API key and model
182182
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "")
183+
OPENAI_MODEL = os.environ.get("OPENAI_MODEL", "gpt-4.1-2025-04-14")
183184

184185
# Google Analytics Key
185186
GOOGLE_ANALYTICS_KEY = os.environ.get("GOOGLE_ANALYTICS_KEY", "")

requirements/default.in

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ jsonfield==3.1.0
3939
markupsafe==2.0.1
4040
moz.l10n[xml]==0.11.2
4141
newrelic==9.6.0
42-
openai==1.99.9
42+
openai==2.29.0
4343
psycopg2==2.9.6
4444
PyJWT==2.12.0
4545
python-dateutil==2.9.0

0 commit comments

Comments
 (0)