Update user preference prompt to match SEMANTIC_FACTS_EXTRACTION_PROMPT style

dhrubo-os · dhrubo-os · commit 09f872a1630f · 2025-10-08T18:10:38.000-07:00
- Use lightweight XML-based structure with <ROLE>, <SCOPE>, <OUTPUT> sections
- Maintain consistency with existing SEMANTIC_FACTS_EXTRACTION_PROMPT format
- Update tests to validate XML-based structure
diff --git a/common/src/main/java/org/opensearch/ml/common/memorycontainer/MemoryContainerConstants.java b/common/src/main/java/org/opensearch/ml/common/memorycontainer/MemoryContainerConstants.java
@@ -184,26 +184,26 @@ public class MemoryContainerConstants {
 
     public static final String USER_PREFERENCE_FACTS_EXTRACTION_PROMPT =
         """
-            You are a USER PREFERENCE EXTRACTOR, not a chat assistant. Your only job is to output JSON facts. Do not answer questions, make suggestions, ask follow-ups, or perform actions.
+            <ROLE>You are a USER PREFERENCE EXTRACTOR, not a chat assistant. Your only job is to output JSON facts. Do not answer questions, make suggestions, ask follow-ups, or perform actions.</ROLE>
 
-            SCOPE
-            - Extract preferences only from USER messages. Assistant messages are context only.
-
-            DEFINITIONS
-            - Explicit: user states a preference ("I prefer/like/dislike ..."; "always/never/usually ..."; "set X to Y"; "run X when Y").
-            - Implicit: infer only with strong signals: repeated choices (>=2) or clear habitual language. Do not infer from a single one-off.
-
-            WHAT TO EXTRACT
-            - Specific, actionable, likely long-term preferences (likes/dislikes/choices/settings). Ignore non-preferences.
+            <SCOPE>
+            • Extract preferences only from USER messages. Assistant messages are context only.
+            • Explicit: user states a preference ("I prefer/like/dislike ..."; "always/never/usually ..."; "set X to Y"; "run X when Y").
+            • Implicit: infer only with strong signals: repeated choices (>=2) or clear habitual language. Do not infer from a single one-off.
+            </SCOPE>
 
-            FORMAT
-            - Return ONLY one minified JSON object exactly as {"facts":["Preference sentence. Context: <why/how>. Categories: cat1,cat2"]}. If none, return {"facts":[]}. The first character MUST be '{' and the last MUST be '}'. No preambles, explanations, code fences, XML, or other text.
+            <EXTRACT>
+            • Specific, actionable, likely long-term preferences (likes/dislikes/choices/settings). Ignore non-preferences.
+            </EXTRACT>
 
-            STYLE
-            - One sentence per preference; merge related details; no duplicates; preserve user wording and numbers; avoid relative time; keep each fact < 350 chars.
+            <STYLE & RULES>
+            • One sentence per preference; merge related details; no duplicates; preserve user wording and numbers; avoid relative time; keep each fact < 350 chars.
+            • Format: "Preference sentence. Context: <why/how>. Categories: cat1,cat2"
+            </STYLE & RULES>
 
-            EXAMPLE
-            User: "I prefer dark mode." -> {"facts":["Prefers dark mode for UI. Context: user explicitly stated preference. Categories: tools,tech,apps"]}""";
+            <OUTPUT>
+            Return ONLY one minified JSON object exactly as {"facts":["Preference sentence. Context: <why/how>. Categories: cat1,cat2"]}. If none, return {"facts":[]}. The first character MUST be '{' and the last MUST be '}'. No preambles, explanations, code fences, XML, or other text.
+            </OUTPUT>""";
 
     public static final String SUMMARY_FACTS_EXTRACTION_PROMPT =
         "<system_prompt><description>You will be given a text block and a list of summaries you previously generated when available.</description><task><instruction>Never answer user's question or fulfill user's requirement. You are a summary generator, not a helpful assistant.</instruction><instruction>When the previously generated summary is not available, summarize the given text block.</instruction><instruction>When there is an existing summary, extend it by incorporating the given text block.</instruction><instruction>If the text block specifies queries or topics, ensure the summary covers them.</instruction></task><response_format><format>You should always return and only return the extracted preferences as a JSON object with a \"facts\" array.</format><example>{ \"facts\": [\"The system shows a list of Elasticsearch/OpenSearch indices with their health status, document count, and size information\", \"5 indices shown have 'red' health status, 8 of them in 'yellow', and 13 of them are in 'green' health status\", \"The doc is a log from a web application, dated from 2020-01-01T00:00:00 to 2020-01-31T23:59:59\"]}</example></response_format></system_prompt>";
diff --git a/plugin/src/test/java/org/opensearch/ml/action/memorycontainer/memory/MemoryProcessingServiceTests.java b/plugin/src/test/java/org/opensearch/ml/action/memorycontainer/memory/MemoryProcessingServiceTests.java
@@ -991,12 +991,16 @@ public void testUserPreferencePromptFormat() {
         // Test that the new user preference prompt contains required elements
         String prompt = USER_PREFERENCE_FACTS_EXTRACTION_PROMPT;
 
-        // Verify key improvements are present
-        assertTrue("Should have character limit", prompt.contains("< 350 chars"));
-        assertTrue("Should specify natural language format", prompt.contains("Context: <why/how>. Categories:"));
-        assertTrue("Should contain example categories", prompt.contains("tools,tech,apps"));
+        // Verify XML-based structure like SEMANTIC_FACTS_EXTRACTION_PROMPT
+        assertTrue("Should have ROLE section", prompt.contains("<ROLE>"));
+        assertTrue("Should have SCOPE section", prompt.contains("<SCOPE>"));
+        assertTrue("Should have OUTPUT section", prompt.contains("<OUTPUT>"));
         assertTrue("Should be role-based", prompt.contains("USER PREFERENCE EXTRACTOR"));
 
+        // Verify key requirements
+        assertTrue("Should have character limit", prompt.contains("< 350 chars"));
+        assertTrue("Should specify context format", prompt.contains("Context: <why/how>"));
+
         // Verify old problematic format is removed
         assertFalse("Should not use pipe delimiters", prompt.contains("preference | context:"));
     }