diff --git a/posthog/ai/gemini/gemini_converter.py b/posthog/ai/gemini/gemini_converter.py
index bfa3625a..863b5899 100644
--- a/posthog/ai/gemini/gemini_converter.py
+++ b/posthog/ai/gemini/gemini_converter.py
@@ -29,35 +29,76 @@ class GeminiMessage(TypedDict, total=False):
     text: str
 
 
-def _extract_text_from_parts(parts: List[Any]) -> str:
+def _format_parts_as_content_blocks(parts: List[Any]) -> List[FormattedContentItem]:
     """
-    Extract and concatenate text from a parts array.
+    Format Gemini parts array into structured content blocks.
+
+    Preserves structure for multimodal content (text + images) instead of
+    concatenating everything into a string.
 
     Args:
-        parts: List of parts that may contain text content
+        parts: List of parts that may contain text, inline_data, etc.
 
     Returns:
-        Concatenated text from all parts
+        List of formatted content blocks
     """
-
-    content_parts = []
+    content_blocks: List[FormattedContentItem] = []
 
     for part in parts:
+        # Handle dict with text field
         if isinstance(part, dict) and "text" in part:
-            content_parts.append(part["text"])
+            content_blocks.append({"type": "text", "text": part["text"]})
 
+        # Handle string parts
         elif isinstance(part, str):
-            content_parts.append(part)
+            content_blocks.append({"type": "text", "text": part})
+
+        # Handle dict with inline_data (images, documents, etc.)
+        elif isinstance(part, dict) and "inline_data" in part:
+            inline_data = part["inline_data"]
+            mime_type = inline_data.get("mime_type", "")
+            content_type = "image" if mime_type.startswith("image/") else "document"
+
+            content_blocks.append(
+                {
+                    "type": content_type,
+                    "inline_data": inline_data,
+                }
+            )
 
+        # Handle object with text attribute
        elif hasattr(part, "text"):
-            # Get the text attribute value
             text_value = getattr(part, "text", "")
-            content_parts.append(text_value if text_value else str(part))
-
-        else:
-            content_parts.append(str(part))
+            if text_value:
+                content_blocks.append({"type": "text", "text": text_value})
+
+        # Handle object with inline_data attribute
+        elif hasattr(part, "inline_data"):
+            inline_data = part.inline_data
+            # Convert to dict if needed
+            if hasattr(inline_data, "mime_type") and hasattr(inline_data, "data"):
+                # Determine type based on mime_type
+                mime_type = inline_data.mime_type
+                content_type = "image" if mime_type.startswith("image/") else "document"
+
+                content_blocks.append(
+                    {
+                        "type": content_type,
+                        "inline_data": {
+                            "mime_type": mime_type,
+                            "data": inline_data.data,
+                        },
+                    }
+                )
+            else:
+                content_blocks.append(
+                    {
+                        "type": "image",
+                        "inline_data": inline_data,
+                    }
+                )
 
-    return "".join(content_parts)
+    return content_blocks
 
 
 def _format_dict_message(item: Dict[str, Any]) -> FormattedMessage:
@@ -73,16 +114,17 @@ def _format_dict_message(item: Dict[str, Any]) -> FormattedMessage:
 
     # Handle dict format with parts array (Gemini-specific format)
     if "parts" in item and isinstance(item["parts"], list):
-        content = _extract_text_from_parts(item["parts"])
-        return {"role": item.get("role", "user"), "content": content}
+        content_blocks = _format_parts_as_content_blocks(item["parts"])
+        return {"role": item.get("role", "user"), "content": content_blocks}
 
     # Handle dict with content field
     if "content" in item:
         content = item["content"]
 
         if isinstance(content, list):
-            # If content is a list, extract text from it
-            content = _extract_text_from_parts(content)
+            # If content is a list, format it as content blocks
+            content_blocks = _format_parts_as_content_blocks(content)
+            return {"role": item.get("role", "user"), "content": content_blocks}
         elif not isinstance(content, str):
             content = str(content)
 
@@ -110,14 +152,14 @@ def _format_object_message(item: Any) -> FormattedMessage:
 
     # Handle object with parts attribute
     if hasattr(item, "parts") and hasattr(item.parts, "__iter__"):
-        content = _extract_text_from_parts(item.parts)
+        content_blocks = _format_parts_as_content_blocks(list(item.parts))
         role = getattr(item, "role", "user") if hasattr(item, "role") else "user"
 
         # Ensure role is a string
         if not isinstance(role, str):
             role = "user"
 
-        return {"role": role, "content": content}
+        return {"role": role, "content": content_blocks}
 
     # Handle object with text attribute
     if hasattr(item, "text"):
@@ -140,7 +182,8 @@
         content = item.content
 
         if isinstance(content, list):
-            content = _extract_text_from_parts(content)
+            content_blocks = _format_parts_as_content_blocks(content)
+            return {"role": role, "content": content_blocks}
         elif not isinstance(content, str):
             content = str(content)
 
@@ -193,6 +236,29 @@ def format_gemini_response(response: Any) -> List[FormattedMessage]:
                     }
                 )
 
+            elif hasattr(part, "inline_data") and part.inline_data:
+                # Handle audio/media inline data
+                import base64
+
+                inline_data = part.inline_data
+                mime_type = getattr(inline_data, "mime_type", "audio/pcm")
+                raw_data = getattr(inline_data, "data", b"")
+
+                # Encode binary data as base64 string for JSON serialization
+                if isinstance(raw_data, bytes):
+                    data = base64.b64encode(raw_data).decode("utf-8")
+                else:
+                    # Already a string (base64)
+                    data = raw_data
+
+                content.append(
+                    {
+                        "type": "audio",
+                        "mime_type": mime_type,
+                        "data": data,
+                    }
+                )
+
         if content:
             output.append(
                 {
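# Illustration (not part of the diff): what the new _format_parts_as_content_blocks
# helper produces, assuming this branch of posthog-python is installed. Dict parts
# with "text" become text blocks; "inline_data" parts are typed by MIME prefix
# ("image/*" -> image, anything else -> document).
from posthog.ai.gemini.gemini_converter import _format_parts_as_content_blocks

parts = [
    {"text": "Describe this file:"},
    {"inline_data": {"mime_type": "image/png", "data": "iVBORw0KGgo="}},
    {"inline_data": {"mime_type": "application/pdf", "data": "JVBERi0xLjQ="}},
]
assert _format_parts_as_content_blocks(parts) == [
    {"type": "text", "text": "Describe this file:"},
    {"type": "image", "inline_data": {"mime_type": "image/png", "data": "iVBORw0KGgo="}},
    {"type": "document", "inline_data": {"mime_type": "application/pdf", "data": "JVBERi0xLjQ="}},
]

# The new inline_data branch in format_gemini_response base64-encodes raw bytes
# so the captured event stays JSON-serializable:
import base64

assert base64.b64encode(b"\x00\x01\x02\x03").decode("utf-8") == "AAECAw=="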
diff --git a/posthog/ai/openai/openai_converter.py b/posthog/ai/openai/openai_converter.py
index e7b8ce5e..5b7eac8c 100644
--- a/posthog/ai/openai/openai_converter.py
+++ b/posthog/ai/openai/openai_converter.py
@@ -67,6 +67,12 @@ def format_openai_response(response: Any) -> List[FormattedMessage]:
                 }
             )
 
+        # Handle audio output (gpt-4o-audio-preview)
+        if hasattr(choice.message, "audio") and choice.message.audio:
+            # Convert Pydantic model to dict to capture all fields from OpenAI
+            audio_dict = choice.message.audio.model_dump()
+            content.append({"type": "audio", **audio_dict})
+
         if content:
             output.append(
                 {
diff --git a/posthog/ai/sanitization.py b/posthog/ai/sanitization.py
index a0953d07..5f259c56 100644
--- a/posthog/ai/sanitization.py
+++ b/posthog/ai/sanitization.py
@@ -1,3 +1,4 @@
+import os
 import re
 from typing import Any
 from urllib.parse import urlparse
@@ -5,6 +6,15 @@
 REDACTED_IMAGE_PLACEHOLDER = "[base64 image redacted]"
 
 
+def _is_multimodal_enabled() -> bool:
+    """Check if multimodal capture is enabled via environment variable."""
+    return os.environ.get("_INTERNAL_LLMA_MULTIMODAL", "").lower() in (
+        "true",
+        "1",
+        "yes",
+    )
+
+
 def is_base64_data_url(text: str) -> bool:
     return re.match(r"^data:([^;]+);base64,", text) is not None
 
@@ -27,6 +37,9 @@ def is_raw_base64(text: str) -> bool:
 
 
 def redact_base64_data_url(value: Any) -> Any:
+    if _is_multimodal_enabled():
+        return value
+
     if not isinstance(value, str):
         return value
 
@@ -83,6 +96,11 @@ def sanitize_openai_image(item: Any) -> Any:
             },
         }
 
+    if item.get("type") == "audio" and "data" in item:
+        if _is_multimodal_enabled():
+            return item
+        return {**item, "data": REDACTED_IMAGE_PLACEHOLDER}
+
     return item
 
 
@@ -100,6 +118,9 @@ def sanitize_openai_response_image(item: Any) -> Any:
 
 
 def sanitize_anthropic_image(item: Any) -> Any:
+    if _is_multimodal_enabled():
+        return item
+
     if not isinstance(item, dict):
         return item
 
@@ -109,8 +130,6 @@
         and item["source"].get("type") == "base64"
         and "data" in item["source"]
     ):
-        # For Anthropic, if the source type is "base64", we should always redact the data
-        # The provider is explicitly telling us this is base64 data
         return {
             **item,
             "source": {
@@ -123,6 +142,9 @@
 
 
 def sanitize_gemini_part(part: Any) -> Any:
+    if _is_multimodal_enabled():
+        return part
+
     if not isinstance(part, dict):
         return part
 
@@ -131,8 +153,6 @@
         and isinstance(part["inline_data"], dict)
         and "data" in part["inline_data"]
     ):
-        # For Gemini, the inline_data structure indicates base64 data
-        # We should redact any string data in this context
         return {
             **part,
             "inline_data": {
@@ -185,7 +205,9 @@
         and isinstance(item.get("source"), dict)
         and "data" in item["source"]
     ):
-        # Anthropic style - raw base64 in structured format, always redact
+        if _is_multimodal_enabled():
+            return item
+
         return {
             **item,
             "source": {
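# Illustration (not part of the diff): the _INTERNAL_LLMA_MULTIMODAL gate added
# above, exercised against the real helpers from posthog/ai/sanitization.py;
# behavior assumes this branch is installed.
import os

from posthog.ai.sanitization import REDACTED_IMAGE_PLACEHOLDER, redact_base64_data_url

url = "data:image/png;base64,iVBORw0KGgo="

os.environ.pop("_INTERNAL_LLMA_MULTIMODAL", None)  # flag unset -> redact as before
assert redact_base64_data_url(url) == REDACTED_IMAGE_PLACEHOLDER

os.environ["_INTERNAL_LLMA_MULTIMODAL"] = "1"  # "true", "1", or "yes" -> pass through
assert redact_base64_data_url(url) == url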
diff --git a/posthog/test/ai/gemini/test_gemini.py b/posthog/test/ai/gemini/test_gemini.py
index e0c216a0..0cf763ba 100644
--- a/posthog/test/ai/gemini/test_gemini.py
+++ b/posthog/test/ai/gemini/test_gemini.py
@@ -407,7 +407,9 @@ def test_new_client_different_input_formats(
     )
     call_args = mock_client.capture.call_args[1]
     props = call_args["properties"]
-    assert props["$ai_input"] == [{"role": "user", "content": "hey"}]
+    assert props["$ai_input"] == [
+        {"role": "user", "content": [{"type": "text", "text": "hey"}]}
+    ]
 
     # Test multiple parts in the parts array
     mock_client.reset_mock()
@@ -418,7 +420,15 @@
     )
     call_args = mock_client.capture.call_args[1]
     props = call_args["properties"]
-    assert props["$ai_input"] == [{"role": "user", "content": "Hello world"}]
+    assert props["$ai_input"] == [
+        {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": "Hello "},
+                {"type": "text", "text": "world"},
+            ],
+        }
+    ]
 
     # Test list input with string
     mock_client.capture.reset_mock()
diff --git a/posthog/test/ai/gemini/test_gemini_async.py b/posthog/test/ai/gemini/test_gemini_async.py
index 624095f9..ad7decf3 100644
--- a/posthog/test/ai/gemini/test_gemini_async.py
+++ b/posthog/test/ai/gemini/test_gemini_async.py
@@ -392,7 +392,9 @@
     )
     call_args = mock_client.capture.call_args[1]
     props = call_args["properties"]
-    assert props["$ai_input"] == [{"role": "user", "content": "hey"}]
+    assert props["$ai_input"] == [
+        {"role": "user", "content": [{"type": "text", "text": "hey"}]}
+    ]
 
     # Test multiple parts in the parts array
     mock_client.reset_mock()
@@ -403,7 +405,15 @@
     )
     call_args = mock_client.capture.call_args[1]
     props = call_args["properties"]
-    assert props["$ai_input"] == [{"role": "user", "content": "Hello world"}]
+    assert props["$ai_input"] == [
+        {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": "Hello "},
+                {"type": "text", "text": "world"},
+            ],
+        }
+    ]
 
     # Test list input with string
     mock_client.capture.reset_mock()
diff --git a/posthog/test/ai/test_sanitization.py b/posthog/test/ai/test_sanitization.py
index 0031bafb..24d1f2ea 100644
--- a/posthog/test/ai/test_sanitization.py
+++ b/posthog/test/ai/test_sanitization.py
@@ -1,3 +1,4 @@
+import os
 import unittest
 
 from posthog.ai.sanitization import (
@@ -331,5 +332,191 @@ def test_sanitize_handles_single_message(self):
     )
 
 
+class TestAIMultipartRequest(unittest.TestCase):
+    """Test that _INTERNAL_LLMA_MULTIMODAL environment variable controls sanitization."""
+
+    def tearDown(self):
+        # Clean up environment variable after each test
+        if "_INTERNAL_LLMA_MULTIMODAL" in os.environ:
+            del os.environ["_INTERNAL_LLMA_MULTIMODAL"]
+
+    def test_multimodal_disabled_redacts_images(self):
+        """When _INTERNAL_LLMA_MULTIMODAL is not set, images should be redacted."""
+        if "_INTERNAL_LLMA_MULTIMODAL" in os.environ:
+            del os.environ["_INTERNAL_LLMA_MULTIMODAL"]
+
+        base64_image = "data:image/jpeg;base64,/9j/4AAQSkZJRgABAQAAAQ..."
+        result = redact_base64_data_url(base64_image)
+        self.assertEqual(result, REDACTED_IMAGE_PLACEHOLDER)
+
+    def test_multimodal_enabled_preserves_images(self):
+        """When _INTERNAL_LLMA_MULTIMODAL is true, images should be preserved."""
+        os.environ["_INTERNAL_LLMA_MULTIMODAL"] = "true"
+
+        base64_image = "data:image/jpeg;base64,/9j/4AAQSkZJRgABAQAAAQ..."
+        result = redact_base64_data_url(base64_image)
+        self.assertEqual(result, base64_image)
+
+    def test_multimodal_enabled_with_1(self):
+        """_INTERNAL_LLMA_MULTIMODAL=1 should enable multimodal."""
+        os.environ["_INTERNAL_LLMA_MULTIMODAL"] = "1"
+
+        base64_image = "data:image/jpeg;base64,/9j/4AAQSkZJRgABAQAAAQ..."
+        result = redact_base64_data_url(base64_image)
+        self.assertEqual(result, base64_image)
+
+    def test_multimodal_enabled_with_yes(self):
+        """_INTERNAL_LLMA_MULTIMODAL=yes should enable multimodal."""
+        os.environ["_INTERNAL_LLMA_MULTIMODAL"] = "yes"
+
+        base64_image = "data:image/jpeg;base64,/9j/4AAQSkZJRgABAQAAAQ..."
+        result = redact_base64_data_url(base64_image)
+        self.assertEqual(result, base64_image)
+
+    def test_multimodal_false_redacts_images(self):
+        """_INTERNAL_LLMA_MULTIMODAL=false should still redact."""
+        os.environ["_INTERNAL_LLMA_MULTIMODAL"] = "false"
+
+        base64_image = "data:image/jpeg;base64,/9j/4AAQSkZJRgABAQAAAQ..."
+        result = redact_base64_data_url(base64_image)
+        self.assertEqual(result, REDACTED_IMAGE_PLACEHOLDER)
+
+    def test_anthropic_multimodal_enabled(self):
+        """Anthropic images should be preserved when _INTERNAL_LLMA_MULTIMODAL is enabled."""
+        os.environ["_INTERNAL_LLMA_MULTIMODAL"] = "true"
+
+        input_data = [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "image",
+                        "source": {
+                            "type": "base64",
+                            "media_type": "image/jpeg",
+                            "data": "base64data",
+                        },
+                    }
+                ],
+            }
+        ]
+
+        result = sanitize_anthropic(input_data)
+        self.assertEqual(result[0]["content"][0]["source"]["data"], "base64data")
+
+    def test_gemini_multimodal_enabled(self):
+        """Gemini images should be preserved when _INTERNAL_LLMA_MULTIMODAL is enabled."""
+        os.environ["_INTERNAL_LLMA_MULTIMODAL"] = "true"
+
+        input_data = [
+            {
+                "parts": [
+                    {"inline_data": {"mime_type": "image/jpeg", "data": "base64data"}}
+                ]
+            }
+        ]
+
+        result = sanitize_gemini(input_data)
+        self.assertEqual(result[0]["parts"][0]["inline_data"]["data"], "base64data")
+
+    def test_langchain_anthropic_style_multimodal_enabled(self):
+        """LangChain Anthropic-style images should be preserved when _INTERNAL_LLMA_MULTIMODAL is enabled."""
+        os.environ["_INTERNAL_LLMA_MULTIMODAL"] = "true"
+
+        input_data = [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "image",
+                        "source": {"data": "base64data"},
+                    }
+                ],
+            }
+        ]
+
+        result = sanitize_langchain(input_data)
+        self.assertEqual(result[0]["content"][0]["source"]["data"], "base64data")
+
+    def test_openai_audio_redacted_by_default(self):
+        """OpenAI audio should be redacted when _INTERNAL_LLMA_MULTIMODAL is not set."""
+        if "_INTERNAL_LLMA_MULTIMODAL" in os.environ:
+            del os.environ["_INTERNAL_LLMA_MULTIMODAL"]
+
+        input_data = [
+            {
+                "role": "assistant",
+                "content": [
+                    {"type": "audio", "data": "base64audiodata", "id": "audio_123"}
+                ],
+            }
+        ]
+
+        result = sanitize_openai(input_data)
+        self.assertEqual(result[0]["content"][0]["data"], REDACTED_IMAGE_PLACEHOLDER)
+        self.assertEqual(result[0]["content"][0]["id"], "audio_123")
+
+    def test_openai_audio_preserved_with_flag(self):
+        """OpenAI audio should be preserved when _INTERNAL_LLMA_MULTIMODAL is enabled."""
+        os.environ["_INTERNAL_LLMA_MULTIMODAL"] = "true"
+
+        input_data = [
+            {
+                "role": "assistant",
+                "content": [
+                    {"type": "audio", "data": "base64audiodata", "id": "audio_123"}
+                ],
+            }
+        ]
+
+        result = sanitize_openai(input_data)
+        self.assertEqual(result[0]["content"][0]["data"], "base64audiodata")
+
+    def test_gemini_audio_redacted_by_default(self):
+        """Gemini audio should be redacted when _INTERNAL_LLMA_MULTIMODAL is not set."""
+        if "_INTERNAL_LLMA_MULTIMODAL" in os.environ:
+            del os.environ["_INTERNAL_LLMA_MULTIMODAL"]
+
+        input_data = [
+            {
+                "parts": [
+                    {
+                        "inline_data": {
+                            "mime_type": "audio/L16;codec=pcm;rate=24000",
+                            "data": "base64audiodata",
+                        }
+                    }
+                ]
+            }
+        ]
+
+        result = sanitize_gemini(input_data)
+        self.assertEqual(
+            result[0]["parts"][0]["inline_data"]["data"], REDACTED_IMAGE_PLACEHOLDER
+        )
+
+    def test_gemini_audio_preserved_with_flag(self):
+        """Gemini audio should be preserved when _INTERNAL_LLMA_MULTIMODAL is enabled."""
+        os.environ["_INTERNAL_LLMA_MULTIMODAL"] = "true"
+
+        input_data = [
+            {
+                "parts": [
+                    {
+                        "inline_data": {
+                            "mime_type": "audio/L16;codec=pcm;rate=24000",
+                            "data": "base64audiodata",
+                        }
+                    }
+                ]
+            }
+        ]
+
+        result = sanitize_gemini(input_data)
+        self.assertEqual(
+            result[0]["parts"][0]["inline_data"]["data"], "base64audiodata"
+        )
+
+
 if __name__ == "__main__":
     unittest.main()
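# To run just the new suite locally (standard pytest node-id syntax; the file
# path and class name come from the diff above):
#   pytest posthog/test/ai/test_sanitization.py::TestAIMultipartRequest -q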