diff --git a/posthog/ai/gemini/gemini_converter.py b/posthog/ai/gemini/gemini_converter.py
index bfa3625a..863b5899 100644
--- a/posthog/ai/gemini/gemini_converter.py
+++ b/posthog/ai/gemini/gemini_converter.py
@@ -29,35 +29,76 @@ class GeminiMessage(TypedDict, total=False):
     text: str
 
 
-def _extract_text_from_parts(parts: List[Any]) -> str:
+def _format_parts_as_content_blocks(parts: List[Any]) -> List[FormattedContentItem]:
     """
-    Extract and concatenate text from a parts array.
+    Format Gemini parts array into structured content blocks.
+
+    Preserves structure for multimodal content (text + images) instead of
+    concatenating everything into a string.
 
     Args:
-        parts: List of parts that may contain text content
+        parts: List of parts that may contain text, inline_data, etc.
 
     Returns:
-        Concatenated text from all parts
+        List of formatted content blocks
     """
-
-    content_parts = []
+    content_blocks: List[FormattedContentItem] = []
 
     for part in parts:
+        # Handle dict with text field
         if isinstance(part, dict) and "text" in part:
-            content_parts.append(part["text"])
+            content_blocks.append({"type": "text", "text": part["text"]})
 
+        # Handle string parts
         elif isinstance(part, str):
-            content_parts.append(part)
+            content_blocks.append({"type": "text", "text": part})
+
+        # Handle dict with inline_data (images, documents, etc.)
+        elif isinstance(part, dict) and "inline_data" in part:
+            inline_data = part["inline_data"]
+            mime_type = inline_data.get("mime_type", "")
+            content_type = "image" if mime_type.startswith("image/") else "document"
+
+            content_blocks.append(
+                {
+                    "type": content_type,
+                    "inline_data": inline_data,
+                }
+            )
 
+        # Handle object with text attribute
        elif hasattr(part, "text"):
-            # Get the text attribute value
             text_value = getattr(part, "text", "")
-            content_parts.append(text_value if text_value else str(part))
-
-        else:
-            content_parts.append(str(part))
+            if text_value:
+                content_blocks.append({"type": "text", "text": text_value})
+
+        # Handle object with inline_data attribute
+        elif hasattr(part, "inline_data"):
+            inline_data = part.inline_data
+            # Convert to dict if needed
+            if hasattr(inline_data, "mime_type") and hasattr(inline_data, "data"):
+                # Determine type based on mime_type
+                mime_type = inline_data.mime_type
+                content_type = "image" if mime_type.startswith("image/") else "document"
+
+                content_blocks.append(
+                    {
+                        "type": content_type,
+                        "inline_data": {
+                            "mime_type": mime_type,
+                            "data": inline_data.data,
+                        },
+                    }
+                )
+            else:
+                content_blocks.append(
+                    {
+                        "type": "image",
+                        "inline_data": inline_data,
+                    }
+                )
 
-    return "".join(content_parts)
+    return content_blocks
 
 
 def _format_dict_message(item: Dict[str, Any]) -> FormattedMessage:
@@ -73,16 +114,17 @@ def _format_dict_message(item: Dict[str, Any]) -> FormattedMessage:
 
     # Handle dict format with parts array (Gemini-specific format)
     if "parts" in item and isinstance(item["parts"], list):
-        content = _extract_text_from_parts(item["parts"])
-        return {"role": item.get("role", "user"), "content": content}
+        content_blocks = _format_parts_as_content_blocks(item["parts"])
+        return {"role": item.get("role", "user"), "content": content_blocks}
 
     # Handle dict with content field
     if "content" in item:
         content = item["content"]
 
         if isinstance(content, list):
-            # If content is a list, extract text from it
-            content = _extract_text_from_parts(content)
+            # If content is a list, format it as content blocks
+            content_blocks = _format_parts_as_content_blocks(content)
+            return {"role": item.get("role", "user"), "content": content_blocks}
         elif not isinstance(content, str):
             content = str(content)
 
@@ -110,14 +152,14 @@ def _format_object_message(item: Any) -> FormattedMessage:
 
     # Handle object with parts attribute
     if hasattr(item, "parts") and hasattr(item.parts, "__iter__"):
-        content = _extract_text_from_parts(item.parts)
+        content_blocks = _format_parts_as_content_blocks(list(item.parts))
         role = getattr(item, "role", "user") if hasattr(item, "role") else "user"
 
         # Ensure role is a string
         if not isinstance(role, str):
             role = "user"
 
-        return {"role": role, "content": content}
+        return {"role": role, "content": content_blocks}
 
     # Handle object with text attribute
     if hasattr(item, "text"):
@@ -140,7 +182,8 @@
         content = item.content
 
         if isinstance(content, list):
-            content = _extract_text_from_parts(content)
+            content_blocks = _format_parts_as_content_blocks(content)
+            return {"role": role, "content": content_blocks}
         elif not isinstance(content, str):
             content = str(content)
 
@@ -193,6 +236,29 @@ def format_gemini_response(response: Any) -> List[FormattedMessage]:
                     }
                 )
 
+            elif hasattr(part, "inline_data") and part.inline_data:
+                # Handle audio/media inline data
+                import base64
+
+                inline_data = part.inline_data
+                mime_type = getattr(inline_data, "mime_type", "audio/pcm")
+                raw_data = getattr(inline_data, "data", b"")
+
+                # Encode binary data as base64 string for JSON serialization
+                if isinstance(raw_data, bytes):
+                    data = base64.b64encode(raw_data).decode("utf-8")
+                else:
+                    # Already a string (base64)
+                    data = raw_data
+
+                content.append(
+                    {
+                        "type": "audio",
+                        "mime_type": mime_type,
+                        "data": data,
+                    }
+                )
+
         if content:
             output.append(
                 {
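# Illustration (not part of the diff): what the new _format_parts_as_content_blocks
# helper produces, assuming this branch of posthog-python is installed. Dict parts
# with "text" become text blocks; "inline_data" parts are typed by MIME prefix
# ("image/*" -> image, anything else -> document).
from posthog.ai.gemini.gemini_converter import _format_parts_as_content_blocks

parts = [
    {"text": "Describe this file:"},
    {"inline_data": {"mime_type": "image/png", "data": "iVBORw0KGgo="}},
    {"inline_data": {"mime_type": "application/pdf", "data": "JVBERi0xLjQ="}},
]
assert _format_parts_as_content_blocks(parts) == [
    {"type": "text", "text": "Describe this file:"},
    {"type": "image", "inline_data": {"mime_type": "image/png", "data": "iVBORw0KGgo="}},
    {"type": "document", "inline_data": {"mime_type": "application/pdf", "data": "JVBERi0xLjQ="}},
]

# The new inline_data branch in format_gemini_response base64-encodes raw bytes
# so the captured event stays JSON-serializable:
import base64

assert base64.b64encode(b"\x00\x01\x02\x03").decode("utf-8") == "AAECAw=="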
diff --git a/posthog/ai/openai/openai_converter.py b/posthog/ai/openai/openai_converter.py
index e7b8ce5e..5b7eac8c 100644
--- a/posthog/ai/openai/openai_converter.py
+++ b/posthog/ai/openai/openai_converter.py
@@ -67,6 +67,12 @@ def format_openai_response(response: Any) -> List[FormattedMessage]:
                 }
             )
 
+        # Handle audio output (gpt-4o-audio-preview)
+        if hasattr(choice.message, "audio") and choice.message.audio:
+            # Convert Pydantic model to dict to capture all fields from OpenAI
+            audio_dict = choice.message.audio.model_dump()
+            content.append({"type": "audio", **audio_dict})
+
         if content:
             output.append(
                 {
diff --git a/posthog/ai/sanitization.py b/posthog/ai/sanitization.py
index a0953d07..5f259c56 100644
--- a/posthog/ai/sanitization.py
+++ b/posthog/ai/sanitization.py
@@ -1,3 +1,4 @@
+import os
 import re
 from typing import Any
 from urllib.parse import urlparse
@@ -5,6 +6,15 @@
 REDACTED_IMAGE_PLACEHOLDER = "[base64 image redacted]"
 
 
+def _is_multimodal_enabled() -> bool:
+    """Check if multimodal capture is enabled via environment variable."""
+    return os.environ.get("_INTERNAL_LLMA_MULTIMODAL", "").lower() in (
+        "true",
+        "1",
+        "yes",
+    )
+
+
 def is_base64_data_url(text: str) -> bool:
     return re.match(r"^data:([^;]+);base64,", text) is not None
 
@@ -27,6 +37,9 @@ def is_raw_base64(text: str) -> bool:
 
 
 def redact_base64_data_url(value: Any) -> Any:
+    if _is_multimodal_enabled():
+        return value
+
     if not isinstance(value, str):
         return value
 
@@ -83,6 +96,11 @@ def sanitize_openai_image(item: Any) -> Any:
             },
         }
 
+    if item.get("type") == "audio" and "data" in item:
+        if _is_multimodal_enabled():
+            return item
+        return {**item, "data": REDACTED_IMAGE_PLACEHOLDER}
+
     return item
 
 
@@ -100,6 +118,9 @@ def sanitize_openai_response_image(item: Any) -> Any:
 
 
 def sanitize_anthropic_image(item: Any) -> Any:
+    if _is_multimodal_enabled():
+        return item
+
     if not isinstance(item, dict):
         return item
 
@@ -109,8 +130,6 @@
         and item["source"].get("type") == "base64"
         and "data" in item["source"]
     ):
-        # For Anthropic, if the source type is "base64", we should always redact the data
-        # The provider is explicitly telling us this is base64 data
         return {
             **item,
             "source": {
@@ -123,6 +142,9 @@
 
 
 def sanitize_gemini_part(part: Any) -> Any:
+    if _is_multimodal_enabled():
+        return part
+
     if not isinstance(part, dict):
         return part
 
@@ -131,8 +153,6 @@
         and isinstance(part["inline_data"], dict)
         and "data" in part["inline_data"]
     ):
-        # For Gemini, the inline_data structure indicates base64 data
-        # We should redact any string data in this context
         return {
             **part,
             "inline_data": {
@@ -185,7 +205,9 @@
         and isinstance(item.get("source"), dict)
         and "data" in item["source"]
     ):
-        # Anthropic style - raw base64 in structured format, always redact
+        if _is_multimodal_enabled():
+            return item
+
         return {
             **item,
             "source": {
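# Illustration (not part of the diff): the _INTERNAL_LLMA_MULTIMODAL gate added
# above, exercised against the real helpers from posthog/ai/sanitization.py;
# behavior assumes this branch is installed.
import os

from posthog.ai.sanitization import REDACTED_IMAGE_PLACEHOLDER, redact_base64_data_url

url = "data:image/png;base64,iVBORw0KGgo="

os.environ.pop("_INTERNAL_LLMA_MULTIMODAL", None)  # flag unset -> redact as before
assert redact_base64_data_url(url) == REDACTED_IMAGE_PLACEHOLDER

os.environ["_INTERNAL_LLMA_MULTIMODAL"] = "1"  # "true", "1", or "yes" -> pass through
assert redact_base64_data_url(url) == url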
diff --git a/posthog/test/ai/gemini/test_gemini.py b/posthog/test/ai/gemini/test_gemini.py
index e0c216a0..0cf763ba 100644
--- a/posthog/test/ai/gemini/test_gemini.py
+++ b/posthog/test/ai/gemini/test_gemini.py
@@ -407,7 +407,9 @@ def test_new_client_different_input_formats(
     )
     call_args = mock_client.capture.call_args[1]
     props = call_args["properties"]
-    assert props["$ai_input"] == [{"role": "user", "content": "hey"}]
+    assert props["$ai_input"] == [
+        {"role": "user", "content": [{"type": "text", "text": "hey"}]}
+    ]
 
     # Test multiple parts in the parts array
     mock_client.reset_mock()
@@ -418,7 +420,15 @@
     )
     call_args = mock_client.capture.call_args[1]
     props = call_args["properties"]
-    assert props["$ai_input"] == [{"role": "user", "content": "Hello world"}]
+    assert props["$ai_input"] == [
+        {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": "Hello "},
+                {"type": "text", "text": "world"},
+            ],
+        }
+    ]
 
     # Test list input with string
     mock_client.capture.reset_mock()
diff --git a/posthog/test/ai/gemini/test_gemini_async.py b/posthog/test/ai/gemini/test_gemini_async.py
index 624095f9..ad7decf3 100644
--- a/posthog/test/ai/gemini/test_gemini_async.py
+++ b/posthog/test/ai/gemini/test_gemini_async.py
@@ -392,7 +392,9 @@
     )
     call_args = mock_client.capture.call_args[1]
     props = call_args["properties"]
-    assert props["$ai_input"] == [{"role": "user", "content": "hey"}]
+    assert props["$ai_input"] == [
+        {"role": "user", "content": [{"type": "text", "text": "hey"}]}
+    ]
 
     # Test multiple parts in the parts array
     mock_client.reset_mock()
@@ -403,7 +405,15 @@
     )
     call_args = mock_client.capture.call_args[1]
     props = call_args["properties"]
-    assert props["$ai_input"] == [{"role": "user", "content": "Hello world"}]
+    assert props["$ai_input"] == [
+        {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": "Hello "},
+                {"type": "text", "text": "world"},
+            ],
+        }
+    ]
 
     # Test list input with string
     mock_client.capture.reset_mock()
diff --git a/posthog/test/ai/test_sanitization.py b/posthog/test/ai/test_sanitization.py
index 0031bafb..24d1f2ea 100644
--- a/posthog/test/ai/test_sanitization.py
+++ b/posthog/test/ai/test_sanitization.py
@@ -1,3 +1,4 @@
+import os
 import unittest
 
 from posthog.ai.sanitization import (
@@ -331,5 +332,191 @@ def test_sanitize_handles_single_message(self):
     )
 
 
+class TestAIMultipartRequest(unittest.TestCase):
+    """Test that _INTERNAL_LLMA_MULTIMODAL environment variable controls sanitization."""
+
+    def tearDown(self):
+        # Clean up environment variable after each test
+        if "_INTERNAL_LLMA_MULTIMODAL" in os.environ:
+            del os.environ["_INTERNAL_LLMA_MULTIMODAL"]
+
+    def test_multimodal_disabled_redacts_images(self):
+        """When _INTERNAL_LLMA_MULTIMODAL is not set, images should be redacted."""
+        if "_INTERNAL_LLMA_MULTIMODAL" in os.environ:
+            del os.environ["_INTERNAL_LLMA_MULTIMODAL"]
+
+        base64_image = "data:image/jpeg;base64,/9j/4AAQSkZJRgABAQAAAQ..."
+        result = redact_base64_data_url(base64_image)
+        self.assertEqual(result, REDACTED_IMAGE_PLACEHOLDER)
+
+    def test_multimodal_enabled_preserves_images(self):
+        """When _INTERNAL_LLMA_MULTIMODAL is true, images should be preserved."""
+        os.environ["_INTERNAL_LLMA_MULTIMODAL"] = "true"
+
+        base64_image = "data:image/jpeg;base64,/9j/4AAQSkZJRgABAQAAAQ..."
+        result = redact_base64_data_url(base64_image)
+        self.assertEqual(result, base64_image)
+
+    def test_multimodal_enabled_with_1(self):
+        """_INTERNAL_LLMA_MULTIMODAL=1 should enable multimodal."""
+        os.environ["_INTERNAL_LLMA_MULTIMODAL"] = "1"
+
+        base64_image = "data:image/jpeg;base64,/9j/4AAQSkZJRgABAQAAAQ..."
+        result = redact_base64_data_url(base64_image)
+        self.assertEqual(result, base64_image)
+
+    def test_multimodal_enabled_with_yes(self):
+        """_INTERNAL_LLMA_MULTIMODAL=yes should enable multimodal."""
+        os.environ["_INTERNAL_LLMA_MULTIMODAL"] = "yes"
+
+        base64_image = "data:image/jpeg;base64,/9j/4AAQSkZJRgABAQAAAQ..."
+        result = redact_base64_data_url(base64_image)
+        self.assertEqual(result, base64_image)
+
+    def test_multimodal_false_redacts_images(self):
+        """_INTERNAL_LLMA_MULTIMODAL=false should still redact."""
+        os.environ["_INTERNAL_LLMA_MULTIMODAL"] = "false"
+
+        base64_image = "data:image/jpeg;base64,/9j/4AAQSkZJRgABAQAAAQ..."
+        result = redact_base64_data_url(base64_image)
+        self.assertEqual(result, REDACTED_IMAGE_PLACEHOLDER)
+
+    def test_anthropic_multimodal_enabled(self):
+        """Anthropic images should be preserved when _INTERNAL_LLMA_MULTIMODAL is enabled."""
+        os.environ["_INTERNAL_LLMA_MULTIMODAL"] = "true"
+
+        input_data = [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "image",
+                        "source": {
+                            "type": "base64",
+                            "media_type": "image/jpeg",
+                            "data": "base64data",
+                        },
+                    }
+                ],
+            }
+        ]
+
+        result = sanitize_anthropic(input_data)
+        self.assertEqual(result[0]["content"][0]["source"]["data"], "base64data")
+
+    def test_gemini_multimodal_enabled(self):
+        """Gemini images should be preserved when _INTERNAL_LLMA_MULTIMODAL is enabled."""
+        os.environ["_INTERNAL_LLMA_MULTIMODAL"] = "true"
+
+        input_data = [
+            {
+                "parts": [
+                    {"inline_data": {"mime_type": "image/jpeg", "data": "base64data"}}
+                ]
+            }
+        ]
+
+        result = sanitize_gemini(input_data)
+        self.assertEqual(result[0]["parts"][0]["inline_data"]["data"], "base64data")
+
+    def test_langchain_anthropic_style_multimodal_enabled(self):
+        """LangChain Anthropic-style images should be preserved when _INTERNAL_LLMA_MULTIMODAL is enabled."""
+        os.environ["_INTERNAL_LLMA_MULTIMODAL"] = "true"
+
+        input_data = [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "image",
+                        "source": {"data": "base64data"},
+                    }
+                ],
+            }
+        ]
+
+        result = sanitize_langchain(input_data)
+        self.assertEqual(result[0]["content"][0]["source"]["data"], "base64data")
+
+    def test_openai_audio_redacted_by_default(self):
+        """OpenAI audio should be redacted when _INTERNAL_LLMA_MULTIMODAL is not set."""
+        if "_INTERNAL_LLMA_MULTIMODAL" in os.environ:
+            del os.environ["_INTERNAL_LLMA_MULTIMODAL"]
+
+        input_data = [
+            {
+                "role": "assistant",
+                "content": [
+                    {"type": "audio", "data": "base64audiodata", "id": "audio_123"}
+                ],
+            }
+        ]
+
+        result = sanitize_openai(input_data)
+        self.assertEqual(result[0]["content"][0]["data"], REDACTED_IMAGE_PLACEHOLDER)
+        self.assertEqual(result[0]["content"][0]["id"], "audio_123")
+
+    def test_openai_audio_preserved_with_flag(self):
+        """OpenAI audio should be preserved when _INTERNAL_LLMA_MULTIMODAL is enabled."""
+        os.environ["_INTERNAL_LLMA_MULTIMODAL"] = "true"
+
+        input_data = [
+            {
+                "role": "assistant",
+                "content": [
+                    {"type": "audio", "data": "base64audiodata", "id": "audio_123"}
+                ],
+            }
+        ]
+
+        result = sanitize_openai(input_data)
+        self.assertEqual(result[0]["content"][0]["data"], "base64audiodata")
+
+    def test_gemini_audio_redacted_by_default(self):
+        """Gemini audio should be redacted when _INTERNAL_LLMA_MULTIMODAL is not set."""
+        if "_INTERNAL_LLMA_MULTIMODAL" in os.environ:
+            del os.environ["_INTERNAL_LLMA_MULTIMODAL"]
+
+        input_data = [
+            {
+                "parts": [
+                    {
+                        "inline_data": {
+                            "mime_type": "audio/L16;codec=pcm;rate=24000",
+                            "data": "base64audiodata",
+                        }
+                    }
+                ]
+            }
+        ]
+
+        result = sanitize_gemini(input_data)
+        self.assertEqual(
+            result[0]["parts"][0]["inline_data"]["data"], REDACTED_IMAGE_PLACEHOLDER
+        )
+
+    def test_gemini_audio_preserved_with_flag(self):
+        """Gemini audio should be preserved when _INTERNAL_LLMA_MULTIMODAL is enabled."""
+        os.environ["_INTERNAL_LLMA_MULTIMODAL"] = "true"
+
+        input_data = [
+            {
+                "parts": [
+                    {
+                        "inline_data": {
+                            "mime_type": "audio/L16;codec=pcm;rate=24000",
+                            "data": "base64audiodata",
+                        }
+                    }
+                ]
+            }
+        ]
+
+        result = sanitize_gemini(input_data)
+        self.assertEqual(
+            result[0]["parts"][0]["inline_data"]["data"], "base64audiodata"
+        )
+
+
 if __name__ == "__main__":
     unittest.main()
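# To run just the new suite locally (standard pytest node-id syntax; the file
# path and class name come from the diff above):
#   pytest posthog/test/ai/test_sanitization.py::TestAIMultipartRequest -q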