CHANGELOG.md: 15 additions & 0 deletions
@@ -5,52 +5,67 @@
## [2025-10-26]

### llama-index-core [0.14.6]

- Add allow_parallel_tool_calls for non-streaming ([#20117](https://github.com/run-llama/llama_index/pull/20117))
- Fix invalid use of field-specific metadata ([#20122](https://github.com/run-llama/llama_index/pull/20122))
- update doc for SemanticSplitterNodeParser ([#20125](https://github.com/run-llama/llama_index/pull/20125))
- fix rare cases when sentence splits are larger than chunk size ([#20147](https://github.com/run-llama/llama_index/pull/20147))

### llama-index-embeddings-bedrock [0.7.0]

- Fix BedrockEmbedding to support Cohere v4 response format ([#20094](https://github.com/run-llama/llama_index/pull/20094))

### llama-index-embeddings-isaacus [0.1.0]

- feat: Isaacus embeddings integration ([#20124](https://github.com/run-llama/llama_index/pull/20124))

### llama-index-embeddings-oci-genai [0.4.2]

- Update OCI GenAI cohere models ([#20146](https://github.com/run-llama/llama_index/pull/20146))

### llama-index-llms-anthropic [0.9.7]

- Fix double token stream in anthropic llm ([#20108](https://github.com/run-llama/llama_index/pull/20108))
- Ensure anthropic content delta only has user facing response ([#20113](https://github.com/run-llama/llama_index/pull/20113))

### llama-index-llms-baseten [0.1.7]

- add GLM ([#20121](https://github.com/run-llama/llama_index/pull/20121))

### llama-index-llms-helicone [0.1.0]

- integrate helicone to llama-index ([#20131](https://github.com/run-llama/llama_index/pull/20131))

### llama-index-llms-oci-genai [0.6.4]

- Update OCI GenAI cohere models ([#20146](https://github.com/run-llama/llama_index/pull/20146))

### llama-index-llms-openai [0.6.5]

- chore: openai vbump ([#20095](https://github.com/run-llama/llama_index/pull/20095))

### llama-index-readers-imdb-review [0.4.2]

- chore: Update selenium dependency in imdb-review reader ([#20105](https://github.com/run-llama/llama_index/pull/20105))

### llama-index-retrievers-bedrock [0.5.0]

- feat(bedrock): add async support for AmazonKnowledgeBasesRetriever ([#20114](https://github.com/run-llama/llama_index/pull/20114))

### llama-index-retrievers-superlinked [0.1.3]

- Update README.md ([#19829](https://github.com/run-llama/llama_index/pull/19829))

### llama-index-storage-kvstore-postgres [0.4.2]

- fix: Replace raw SQL string interpolation with proper SQLAlchemy parameterized APIs in PostgresKVStore ([#20104](https://github.com/run-llama/llama_index/pull/20104))

### llama-index-tools-mcp [0.4.3]

- Fix BasicMCPClient resource signatures ([#20118](https://github.com/run-llama/llama_index/pull/20118))

### llama-index-vector-stores-postgres [0.7.1]

- Add GIN index support for text array metadata in PostgreSQL vector store ([#20130](https://github.com/run-llama/llama_index/pull/20130))

## [2025-10-15]
@@ -16,6 +16,7 @@
Type,
Union,
Callable,
+ Literal,
)


@@ -139,9 +140,9 @@ class GoogleGenAI(FunctionCallingLLM):
default=None,
description="Google GenAI tool to use for the model to augment responses.",
)
- use_file_api: bool = Field(
- default=True,
- description="Whether or not to use the FileAPI for large files (>20MB).",
+ file_mode: Literal["inline", "fileapi", "hybrid"] = Field(
+ default="hybrid",
+ description="Whether to use inline-only, FileAPI-only or both for handling files.",
)

_max_tokens: int = PrivateAttr()
@@ -165,7 +166,7 @@ def __init__(
is_function_calling_model: bool = True,
cached_content: Optional[str] = None,
built_in_tool: Optional[types.Tool] = None,
- use_file_api: bool = True,
+ file_mode: Literal["inline", "fileapi", "hybrid"] = "hybrid",
**kwargs: Any,
):
# API keys are optional. The API can be authorised via OAuth (detected
@@ -214,7 +215,7 @@ def __init__(
max_retries=max_retries,
cached_content=cached_content,
built_in_tool=built_in_tool,
- use_file_api=use_file_api,
+ file_mode=file_mode,
**kwargs,
)

@@ -307,20 +308,17 @@ def _chat(self, messages: Sequence[ChatMessage], **kwargs: Any):
**kwargs.pop("generation_config", {}),
}
params = {**kwargs, "generation_config": generation_config}
- next_msg, chat_kwargs = asyncio.run(
+ next_msg, chat_kwargs, file_api_names = asyncio.run(
prepare_chat_params(
- self.model, messages, self.use_file_api, self._client, **params
+ self.model, messages, self.file_mode, self._client, **params
)
)
chat = self._client.chats.create(**chat_kwargs)
response = chat.send_message(
next_msg.parts if isinstance(next_msg, types.Content) else next_msg
)

- if self.use_file_api:
- asyncio.run(
- delete_uploaded_files([*chat_kwargs["history"], next_msg], self._client)
- )
+ asyncio.run(delete_uploaded_files(file_api_names, self._client))

return chat_from_gemini_response(response)

@@ -331,18 +329,15 @@ async def _achat(self, messages: Sequence[ChatMessage], **kwargs: Any):
**kwargs.pop("generation_config", {}),
}
params = {**kwargs, "generation_config": generation_config}
- next_msg, chat_kwargs = await prepare_chat_params(
- self.model, messages, self.use_file_api, self._client, **params
+ next_msg, chat_kwargs, file_api_names = await prepare_chat_params(
+ self.model, messages, self.file_mode, self._client, **params
)
chat = self._client.aio.chats.create(**chat_kwargs)
response = await chat.send_message(
next_msg.parts if isinstance(next_msg, types.Content) else next_msg
)

- if self.use_file_api:
- await delete_uploaded_files(
- [*chat_kwargs["history"], next_msg], self._client
- )
+ await delete_uploaded_files(file_api_names, self._client)

return chat_from_gemini_response(response)

@@ -364,9 +359,9 @@ def _stream_chat(
**kwargs.pop("generation_config", {}),
}
params = {**kwargs, "generation_config": generation_config}
- next_msg, chat_kwargs = asyncio.run(
+ next_msg, chat_kwargs, file_api_names = asyncio.run(
prepare_chat_params(
- self.model, messages, self.use_file_api, self._client, **params
+ self.model, messages, self.file_mode, self._client, **params
)
)
chat = self._client.chats.create(**chat_kwargs)
@@ -399,12 +394,8 @@ def gen() -> ChatResponseGen:
llama_resp.message.additional_kwargs["tool_calls"] = existing_tool_calls
yield llama_resp

- if self.use_file_api:
- asyncio.run(
- delete_uploaded_files(
- [*chat_kwargs["history"], next_msg], self._client
- )
- )
+ if self.file_mode in ("fileapi", "hybrid"):
+ asyncio.run(delete_uploaded_files(file_api_names, self._client))

return gen()

@@ -422,8 +413,8 @@ async def _astream_chat(
**kwargs.pop("generation_config", {}),
}
params = {**kwargs, "generation_config": generation_config}
- next_msg, chat_kwargs = await prepare_chat_params(
- self.model, messages, self.use_file_api, self._client, **params
+ next_msg, chat_kwargs, file_api_names = await prepare_chat_params(
+ self.model, messages, self.file_mode, self._client, **params
)
chat = self._client.aio.chats.create(**chat_kwargs)

@@ -463,10 +454,7 @@ async def gen() -> ChatResponseAsyncGen:
)
yield llama_resp

- if self.use_file_api:
- await delete_uploaded_files(
- [*chat_kwargs["history"], next_msg], self._client
- )
+ await delete_uploaded_files(file_api_names, self._client)

return gen()

@@ -585,12 +573,13 @@ def structured_predict_without_function_calling(
llm_kwargs = llm_kwargs or {}

messages = prompt.format_messages(**prompt_args)
- contents = [
- asyncio.run(
- chat_message_to_gemini(message, self.use_file_api, self._client)
- )
+ contents_and_names = [
+ asyncio.run(chat_message_to_gemini(message, self.file_mode, self._client))
for message in messages
]
+ contents = [it[0] for it in contents_and_names]
+ file_api_names = [name for it in contents_and_names for name in it[1]]

response = self._client.models.generate_content(
model=self.model,
contents=contents,
@@ -605,8 +594,7 @@ def structured_predict_without_function_calling(
},
)

- if self.use_file_api:
- asyncio.run(delete_uploaded_files(contents, self._client))
+ asyncio.run(delete_uploaded_files(file_api_names, self._client))

if isinstance(response.parsed, BaseModel):
return response.parsed
@@ -635,20 +623,22 @@ def structured_predict(
generation_config["response_schema"] = output_cls

messages = prompt.format_messages(**prompt_args)
- contents = [
+ contents_and_names = [
asyncio.run(
- chat_message_to_gemini(message, self.use_file_api, self._client)
+ chat_message_to_gemini(message, self.file_mode, self._client)
)
for message in messages
]
+ contents = [it[0] for it in contents_and_names]
+ file_api_names = [name for it in contents_and_names for name in it[1]]

response = self._client.models.generate_content(
model=self.model,
contents=contents,
config=generation_config,
)

- if self.use_file_api:
- asyncio.run(delete_uploaded_files(contents, self._client))
+ asyncio.run(delete_uploaded_files(file_api_names, self._client))

if isinstance(response.parsed, BaseModel):
return response.parsed
@@ -682,20 +672,22 @@ async def astructured_predict(
generation_config["response_schema"] = output_cls

messages = prompt.format_messages(**prompt_args)
- contents = await asyncio.gather(
+ contents_and_names = await asyncio.gather(
*[
- chat_message_to_gemini(message, self.use_file_api, self._client)
+ chat_message_to_gemini(message, self.file_mode, self._client)
for message in messages
]
)
+ contents = [it[0] for it in contents_and_names]
+ file_api_names = [name for it in contents_and_names for name in it[1]]

response = await self._client.aio.models.generate_content(
model=self.model,
contents=contents,
config=generation_config,
)

- if self.use_file_api:
- await delete_uploaded_files(contents, self._client)
+ await delete_uploaded_files(file_api_names, self._client)

if isinstance(response.parsed, BaseModel):
return response.parsed
@@ -729,12 +721,14 @@ def stream_structured_predict(
generation_config["response_schema"] = output_cls

messages = prompt.format_messages(**prompt_args)
- contents = [
+ contents_and_names = [
asyncio.run(
- chat_message_to_gemini(message, self.use_file_api, self._client)
+ chat_message_to_gemini(message, self.file_mode, self._client)
)
for message in messages
]
+ contents = [it[0] for it in contents_and_names]
+ file_api_names = [name for it in contents_and_names for name in it[1]]

def gen() -> Generator[Union[Model, FlexibleModel], None, None]:
flexible_model = create_flexible_model(output_cls)
@@ -758,8 +752,7 @@ def gen() -> Generator[Union[Model, FlexibleModel], None, None]:
if streaming_model:
yield streaming_model

- if self.use_file_api:
- asyncio.run(delete_uploaded_files(contents, self._client))
+ asyncio.run(delete_uploaded_files(file_api_names, self._client))

return gen()
else:
@@ -789,12 +782,14 @@ async def astream_structured_predict(
generation_config["response_schema"] = output_cls

messages = prompt.format_messages(**prompt_args)
- contents = await asyncio.gather(
+ contents_and_names = await asyncio.gather(
*[
- chat_message_to_gemini(message, self.use_file_api, self._client)
+ chat_message_to_gemini(message, self.file_mode, self._client)
for message in messages
]
)
+ contents = [it[0] for it in contents_and_names]
+ file_api_names = [name for it in contents_and_names for name in it[1]]

async def gen() -> AsyncGenerator[Union[Model, FlexibleModel], None]:
flexible_model = create_flexible_model(output_cls)
@@ -818,8 +813,7 @@ async def gen() -> AsyncGenerator[Union[Model, FlexibleModel], None]:
if streaming_model:
yield streaming_model

- if self.use_file_api:
- await delete_uploaded_files(contents, self._client)
+ await delete_uploaded_files(file_api_names, self._client)

return gen()
else:
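For readers skimming the diff, here is a minimal usage sketch of the new `file_mode` option, which replaces the old `use_file_api` flag. The import paths, model id, and message content are illustrative assumptions; only the `file_mode` values and its `"hybrid"` default come from the diff above.

```python
# Minimal sketch, not part of the PR. Assumes the usual
# llama-index-llms-google-genai import path and an example model id.
from llama_index.core.llms import ChatMessage
from llama_index.llms.google_genai import GoogleGenAI

llm = GoogleGenAI(
    model="gemini-2.0-flash",  # illustrative model id, not taken from the diff
    # Per the new field description: "inline" handles files inline only,
    # "fileapi" uses the File API only, "hybrid" (the default) uses both.
    file_mode="hybrid",
)

response = llm.chat([ChatMessage(role="user", content="Hello!")])
print(response.message.content)
```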
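And a rough sketch of the refactored cleanup flow implied by the changed call sites: `prepare_chat_params` now returns the uploaded File API resource names as a third value, and callers pass that list to `delete_uploaded_files` instead of rescanning the chat history. The helper import location is an assumption; signatures are read from this diff, not verified against the released package.

```python
# Sketch of the new async chat flow as read from the diff above.
from google.genai import types
from llama_index.llms.google_genai.utils import (  # assumed module path
    chat_from_gemini_response,
    delete_uploaded_files,
    prepare_chat_params,
)


async def chat_once(llm, messages, **params):
    # Third return value: names of files uploaded via the File API for this call.
    next_msg, chat_kwargs, file_api_names = await prepare_chat_params(
        llm.model, messages, llm.file_mode, llm._client, **params
    )
    chat = llm._client.aio.chats.create(**chat_kwargs)
    response = await chat.send_message(
        next_msg.parts if isinstance(next_msg, types.Content) else next_msg
    )
    # Delete exactly what was uploaded (an empty list when nothing used the File API).
    await delete_uploaded_files(file_api_names, llm._client)
    return chat_from_gemini_response(response)
```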