From 80f8ca183834668b2a8d6147e05849f0ebdff768 Mon Sep 17 00:00:00 2001
From: HoangNB
Date: Wed, 12 Feb 2025 11:29:38 +0700
Subject: [PATCH 1/2] Add rate limiting configuration for LLM providers

- Introduce rate limit parameters (requests/sec and max bucket size) for all LLM providers
- Update webui.py to include new rate limit inputs in UI and function signatures
- Modify utils.py to create InMemoryRateLimiter for each LLM provider
- Add rate limiter configuration to all supported LLM models
---
 src/utils/utils.py | 81 ++++++++++++++++++++++++++++++++++++++++++++--
 webui.py           | 40 +++++++++++++++++++----
 2 files changed, 112 insertions(+), 9 deletions(-)

diff --git a/src/utils/utils.py b/src/utils/utils.py
index dc949ce3..24663c01 100644
--- a/src/utils/utils.py
+++ b/src/utils/utils.py
@@ -11,6 +11,7 @@
 from langchain_ollama import ChatOllama
 from langchain_openai import AzureChatOpenAI, ChatOpenAI
 import gradio as gr
+from langchain_core.rate_limiters import InMemoryRateLimiter
 
 from .llm import DeepSeekR1ChatOpenAI, DeepSeekR1ChatOllama
 
@@ -29,6 +30,9 @@ def get_llm_model(provider: str, **kwargs):
     :param kwargs:
     :return:
     """
+    rate_limit_rps = kwargs.get("rate_limit_rps", 1.0)
+    rate_limit_bucket = kwargs.get("rate_limit_bucket", 10)
+
     if provider not in ["ollama"]:
         env_var = f"{provider.upper()}_API_KEY"
         api_key = kwargs.get("api_key", "") or os.getenv(env_var, "")
@@ -42,11 +46,21 @@ def get_llm_model(provider: str, **kwargs):
         else:
             base_url = kwargs.get("base_url")
 
+        # Create rate limiter
+        rate_limiter = InMemoryRateLimiter(
+            requests_per_second=rate_limit_rps,
+            check_every_n_seconds=0.1,
+            max_bucket_size=rate_limit_bucket
+        )
+
         return ChatAnthropic(
             model_name=kwargs.get("model_name", "claude-3-5-sonnet-20240620"),
             temperature=kwargs.get("temperature", 0.0),
             base_url=base_url,
             api_key=api_key,
+            timeout=kwargs.get("timeout", 60),
+            stop_sequences=kwargs.get("stop", []),
+            rate_limiter=rate_limiter,
         )
     elif provider == 'mistral':
         if not kwargs.get("base_url", ""):
@@ -58,11 +72,19 @@ def get_llm_model(provider: str, **kwargs):
         else:
             api_key = kwargs.get("api_key")
 
+        # Create rate limiter
+        rate_limiter = InMemoryRateLimiter(
+            requests_per_second=rate_limit_rps,
+            check_every_n_seconds=0.1,
+            max_bucket_size=rate_limit_bucket
+        )
+
         return ChatMistralAI(
-            model=kwargs.get("model_name", "mistral-large-latest"),
+            model_name=kwargs.get("model_name", "mistral-large-latest"),
             temperature=kwargs.get("temperature", 0.0),
             base_url=base_url,
             api_key=api_key,
+            rate_limiter=rate_limiter,
         )
     elif provider == "openai":
         if not kwargs.get("base_url", ""):
@@ -70,11 +92,19 @@ def get_llm_model(provider: str, **kwargs):
         else:
             base_url = kwargs.get("base_url")
 
+        # Create rate limiter
+        rate_limiter = InMemoryRateLimiter(
+            requests_per_second=rate_limit_rps,
+            check_every_n_seconds=0.1,
+            max_bucket_size=rate_limit_bucket
+        )
+
         return ChatOpenAI(
             model=kwargs.get("model_name", "gpt-4o"),
             temperature=kwargs.get("temperature", 0.0),
             base_url=base_url,
             api_key=api_key,
+            rate_limiter=rate_limiter,
         )
     elif provider == "deepseek":
         if not kwargs.get("base_url", ""):
@@ -83,24 +113,48 @@ def get_llm_model(provider: str, **kwargs):
             base_url = kwargs.get("base_url")
 
         if kwargs.get("model_name", "deepseek-chat") == "deepseek-reasoner":
+            # Create rate limiter
+            rate_limiter = InMemoryRateLimiter(
+                requests_per_second=rate_limit_rps,
+                check_every_n_seconds=0.1,
+                max_bucket_size=rate_limit_bucket
+            )
+
             return DeepSeekR1ChatOpenAI(
                 model=kwargs.get("model_name", "deepseek-reasoner"),
                 temperature=kwargs.get("temperature", 0.0),
                 base_url=base_url,
                 api_key=api_key,
+                rate_limiter=rate_limiter,
             )
         else:
+            # Create rate limiter
+            rate_limiter = InMemoryRateLimiter(
+                requests_per_second=rate_limit_rps,
+                check_every_n_seconds=0.1,
+                max_bucket_size=rate_limit_bucket
+            )
+
             return ChatOpenAI(
                 model=kwargs.get("model_name", "deepseek-chat"),
                 temperature=kwargs.get("temperature", 0.0),
                 base_url=base_url,
                 api_key=api_key,
+                rate_limiter=rate_limiter,
             )
     elif provider == "google":
+        # Create rate limiter
+        rate_limiter = InMemoryRateLimiter(
+            requests_per_second=rate_limit_rps,
+            check_every_n_seconds=0.1,
+            max_bucket_size=rate_limit_bucket
+        )
+
         return ChatGoogleGenerativeAI(
             model=kwargs.get("model_name", "gemini-2.0-flash-exp"),
             temperature=kwargs.get("temperature", 0.0),
-            google_api_key=api_key,
+            api_key=api_key,
+            rate_limiter=rate_limiter,
         )
     elif provider == "ollama":
         if not kwargs.get("base_url", ""):
@@ -109,19 +163,35 @@ def get_llm_model(provider: str, **kwargs):
             base_url = kwargs.get("base_url")
 
         if "deepseek-r1" in kwargs.get("model_name", "qwen2.5:7b"):
+            # Create rate limiter
+            rate_limiter = InMemoryRateLimiter(
+                requests_per_second=rate_limit_rps,
+                check_every_n_seconds=0.1,
+                max_bucket_size=rate_limit_bucket
+            )
+
             return DeepSeekR1ChatOllama(
                 model=kwargs.get("model_name", "deepseek-r1:14b"),
                 temperature=kwargs.get("temperature", 0.0),
                 num_ctx=kwargs.get("num_ctx", 32000),
                 base_url=base_url,
+                rate_limiter=rate_limiter,
             )
         else:
+            # Create rate limiter
+            rate_limiter = InMemoryRateLimiter(
+                requests_per_second=rate_limit_rps,
+                check_every_n_seconds=0.1,
+                max_bucket_size=rate_limit_bucket
+            )
+
             return ChatOllama(
                 model=kwargs.get("model_name", "qwen2.5:7b"),
                 temperature=kwargs.get("temperature", 0.0),
                 num_ctx=kwargs.get("num_ctx", 32000),
                 num_predict=kwargs.get("num_predict", 1024),
                 base_url=base_url,
+                rate_limiter=rate_limiter,
             )
     elif provider == "azure_openai":
         if not kwargs.get("base_url", ""):
@@ -129,12 +199,19 @@ def get_llm_model(provider: str, **kwargs):
         else:
             base_url = kwargs.get("base_url")
         api_version = kwargs.get("api_version", "") or os.getenv("AZURE_OPENAI_API_VERSION", "2025-01-01-preview")
+        # Create rate limiter
+        rate_limiter = InMemoryRateLimiter(
+            requests_per_second=rate_limit_rps,
+            check_every_n_seconds=0.1,
+            max_bucket_size=rate_limit_bucket
+        )
         return AzureChatOpenAI(
             model=kwargs.get("model_name", "gpt-4o"),
             temperature=kwargs.get("temperature", 0.0),
             api_version=api_version,
             azure_endpoint=base_url,
             api_key=api_key,
+            rate_limiter=rate_limiter,
         )
     else:
         raise ValueError(f"Unsupported provider: {provider}")
diff --git a/webui.py b/webui.py
index 8e9d6b20..34bc409e 100644
--- a/webui.py
+++ b/webui.py
@@ -117,7 +117,9 @@ async def run_browser_agent(
         max_steps,
         use_vision,
         max_actions_per_step,
-        tool_calling_method
+        tool_calling_method,
+        rate_limit_rps,
+        rate_limit_bucket
 ):
     global _global_agent_state
     _global_agent_state.clear_stop()  # Clear any previous stop requests
@@ -146,6 +148,8 @@ async def run_browser_agent(
         temperature=llm_temperature,
         base_url=llm_base_url,
         api_key=llm_api_key,
+        rate_limit_rps=rate_limit_rps,
+        rate_limit_bucket=rate_limit_bucket
     )
     if agent_type == "org":
         final_result, errors, model_actions, model_thoughts, trace_file, history_file = await run_org_agent(
@@ -449,7 +453,9 @@ async def run_with_stream(
         max_steps,
         use_vision,
         max_actions_per_step,
-        tool_calling_method
+        tool_calling_method,
+        rate_limit_rps,
+        rate_limit_bucket
 ):
     global _global_agent_state
     stream_vw = 80
@@ -477,7 +483,9 @@ async def run_with_stream(
             max_steps=max_steps,
             use_vision=use_vision,
             max_actions_per_step=max_actions_per_step,
-            tool_calling_method=tool_calling_method
+            tool_calling_method=tool_calling_method,
+            rate_limit_rps=rate_limit_rps,
+            rate_limit_bucket=rate_limit_bucket
         )
         # Add HTML content at the start of the result array
         html_content = f"<h1 style='width:{stream_vw}vw; height:{stream_vh}vh'>Using browser...</h1>"
@@ -509,7 +517,9 @@ async def run_with_stream(
                 max_steps=max_steps,
                 use_vision=use_vision,
                 max_actions_per_step=max_actions_per_step,
-                tool_calling_method=tool_calling_method
+                tool_calling_method=tool_calling_method,
+                rate_limit_rps=rate_limit_rps,
+                rate_limit_bucket=rate_limit_bucket
             )
         )
 
@@ -623,7 +633,7 @@ async def close_global_browser():
         await _global_browser.close()
         _global_browser = None
 
-async def run_deep_search(research_task, max_search_iteration_input, max_query_per_iter_input, llm_provider, llm_model_name, llm_temperature, llm_base_url, llm_api_key, use_vision, use_own_browser, headless):
+async def run_deep_search(research_task, max_search_iteration_input, max_query_per_iter_input, llm_provider, llm_model_name, llm_temperature, llm_base_url, llm_api_key, use_vision, use_own_browser, headless, rate_limit_rps, rate_limit_bucket):
     from src.utils.deep_research import deep_research
     global _global_agent_state
 
@@ -636,6 +646,8 @@ async def run_deep_search(research_task, max_search_iteration_input, max_query_p
         temperature=llm_temperature,
         base_url=llm_base_url,
         api_key=llm_api_key,
+        rate_limit_rps=rate_limit_rps,
+        rate_limit_bucket=rate_limit_bucket
     )
     markdown_content, file_path = await deep_research(research_task, llm, _global_agent_state,
                                                       max_search_iterations=max_search_iteration_input,
@@ -756,6 +768,19 @@ def create_ui(config, theme_name="Ocean"):
                     value=config['llm_api_key'],
                     info="Your API key (leave blank to use .env)"
                 )
+                with gr.Row():
+                    rate_limit_rps = gr.Number(
+                        label="Requests/sec",
+                        value=config.get('rate_limit_rps', 1),
+                        precision=1,
+                        info="Max requests per second"
+                    )
+                    rate_limit_bucket = gr.Number(
+                        label="Max Bucket Size",
+                        value=config.get('rate_limit_bucket', 10),
+                        precision=0,
+                        info="Maximum burst capacity"
+                    )
 
         with gr.TabItem("🌐 Browser Settings", id=3):
             with gr.Group():
@@ -902,7 +927,8 @@ def create_ui(config, theme_name="Ocean"):
                 agent_type, llm_provider, llm_model_name, llm_temperature, llm_base_url, llm_api_key,
                 use_own_browser, keep_browser_open, headless, disable_security, window_w, window_h,
                 save_recording_path, save_agent_history_path, save_trace_path,  # Include the new path
-                enable_recording, task, add_infos, max_steps, use_vision, max_actions_per_step, tool_calling_method
+                enable_recording, task, add_infos, max_steps, use_vision, max_actions_per_step, tool_calling_method,
+                rate_limit_rps, rate_limit_bucket
             ],
             outputs=[
                 browser_view,  # Browser view
@@ -921,7 +947,7 @@ def create_ui(config, theme_name="Ocean"):
         # Run Deep Research
         research_button.click(
             fn=run_deep_search,
-            inputs=[research_task_input, max_search_iteration_input, max_query_per_iter_input, llm_provider, llm_model_name, llm_temperature, llm_base_url, llm_api_key, use_vision, use_own_browser, headless],
+            inputs=[research_task_input, max_search_iteration_input, max_query_per_iter_input, llm_provider, llm_model_name, llm_temperature, llm_base_url, llm_api_key, use_vision, use_own_browser, headless, rate_limit_rps, rate_limit_bucket],
             outputs=[markdown_output_display, markdown_download, stop_research_button, research_button]
         )
         # Bind the stop button click event after errors_output is defined

From b797f1f71d51f4f4c52b2de49b3190b9b38e7bc1 Mon Sep 17 00:00:00 2001
From: HoangNB
Date: Wed, 12 Feb 2025 12:00:03 +0700
Subject: [PATCH 2/2] Refactor rate limiter initialization in LLM provider configuration

- Move rate limiter creation to a single location at the beginning of the function
- Update parameter names for specific providers (e.g., Mistral's `model` instead of `model_name`)
- Fix Google provider's API key parameter to `google_api_key`
- Remove redundant configuration options and simplify model initialization
---
 src/utils/utils.py | 72 ++++++----------------------------------------
 1 file changed, 9 insertions(+), 63 deletions(-)

diff --git a/src/utils/utils.py b/src/utils/utils.py
index 24663c01..5a3f4b5e 100644
--- a/src/utils/utils.py
+++ b/src/utils/utils.py
@@ -32,7 +32,13 @@ def get_llm_model(provider: str, **kwargs):
     """
     rate_limit_rps = kwargs.get("rate_limit_rps", 1.0)
     rate_limit_bucket = kwargs.get("rate_limit_bucket", 10)
-
+    # Create rate limiter
+    rate_limiter = InMemoryRateLimiter(
+        requests_per_second=rate_limit_rps,
+        check_every_n_seconds=0.1,
+        max_bucket_size=rate_limit_bucket
+    )
+
     if provider not in ["ollama"]:
         env_var = f"{provider.upper()}_API_KEY"
         api_key = kwargs.get("api_key", "") or os.getenv(env_var, "")
@@ -46,20 +52,11 @@ def get_llm_model(provider: str, **kwargs):
         else:
             base_url = kwargs.get("base_url")
 
-        # Create rate limiter
-        rate_limiter = InMemoryRateLimiter(
-            requests_per_second=rate_limit_rps,
-            check_every_n_seconds=0.1,
-            max_bucket_size=rate_limit_bucket
-        )
-
         return ChatAnthropic(
             model_name=kwargs.get("model_name", "claude-3-5-sonnet-20240620"),
             temperature=kwargs.get("temperature", 0.0),
             base_url=base_url,
             api_key=api_key,
-            timeout=kwargs.get("timeout", 60),
-            stop_sequences=kwargs.get("stop", []),
             rate_limiter=rate_limiter,
         )
     elif provider == 'mistral':
@@ -72,15 +69,8 @@ def get_llm_model(provider: str, **kwargs):
         else:
             api_key = kwargs.get("api_key")
 
-        # Create rate limiter
-        rate_limiter = InMemoryRateLimiter(
-            requests_per_second=rate_limit_rps,
-            check_every_n_seconds=0.1,
-            max_bucket_size=rate_limit_bucket
-        )
-
         return ChatMistralAI(
-            model_name=kwargs.get("model_name", "mistral-large-latest"),
+            model=kwargs.get("model_name", "mistral-large-latest"),
             temperature=kwargs.get("temperature", 0.0),
             base_url=base_url,
             api_key=api_key,
@@ -92,13 +82,6 @@ def get_llm_model(provider: str, **kwargs):
         else:
             base_url = kwargs.get("base_url")
 
-        # Create rate limiter
-        rate_limiter = InMemoryRateLimiter(
-            requests_per_second=rate_limit_rps,
-            check_every_n_seconds=0.1,
-            max_bucket_size=rate_limit_bucket
-        )
-
         return ChatOpenAI(
             model=kwargs.get("model_name", "gpt-4o"),
             temperature=kwargs.get("temperature", 0.0),
@@ -113,13 +96,6 @@ def get_llm_model(provider: str, **kwargs):
             base_url = kwargs.get("base_url")
 
         if kwargs.get("model_name", "deepseek-chat") == "deepseek-reasoner":
-            # Create rate limiter
-            rate_limiter = InMemoryRateLimiter(
-                requests_per_second=rate_limit_rps,
-                check_every_n_seconds=0.1,
-                max_bucket_size=rate_limit_bucket
-            )
-
             return DeepSeekR1ChatOpenAI(
                 model=kwargs.get("model_name", "deepseek-reasoner"),
                 temperature=kwargs.get("temperature", 0.0),
@@ -128,12 +104,6 @@ def get_llm_model(provider: str, **kwargs):
                 rate_limiter=rate_limiter,
             )
         else:
-            # Create rate limiter
-            rate_limiter = InMemoryRateLimiter(
-                requests_per_second=rate_limit_rps,
-                check_every_n_seconds=0.1,
-                max_bucket_size=rate_limit_bucket
-            )
 
             return ChatOpenAI(
                 model=kwargs.get("model_name", "deepseek-chat"),
@@ -143,17 +113,11 @@ def get_llm_model(provider: str, **kwargs):
                 rate_limiter=rate_limiter,
             )
     elif provider == "google":
-        # Create rate limiter
-        rate_limiter = InMemoryRateLimiter(
-            requests_per_second=rate_limit_rps,
-            check_every_n_seconds=0.1,
-            max_bucket_size=rate_limit_bucket
-        )
 
         return ChatGoogleGenerativeAI(
             model=kwargs.get("model_name", "gemini-2.0-flash-exp"),
             temperature=kwargs.get("temperature", 0.0),
-            api_key=api_key,
+            google_api_key=api_key,
             rate_limiter=rate_limiter,
         )
     elif provider == "ollama":
@@ -163,12 +127,6 @@ def get_llm_model(provider: str, **kwargs):
             base_url = kwargs.get("base_url")
 
         if "deepseek-r1" in kwargs.get("model_name", "qwen2.5:7b"):
-            # Create rate limiter
-            rate_limiter = InMemoryRateLimiter(
-                requests_per_second=rate_limit_rps,
-                check_every_n_seconds=0.1,
-                max_bucket_size=rate_limit_bucket
-            )
 
             return DeepSeekR1ChatOllama(
                 model=kwargs.get("model_name", "deepseek-r1:14b"),
@@ -178,12 +136,6 @@ def get_llm_model(provider: str, **kwargs):
                 rate_limiter=rate_limiter,
             )
         else:
-            # Create rate limiter
-            rate_limiter = InMemoryRateLimiter(
-                requests_per_second=rate_limit_rps,
-                check_every_n_seconds=0.1,
-                max_bucket_size=rate_limit_bucket
-            )
 
             return ChatOllama(
                 model=kwargs.get("model_name", "qwen2.5:7b"),
@@ -199,12 +151,6 @@ def get_llm_model(provider: str, **kwargs):
         else:
             base_url = kwargs.get("base_url")
         api_version = kwargs.get("api_version", "") or os.getenv("AZURE_OPENAI_API_VERSION", "2025-01-01-preview")
-        # Create rate limiter
-        rate_limiter = InMemoryRateLimiter(
-            requests_per_second=rate_limit_rps,
-            check_every_n_seconds=0.1,
-            max_bucket_size=rate_limit_bucket
-        )
         return AzureChatOpenAI(
             model=kwargs.get("model_name", "gpt-4o"),
             temperature=kwargs.get("temperature", 0.0),
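
Note on the resulting behavior: after PATCH 2/2, get_llm_model builds one shared token-bucket InMemoryRateLimiter up front and passes it to whichever chat model it returns. Below is a minimal standalone sketch of the same pattern, assuming langchain-core >= 0.2.24 (which provides InMemoryRateLimiter and the rate_limiter parameter on chat models) and the defaults used in these patches (1 request/sec, bucket size 10):

```python
from langchain_core.rate_limiters import InMemoryRateLimiter
from langchain_openai import ChatOpenAI

# Token bucket: each request consumes one token; tokens refill at
# requests_per_second, and a request blocks (polling every
# check_every_n_seconds) until a token is available. max_bucket_size
# caps how large a burst can grow after an idle period.
rate_limiter = InMemoryRateLimiter(
    requests_per_second=1.0,
    check_every_n_seconds=0.1,
    max_bucket_size=10,
)

# Any LangChain chat model accepts the limiter directly.
llm = ChatOpenAI(model="gpt-4o", rate_limiter=rate_limiter)

# The patched factory wires up the same thing from its new kwargs:
#   llm = get_llm_model("openai", model_name="gpt-4o",
#                       rate_limit_rps=1.0, rate_limit_bucket=10)
```

InMemoryRateLimiter throttles when a request starts, not by token usage, and its bucket lives in process memory, so it smooths bursts against provider rate limits rather than enforcing a hard or cross-process quota.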