Skip to content

Now gets the list of models from Ollama instead of hardcoding them; also improve browser navigation and error handling #331

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
106 changes: 87 additions & 19 deletions src/agent/custom_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -220,6 +220,7 @@ def update_step_info(

logger.info(f"🧠 All Memory: \n{step_info.memory}")


@time_execution_async("--get_next_action")
async def get_next_action(self, input_messages: list[BaseMessage]) -> AgentOutput:
"""Get next action from LLM based on current state"""
Expand All @@ -232,19 +233,80 @@ async def get_next_action(self, input_messages: list[BaseMessage]) -> AgentOutpu
logger.info(ai_message.reasoning_content)
logger.info("🤯 End Deep Thinking")

if isinstance(ai_message.content, list):
ai_content = ai_message.content[0]
else:
ai_content = ai_message.content

ai_content = ai_content.replace("```json", "").replace("```", "")
ai_content = repair_json(ai_content)
parsed_json = json.loads(ai_content)
parsed: AgentOutput = self.AgentOutput(**parsed_json)

if parsed is None:
logger.debug(ai_message.content)
raise ValueError('Could not parse response.')
try:
if isinstance(ai_message.content, list):
ai_content = ai_message.content[0]
else:
ai_content = ai_message.content

# Add this debug print
print("RAW AI CONTENT:", ai_content)

# Enhanced JSON parsing
if "```json" in ai_content or "```" in ai_content:
# Extract JSON from code blocks
ai_content = re.sub(r'```(?:json)?(.*?)```', r'\1', ai_content, flags=re.DOTALL)

# Try to repair the JSON
try:
ai_content = repair_json(ai_content)
except Exception as json_repair_error:
logger.warning(f"JSON repair failed: {json_repair_error}")
# Try more aggressive cleaning
ai_content = re.sub(r'[^{}[\],:"\d\w\s.-]', '', ai_content)

try:
parsed_json = json.loads(ai_content)
if 'action' in parsed_json:
for action in parsed_json['action']:
if isinstance(action, dict) and 'done' in action and isinstance(action['done'], dict) and 'text' in action['done']:
# If text is a dict with type/value structure, extract the value
if isinstance(action['done']['text'], dict) and 'value' in action['done']['text']:
action['done']['text'] = action['done']['text']['value']
# If text is any other non-string dict, convert to string
elif isinstance(action['done']['text'], dict):
action['done']['text'] = str(action['done']['text'])
parsed: AgentOutput = self.AgentOutput(**parsed_json)
except json.JSONDecodeError as e:
# Create a minimal valid structure if parsing fails
logger.warning("JSON parsing failed, creating minimal structure")
parsed_json = {
"current_state": {
"prev_action_evaluation": "Failed - Unable to parse model output",
"important_contents": "",
"task_progress": "",
"future_plans": "Retry with simpler action",
"thought": "The model output was malformed. I need to retry with a simpler action.",
"summary": "Retrying with simpler action"
},
"action": [{"extract_page_content": {}}] # Safe fallback action
}

parsed: AgentOutput = self.AgentOutput(**parsed_json)
except Exception as e:
logger.error(f"Error processing model output: {e}")
# Create a minimal fallback output
minimal_json = {
"current_state": {
"prev_action_evaluation": "Failed - Unable to process model output",
"important_contents": "",
"task_progress": "",
"future_plans": "Retry with simpler action",
"thought": "There was an error processing the model output. I'll take a safe action.",
"summary": "Handling error gracefully"
},
"action": [{"extract_page_content": {}}] # Safe fallback action
}
parsed = self.AgentOutput(**minimal_json)

# Continue with existing code...
if len(parsed.action) > 0:
first_action = parsed.action[0]
if hasattr(first_action, 'go_to_url'):
logger.info("Navigation action detected - executing in isolation")
parsed.action = [first_action]
elif hasattr(first_action, 'done'):
parsed.action = [first_action]

# Limit actions to maximum allowed per step
parsed.action = parsed.action[: self.max_actions_per_step]
Expand Down Expand Up @@ -310,6 +372,9 @@ async def _run_planner(self) -> Optional[str]:
@time_execution_async("--step")
async def step(self, step_info: Optional[CustomAgentStepInfo] = None) -> None:
"""Execute one step of the task"""
if not self.browser_context:
raise RuntimeError("Browser context not initialized")

logger.info(f"\n📍 Step {self.n_steps}")
state = None
model_output = None
Expand Down Expand Up @@ -352,14 +417,17 @@ async def step(self, step_info: Optional[CustomAgentStepInfo] = None) -> None:
check_break_if_paused=lambda: self._check_if_stopped_or_paused(),
available_file_paths=self.available_file_paths,
)
if len(result) != len(actions):
# I think something changes, such information should let LLM know
if len(result) != len(actions) and len(actions) > 0:
# Add safety check for result list
base_action_index = len(result) - 1 if len(result) > 0 else 0
for ri in range(len(result), len(actions)):
error_msg = f"{actions[ri].model_dump_json(exclude_unset=True)} is Failed to execute."
if len(result) > 0:
error_msg += f" Something new appeared after action {actions[base_action_index].model_dump_json(exclude_unset=True)}"
result.append(ActionResult(extracted_content=None,
include_in_memory=True,
error=f"{actions[ri].model_dump_json(exclude_unset=True)} is Failed to execute. \
Something new appeared after action {actions[len(result) - 1].model_dump_json(exclude_unset=True)}",
is_done=False))
include_in_memory=True,
error=error_msg,
is_done=False))
for ret_ in result:
if ret_.extracted_content and "Extracted page" in ret_.extracted_content:
# record every extracted page
Expand Down
2 changes: 1 addition & 1 deletion src/agent/custom_views.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ class CustomAgentBrain(BaseModel):
task_progress: str
future_plans: str
thought: str
summary: str
summary: str=""


class CustomAgentOutput(AgentOutput):
Expand Down
2 changes: 1 addition & 1 deletion src/utils/default_config_settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ def default_config():
"tool_calling_method": "auto",
"llm_provider": "openai",
"llm_model_name": "gpt-4o",
"llm_num_ctx": 32000,
"llm_num_ctx": 16000,
"llm_temperature": 1.0,
"llm_base_url": "",
"llm_api_key": "",
Expand Down
Loading