Skip to content

Commit

Permalink
Prioritize Bash if LLM response includes both Bash and Conclusion (#716)
Browse files Browse the repository at this point in the history
Previously, when the LLM response included both bash commands and a
conclusion, we ignored the bash command and compiled the fuzz target
from the conclusion alone. This approach is suboptimal, as conclusions
may be premature, and the bash command often provides useful context to
refine them.

To improve accuracy, we should prioritize the bash command when both are
present in a response.

A more robust solution would involve structuring tasks to prevent mixed
responses (e.g., using task-focused agents). But until then,
prioritizing bash commands in mixed responses provides a
fallback/default solution.
  • Loading branch information
DonggeLiu authored Nov 11, 2024
1 parent e29ab08 commit 6ae19df
Show file tree
Hide file tree
Showing 2 changed files with 18 additions and 13 deletions.
21 changes: 10 additions & 11 deletions agent/base_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ def _filter_code(self, raw_code_block: str) -> str:
return filtered_code_block

def _format_bash_execution_result(self, process: sp.CompletedProcess) -> str:
"""Formats a prompt based on bash execution result."""
stdout = self.llm.truncate_prompt(process.stdout)
# TODO(dongge) Share input limit evenly if both stdout and stderr overlong.
stderr = self.llm.truncate_prompt(process.stderr, stdout)
Expand All @@ -76,18 +77,16 @@ def _format_bash_execution_result(self, process: sp.CompletedProcess) -> str:
f'<stdout>\n{stdout}\n</stdout>\n'
f'<stderr>\n{stderr}\n</stderr>\n')

def _container_handle_bash_command(self, command: str,
                                   tool: BaseTool) -> Prompt:
  """Executes |command| inside the container via |tool| and returns the
  formatted execution result as the next prompt."""
  # Run the command first, then render stdout/stderr into prompt text.
  execution_result = tool.execute(command)
  result_text = self._format_bash_execution_result(execution_result)
  builder = DefaultTemplateBuilder(self.llm, None, initial=result_text)
  return builder.build([])

def _container_handle_invalid_tool_usage(self, tool: BaseTool) -> Prompt:
  """Builds a corrective prompt that repeats the |tool| usage tutorial."""
  # Runtime string kept byte-identical to the original prompt text.
  reminder = ('No valid instruction received, Please follow the '
              f'interaction protocols:\n{tool.tutorial()}')
  builder = DefaultTemplateBuilder(self.llm, None, initial=reminder)
  return builder.build([])

def _sleep_random_duration(self, min_sec: int = 1, max_sec: int = 60) -> None:
Expand Down
10 changes: 8 additions & 2 deletions agent/prototyper.py
Original file line number Diff line number Diff line change
Expand Up @@ -221,11 +221,17 @@ def _container_handle_conclusion(
def _container_tool_reaction(self, cur_round: int, response: str,
                             build_result: BuildResult) -> Optional[Prompt]:
  """Validates LLM conclusion or executes its command."""
  # Bash takes precedence over a conclusion in the same response: the
  # conclusion may be premature, and running the command gathers context
  # that can refine it.
  bash_command = self._parse_tag(response, 'bash')
  if bash_command:
    return self._container_handle_bash_command(bash_command, self.inspect_tool)

  if self._parse_tag(response, 'conclusion'):
    return self._container_handle_conclusion(cur_round, response,
                                             build_result)

  # Neither a <bash> nor a <conclusion> tag: re-teach the protocol.
  logger.warning('ROUND %02d Invalid response from LLM: %s', cur_round,
                 response)
  return self._container_handle_invalid_tool_usage(self.inspect_tool)

def execute(self, result_history: list[Result]) -> BuildResult:
"""Executes the agent based on previous result."""
Expand Down

0 comments on commit 6ae19df

Please sign in to comment.