From 23a9295c7f64d99cf3bdc2cd96d2ffdc30016911 Mon Sep 17 00:00:00 2001
From: engkimo <dailyrandor@gmail.com>
Date: Tue, 19 May 2026 12:19:13 +0900
Subject: [PATCH] fix(plan-evaluator): unwrap arbitrarily-nested JSON list
 payloads

Live A/B run (haiku_planner_ab_2026_05_19) observed the judge model
returning `[[{...}]]` 3 times. The single-level guard only popped one
layer, then `data.get` blew up on the inner list and the call fell
back to (0.5, 0.5, 1.0). Replace `if` with `while` so any depth is
unwrapped, and raise on non-dict so the existing exception handler
takes the fallback path explicitly.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 infrastructure/fractal/llm_plan_evaluator.py  |  5 +++-
 .../infrastructure/test_llm_plan_evaluator.py | 24 +++++++++++++++++++
 2 files changed, 28 insertions(+), 1 deletion(-)

diff --git a/infrastructure/fractal/llm_plan_evaluator.py b/infrastructure/fractal/llm_plan_evaluator.py
index bd92f0a..7de7e02 100644
--- a/infrastructure/fractal/llm_plan_evaluator.py
+++ b/infrastructure/fractal/llm_plan_evaluator.py
@@ -172,9 +172,12 @@ def _parse_evaluation(
             else:
                 raise
 
-        if isinstance(data, list) and data:
+        while isinstance(data, list) and data:
             data = data[0]
 
+        if not isinstance(data, dict):
+            raise ValueError(f"Expected JSON object, got {type(data).__name__}")
+
         completeness = _clamp(float(data.get("completeness", 0.5)))
         feasibility = _clamp(float(data.get("feasibility", 0.5)))
         safety = _clamp(float(data.get("safety", 1.0)))
diff --git a/tests/unit/infrastructure/test_llm_plan_evaluator.py b/tests/unit/infrastructure/test_llm_plan_evaluator.py
index c2e6c2d..071b7c6 100644
--- a/tests/unit/infrastructure/test_llm_plan_evaluator.py
+++ b/tests/unit/infrastructure/test_llm_plan_evaluator.py
@@ -359,3 +359,27 @@ async def test_json_with_think_tags(self, llm: AsyncMock, evaluator: LLMPlanEval
         result = await evaluator.evaluate(plan, "Goal")
 
         assert result.completeness == pytest.approx(0.7, abs=0.01)
+
+    @pytest.mark.asyncio
+    async def test_nested_list_payload_unwrapped(
+        self, llm: AsyncMock, evaluator: LLMPlanEvaluator
+    ) -> None:
+        """Regression: live A/B run observed `[[{...}]]` from the judge model.
+
+        The single-level guard popped only one layer, so `data.get` raised on the
+        inner list and the call fell back to (0.5, 0.5, 1.0). Must unwrap any depth.
+        """
+        nested = (
+            "[[{"
+            '"completeness": 0.9, "feasibility": 0.8, "safety": 0.95, '
+            '"feedback": "nested"}]]'
+        )
+        llm.complete.return_value = _llm_response(nested)
+        plan = _sample_plan()
+
+        result = await evaluator.evaluate(plan, "Goal")
+
+        assert result.completeness == pytest.approx(0.9, abs=0.01)
+        assert result.feasibility == pytest.approx(0.8, abs=0.01)
+        assert result.safety == pytest.approx(0.95, abs=0.01)
+        assert "Fallback" not in result.feedback