engkimo · engkimo · May 19, 2026 · May 19, 2026
diff --git a/infrastructure/fractal/llm_plan_evaluator.py b/infrastructure/fractal/llm_plan_evaluator.py
@@ -172,9 +172,12 @@ def _parse_evaluation(
             else:
                 raise
 
-        if isinstance(data, list) and data:
+        while isinstance(data, list) and data:
             data = data[0]
 
+        if not isinstance(data, dict):
+            raise ValueError(f"Expected JSON object, got {type(data).__name__}")
+
         completeness = _clamp(float(data.get("completeness", 0.5)))
         feasibility = _clamp(float(data.get("feasibility", 0.5)))
         safety = _clamp(float(data.get("safety", 1.0)))

diff --git a/tests/unit/infrastructure/test_llm_plan_evaluator.py b/tests/unit/infrastructure/test_llm_plan_evaluator.py
@@ -359,3 +359,27 @@ async def test_json_with_think_tags(self, llm: AsyncMock, evaluator: LLMPlanEval
         result = await evaluator.evaluate(plan, "Goal")
 
         assert result.completeness == pytest.approx(0.7, abs=0.01)
+
+    @pytest.mark.asyncio
+    async def test_nested_list_payload_unwrapped(
+        self, llm: AsyncMock, evaluator: LLMPlanEvaluator
+    ) -> None:
+        """Regression: a live A/B run observed `[[{...}]]` from the judge model.
+
+        One level of unwrapping still left a list, so `data.get` blew up and the
+        call fell back to (0.5, 0.5, 1.0). Lists must be unwrapped at any depth.
+        """
+        nested = (
+            "[[{"
+            '"completeness": 0.9, "feasibility": 0.8, "safety": 0.95, '
+            '"feedback": "nested"}]]'
+        )
+        llm.complete.return_value = _llm_response(nested)
+        plan = _sample_plan()
+
+        result = await evaluator.evaluate(plan, "Goal")
+
+        assert result.completeness == pytest.approx(0.9, abs=0.01)
+        assert result.feasibility == pytest.approx(0.8, abs=0.01)
+        assert result.safety == pytest.approx(0.95, abs=0.01)
+        assert "Fallback" not in result.feedback  # presumably marks the fallback path; confirm