From 23a9295c7f64d99cf3bdc2cd96d2ffdc30016911 Mon Sep 17 00:00:00 2001 From: engkimo Date: Tue, 19 May 2026 12:19:13 +0900 Subject: [PATCH] fix(plan-evaluator): unwrap arbitrarily-nested JSON list payloads Live A/B run (haiku_planner_ab_2026_05_19) observed the judge model returning `[[{...}]]` 3 times. The single-level guard only popped one layer, then `data.get` blew up on the inner list and the call fell back to (0.5, 0.5, 1.0). Replace `if` with `while` so any depth is unwrapped, and raise on non-dict so the existing exception handler takes the fallback path explicitly. Co-Authored-By: Claude Opus 4.7 --- infrastructure/fractal/llm_plan_evaluator.py | 5 +++- .../infrastructure/test_llm_plan_evaluator.py | 24 +++++++++++++++++++ 2 files changed, 28 insertions(+), 1 deletion(-) diff --git a/infrastructure/fractal/llm_plan_evaluator.py b/infrastructure/fractal/llm_plan_evaluator.py index bd92f0a..7de7e02 100644 --- a/infrastructure/fractal/llm_plan_evaluator.py +++ b/infrastructure/fractal/llm_plan_evaluator.py @@ -172,9 +172,12 @@ def _parse_evaluation( else: raise - if isinstance(data, list) and data: + while isinstance(data, list) and data: data = data[0] + if not isinstance(data, dict): + raise ValueError(f"Expected JSON object, got {type(data).__name__}") + completeness = _clamp(float(data.get("completeness", 0.5))) feasibility = _clamp(float(data.get("feasibility", 0.5))) safety = _clamp(float(data.get("safety", 1.0))) diff --git a/tests/unit/infrastructure/test_llm_plan_evaluator.py b/tests/unit/infrastructure/test_llm_plan_evaluator.py index c2e6c2d..071b7c6 100644 --- a/tests/unit/infrastructure/test_llm_plan_evaluator.py +++ b/tests/unit/infrastructure/test_llm_plan_evaluator.py @@ -359,3 +359,27 @@ async def test_json_with_think_tags(self, llm: AsyncMock, evaluator: LLMPlanEval result = await evaluator.evaluate(plan, "Goal") assert result.completeness == pytest.approx(0.7, abs=0.01) + + @pytest.mark.asyncio + async def 
test_nested_list_payload_unwrapped( + self, llm: AsyncMock, evaluator: LLMPlanEvaluator + ) -> None: + """Regression: live A/B run observed `[[{...}]]` from the judge model. + + The single-level guard was not enough — the subsequent `data.get` blew up + and the call fell back to (0.5, 0.5, 1.0). Should unwrap any depth. + """ + nested = ( + "[[{" + '"completeness": 0.9, "feasibility": 0.8, "safety": 0.95, ' + '"feedback": "nested"}]]' + ) + llm.complete.return_value = _llm_response(nested) + plan = _sample_plan() + + result = await evaluator.evaluate(plan, "Goal") + + assert result.completeness == pytest.approx(0.9, abs=0.01) + assert result.feasibility == pytest.approx(0.8, abs=0.01) + assert result.safety == pytest.approx(0.95, abs=0.01) + assert "Fallback" not in result.feedback