Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion infrastructure/fractal/llm_plan_evaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -172,9 +172,12 @@ def _parse_evaluation(
else:
raise

if isinstance(data, list) and data:
while isinstance(data, list) and data:
data = data[0]

if not isinstance(data, dict):
raise ValueError(f"Expected JSON object, got {type(data).__name__}")

completeness = _clamp(float(data.get("completeness", 0.5)))
feasibility = _clamp(float(data.get("feasibility", 0.5)))
safety = _clamp(float(data.get("safety", 1.0)))
Expand Down
24 changes: 24 additions & 0 deletions tests/unit/infrastructure/test_llm_plan_evaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -359,3 +359,27 @@ async def test_json_with_think_tags(self, llm: AsyncMock, evaluator: LLMPlanEval
result = await evaluator.evaluate(plan, "Goal")

assert result.completeness == pytest.approx(0.7, abs=0.01)

@pytest.mark.asyncio
async def test_nested_list_payload_unwrapped(
    self, llm: AsyncMock, evaluator: LLMPlanEvaluator
) -> None:
    """A doubly nested `[[{...}]]` judge reply must still yield its scores.

    Regression: a live A/B run observed the judge model wrapping its JSON
    object in two lists. The single-level guard was not enough — the
    following `data.get` blew up and the call fell back to (0.5, 0.5, 1.0).
    The evaluator should unwrap any depth of list nesting.
    """
    # The scores object wrapped in two list levels, as seen from the judge.
    payload = (
        "[[{"
        '"completeness": 0.9, "feasibility": 0.8, "safety": 0.95, '
        '"feedback": "nested"}]]'
    )
    llm.complete.return_value = _llm_response(payload)

    outcome = await evaluator.evaluate(_sample_plan(), "Goal")

    # Every score must come from the payload, not from the fallback defaults.
    expected_scores = {"completeness": 0.9, "feasibility": 0.8, "safety": 0.95}
    for field_name, expected in expected_scores.items():
        assert getattr(outcome, field_name) == pytest.approx(expected, abs=0.01)
    assert "Fallback" not in outcome.feedback
Loading