Skip to content

Commit cd4fcd9

Browse files
committed
schema extract clean json
1 parent 7b4e2d3 commit cd4fcd9

File tree

2 files changed

+93
-0
lines changed

2 files changed

+93
-0
lines changed

src/neo4j_graphrag/experimental/components/schema.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
from __future__ import annotations
1616

1717
import json
18+
import re
1819

1920
import neo4j
2021
import logging
@@ -554,6 +555,15 @@ def _filter_relationships_without_labels(
554555
relationship_types, "relationship type"
555556
)
556557

558+
def _clean_json_content(self, content: str) -> str:
    """Strip markdown code-fence wrapping from text that should be JSON.

    The text may be wrapped in a fenced block (three backticks, optionally
    tagged ``json`` in any letter case) and may carry prose before or
    after that block. This method returns only the text that is meant to
    be parsed; it does not parse or validate the JSON itself.

    Args:
        content: Raw text to clean.

    Returns:
        The text with surrounding whitespace and fence markers removed.
        Text that contains no fence markers is returned stripped but
        otherwise unchanged.
    """
    content = content.strip()

    # Text that already starts like a JSON document is never searched for
    # an inner fenced block: any backticks inside it are string data.
    if not content.startswith(("{", "[")):
        # Greedy on purpose: capture from the first opening fence to the
        # LAST closing fence, so triple backticks that appear inside a
        # JSON string value do not cut the payload short.
        fenced = re.search(
            r"```(?:json)?(.*)```", content, flags=re.DOTALL | re.IGNORECASE
        )
        if fenced:
            return fenced.group(1).strip()

    # No complete fenced block: drop a lone opening and/or closing marker.
    # \A and \Z anchor to the whole string rather than to every line.
    content = re.sub(r"\A```(?:json)?\s*", "", content, flags=re.IGNORECASE)
    content = re.sub(r"\s*```\Z", "", content)
    return content.strip()
566+
557567
@validate_call
558568
async def run(self, text: str, examples: str = "", **kwargs: Any) -> GraphSchema:
559569
"""
@@ -575,6 +585,9 @@ async def run(self, text: str, examples: str = "", **kwargs: Any) -> GraphSchema
575585
# Re-raise the LLMGenerationError
576586
raise LLMGenerationError("Failed to generate schema from text") from e
577587

588+
# Clean response
589+
content = self._clean_json_content(content)
590+
578591
try:
579592
extracted_schema: Dict[str, Any] = json.loads(content)
580593

tests/unit/experimental/components/test_schema.py

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -960,6 +960,86 @@ async def test_schema_from_text_filters_relationships_without_labels(
960960
assert ("Person", "MANAGES", "Organization") in schema.patterns
961961

962962

963+
@pytest.fixture
def valid_schema_json_with_markdown() -> str:
    """Return a schema JSON document wrapped in a ```json markdown fence.

    The document declares two node types (Person, Organization), one
    relationship type (WORKS_FOR) and one pattern linking them.
    """
    fenced_lines = (
        "```json",
        "{",
        '"node_types": [',
        "{",
        '"label": "Person",',
        '"properties": [',
        '{"name": "name", "type": "STRING"}',
        "]",
        "},",
        "{",
        '"label": "Organization",',
        '"properties": [',
        '{"name": "name", "type": "STRING"}',
        "]",
        "}",
        "],",
        '"relationship_types": [',
        "{",
        '"label": "WORKS_FOR",',
        '"properties": [',
        '{"name": "since", "type": "DATE"}',
        "]",
        "}",
        "],",
        '"patterns": [',
        '["Person", "WORKS_FOR", "Organization"]',
        "]",
        "}",
        "```",
    )
    # No trailing newline: the payload ends directly on the closing fence.
    return "\n".join(fenced_lines)
994+
995+
996+
@pytest.fixture
def valid_schema_json_with_markdown_no_language() -> str:
    """Return a schema JSON document wrapped in an untagged markdown fence.

    The document declares a single Person node type and nothing else.
    """
    fenced_lines = (
        "```",
        "{",
        '"node_types": [',
        "{",
        '"label": "Person",',
        '"properties": [',
        '{"name": "name", "type": "STRING"}',
        "]",
        "}",
        "]",
        "}",
        "```",
    )
    # No trailing newline: the payload ends directly on the closing fence.
    return "\n".join(fenced_lines)
1010+
1011+
1012+
def test_clean_json_content_markdown_with_json_language(
    schema_from_text: SchemaFromTextExtractor,
) -> None:
    """A payload fenced with a ``json`` tag is reduced to the bare JSON text."""
    expected_json = '{"node_types": [{"label": "Person"}]}'
    fenced_payload = "```json\n" + expected_json + "\n```"

    assert schema_from_text._clean_json_content(fenced_payload) == expected_json
1021+
1022+
1023+
def test_clean_json_content_markdown_without_language(
    schema_from_text: SchemaFromTextExtractor,
) -> None:
    """A payload fenced without a language tag is reduced to the bare JSON text."""
    expected_json = '{"node_types": [{"label": "Person"}]}'
    fenced_payload = "```\n" + expected_json + "\n```"

    assert schema_from_text._clean_json_content(fenced_payload) == expected_json
1032+
1033+
1034+
def test_clean_json_content_plain_json(
    schema_from_text: SchemaFromTextExtractor,
) -> None:
    """Unfenced JSON text passes through the cleaner unchanged."""
    plain_json = '{"node_types": [{"label": "Person"}]}'

    assert schema_from_text._clean_json_content(plain_json) == plain_json
1041+
1042+
9631043
@pytest.mark.asyncio
9641044
@patch("neo4j_graphrag.experimental.components.schema.get_structured_schema")
9651045
async def test_schema_from_existing_graph(mock_get_structured_schema: Mock) -> None:

0 commit comments

Comments
 (0)