fix: improve contextual chunk headings (#118)

SimonJasansky · web-flow · commit 024dbf4f428f · 2025-05-05T16:09:14.000+02:00
diff --git a/Dockerfile b/Dockerfile
@@ -26,4 +26,8 @@ RUN mkdir ~/.history/ && \
     echo 'HISTFILE=~/.history/.bash_history' >> ~/.bashrc && \
     echo 'bind "\"\e[A\": history-search-backward"' >> ~/.bashrc && \
     echo 'bind "\"\e[B\": history-search-forward"' >> ~/.bashrc && \
-    echo 'eval "$(starship init bash)"' >> ~/.bashrc
+    echo 'eval "$(starship init bash)"' >> ~/.bashrc
+
+# Explicitly configure compilers for llama-cpp-python.
+ENV CMAKE_C_COMPILER=/usr/bin/gcc
+ENV CMAKE_CXX_COMPILER=/usr/bin/g++
diff --git a/pyproject.toml b/pyproject.toml
@@ -44,7 +44,7 @@ dependencies = [
   # CLI:
   "typer (>=0.15.1)",
   # Model Context Protocol:
-  "fastmcp (>=0.4.1)",
+  "fastmcp (>=2.0.0)",
   # Utilities:
   "packaging (>=23.0)",
 ]
@@ -82,7 +82,7 @@ dev = [
 # Frontend:
 chainlit = ["chainlit (>=2.0.0)"]
 # Large Language Models:
-llama-cpp-python = ["llama-cpp-python (>=0.3.3)"]
+llama-cpp-python = ["llama-cpp-python (>=0.3.4)"]
 # Markdown conversion:
 pandoc = ["pypandoc-binary (>=1.13)"]
 # Evaluation:
diff --git a/src/raglite/_chatml_function_calling.py b/src/raglite/_chatml_function_calling.py
@@ -290,7 +290,7 @@ def chatml_function_calling_with_streaming(
         # Assistant message
         "{% if message.role == 'assistant' %}"
         ## Regular message
-        "{% if message.content and message.content | length > 0 %}"
+        "{% if 'content' in message and message.content %}"
         "{% if tool_calls %}"
         "message:\n"
         "{% endif %}"
@@ -310,7 +310,6 @@ def chatml_function_calling_with_streaming(
         "{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}"
     )
     template_renderer = ImmutableSandboxedEnvironment(
-        autoescape=jinja2.select_autoescape(["html", "xml"]),
         undefined=jinja2.StrictUndefined,
     ).from_string(function_calling_template)
 
diff --git a/src/raglite/_database.py b/src/raglite/_database.py
@@ -119,26 +119,44 @@ def from_body(
             id=hash_bytes(f"{document_id}-{index}".encode()),
             document_id=document_id,
             index=index,
-            headings=headings,
+            headings=Chunk.truncate_headings(headings, body),
             body=body,
             metadata_=kwargs,
         )
 
-    def extract_headings(self) -> str:
-        """Extract Markdown headings from the chunk, starting from the current Markdown headings."""
+    @staticmethod
+    def extract_heading_lines(doc: str, leading_only: bool = False) -> list[str]:  # noqa: FBT001,FBT002
+        """Extract the leading or final state of the Markdown headings of a document."""
         md = MarkdownIt()
-        heading_lines = [""] * 10
+        heading_lines = [""] * 6  # One slot per heading level (1-6); "" means unset.
         level = None
-        for doc in (self.headings, self.body):
-            for token in md.parse(doc):
-                if token.type == "heading_open":
-                    level = int(token.tag[1])
-                elif token.type == "heading_close":
-                    level = None
-                elif level is not None:
-                    heading_content = token.content.strip().replace("\n", " ")
-                    heading_lines[level] = ("#" * level) + " " + heading_content
-                    heading_lines[level + 1 :] = [""] * len(heading_lines[level + 1 :])
+        for token in md.parse(doc):
+            if token.type == "heading_open":
+                level = int(token.tag[1]) if 1 <= int(token.tag[1]) <= 6 else None  # noqa: PLR2004
+            elif token.type == "heading_close":
+                level = None
+            elif level is not None:
+                heading_content = token.content.strip().replace("\n", " ")
+                heading_lines[level - 1] = ("#" * level) + " " + heading_content
+                heading_lines[level:] = [""] * len(heading_lines[level:])  # Clear deeper levels.
+            elif leading_only and level is None and token.content and not token.content.isspace():
+                break  # Leading headings end at the first non-heading content.
+        return heading_lines
+
+    @staticmethod
+    def truncate_headings(headings: str, body: str) -> str:
+        """Drop contextual headings at or deeper than the body's shallowest leading heading."""
+        context_lines = Chunk.extract_heading_lines(headings)
+        body_lines = Chunk.extract_heading_lines(body, leading_only=True)
+        # Index of the shallowest leading heading in the body; None if the body has none.
+        first = next((idx for idx, line in enumerate(body_lines) if line), None)
+        if first is not None:
+            context_lines = context_lines[:first]
+        return "\n".join([line for line in context_lines if line])
+
+    def extract_headings(self) -> str:
+        """Return the headings in effect at the end of the chunk, seeded by its contextual headings."""
+        heading_lines = self.extract_heading_lines(self.headings + "\n\n" + self.body)
         headings = "\n".join([heading for heading in heading_lines if heading])
         return headings
 
diff --git a/src/raglite/_mcp.py b/src/raglite/_mcp.py
@@ -1,6 +1,6 @@
 """MCP server for RAGLite."""
 
-from typing import Annotated
+from typing import Annotated, Any
 
 from fastmcp import FastMCP
 from pydantic import Field
@@ -20,9 +20,9 @@
 ]
 
 
-def create_mcp_server(server_name: str, *, config: RAGLiteConfig) -> FastMCP:
+def create_mcp_server(server_name: str, *, config: RAGLiteConfig) -> FastMCP[Any]:
     """Create a RAGLite MCP server."""
-    mcp = FastMCP(server_name)
+    mcp: FastMCP[Any] = FastMCP(server_name)
 
     @mcp.prompt()
     def kb(query: Query) -> str:
diff --git a/tests/test_insert.py b/tests/test_insert.py
@@ -0,0 +1,55 @@
+"""Tests for the _insert module."""
+
+from pathlib import Path
+
+from sqlmodel import Session, select
+from tqdm import tqdm
+
+from raglite._config import RAGLiteConfig
+from raglite._database import Chunk, Document, create_database_engine
+from raglite._markdown import document_to_markdown
+
+
+def test_insert(raglite_test_config: RAGLiteConfig) -> None:
+    """Check heading/body invariants of stored chunks and that their bodies restore the document."""
+    # Get access to the database from the raglite_test_config.
+    engine = create_database_engine(raglite_test_config)
+
+    # Open a session to extract the document and its chunks from the existing database.
+    with Session(engine) as session:
+        # Get the first document from the database (already inserted by the fixture).
+        document = session.exec(select(Document)).first()
+        assert document is not None, "No document found in the database"
+
+        # Get the existing chunks for this document, in document order.
+        chunks = session.exec(
+            select(Chunk).where(Chunk.document_id == document.id).order_by(Chunk.index)  # type: ignore[arg-type]
+        ).all()
+        assert len(chunks) > 0, "No chunks found for the document"
+        chunk_bodies: list[str] = []
+        for chunk in tqdm(chunks, desc="Processing chunks"):
+            # The body should not contain the heading string (unless the heading is empty).
+            if chunk.headings.strip() != "":
+                assert chunk.headings.strip() not in chunk.body.strip(), (
+                    f"Chunk body contains heading: '{chunk.headings.strip()}'\n"
+                    f"Chunk body: '{chunk.body.strip()}'"
+                )
+
+            # A body that starts with a level-1 heading should have no contextual headings.
+            if chunk.body.strip().startswith("# "):
+                assert chunk.headings.strip() == "", (
+                    f"Chunk body starts with a heading: '{chunk.body.strip()}'\n"
+                    f"Chunk headings: '{chunk.headings.strip()}'"
+                )
+
+            chunk_bodies.append(chunk.body)
+
+        # Combining the chunk bodies should yield the original document (ignoring newlines).
+        restored_document = "".join(chunk_bodies)
+        restored_document = restored_document.replace("\n", "").strip()
+
+        doc_path = Path(__file__).parent / "specrel.pdf"  # Einstein's special relativity paper.
+        doc = document_to_markdown(doc_path)
+        doc = doc.replace("\n", "").strip()
+
+        assert restored_document == doc, "Restored document does not match the original input."
diff --git a/tests/test_search.py b/tests/test_search.py
@@ -42,7 +42,10 @@ def test_search(raglite_test_config: RAGLiteConfig, search_method: SearchMethod)
     chunks = retrieve_chunks(chunk_ids, config=raglite_test_config)
     assert all(isinstance(chunk, Chunk) for chunk in chunks)
     assert all(chunk_id == chunk.id for chunk_id, chunk in zip(chunk_ids, chunks, strict=True))
-    assert any("Definition of Simultaneity" in str(chunk) for chunk in chunks)
+    assert any("Definition of Simultaneity" in str(chunk) for chunk in chunks), (
+        "Expected 'Definition of Simultaneity' in chunks but got:\n"
+        + "\n".join(f"- Chunk {i + 1}:\n{chunk!s}\n{'-' * 80}" for i, chunk in enumerate(chunks))
+    )
     assert all(isinstance(chunk.document, Document) for chunk in chunks)
     # Extend the chunks with their neighbours and group them into contiguous segments.
     chunk_spans = retrieve_chunk_spans(chunk_ids, neighbors=(-1, 1), config=raglite_test_config)