diff --git a/DocTest/DocumentRepresentation.py b/DocTest/DocumentRepresentation.py
index b4ab734..9ed97e3 100644
--- a/DocTest/DocumentRepresentation.py
+++ b/DocTest/DocumentRepresentation.py
@@ -17,6 +17,7 @@
     PageStructure,
     StructureExtractionConfig,
     build_page_structure,
+    build_page_structure_from_words,
 )
 from DocTest.config import DEFAULT_DPI, OCR_ENGINE_DEFAULT, DEFAULT_CONFIDENCE, MINIMUM_OCR_RESOLUTION, ADD_PIXELS_TO_IGNORE_AREA, TESSERACT_CONFIG
 import tempfile
@@ -197,13 +198,27 @@ def get_pdf_structure(self, config: Optional[StructureExtractionConfig] = None)
         cached = self._structure_cache.get(config)
         if cached:
             return cached
-        structure = build_page_structure(
-            page_number=self.page_number,
-            pdf_dict=self.pdf_text_dict,
-            config=config,
-            dpi=self.dpi,
-            image_shape=self.image.shape,
-        )
+        if config.spatial_word_sorting and self.pdf_text_words:
+            # Derive page dimensions from the dict if available.
+            pw = float(self.pdf_text_dict.get("width", 0)) if self.pdf_text_dict else 0.0
+            ph = float(self.pdf_text_dict.get("height", 0)) if self.pdf_text_dict else 0.0
+            structure = build_page_structure_from_words(
+                page_number=self.page_number,
+                pdf_text_words=self.pdf_text_words,
+                config=config,
+                page_width=pw,
+                page_height=ph,
+                dpi=self.dpi,
+                image_shape=self.image.shape,
+            )
+        else:
+            structure = build_page_structure(
+                page_number=self.page_number,
+                pdf_dict=self.pdf_text_dict,
+                config=config,
+                dpi=self.dpi,
+                image_shape=self.image.shape,
+            )
         self._structure_cache[config] = structure
         return structure
 
diff --git a/DocTest/HeaderFooterDetector.py b/DocTest/HeaderFooterDetector.py
new file mode 100644
index 0000000..b31534f
--- /dev/null
+++ b/DocTest/HeaderFooterDetector.py
@@ -0,0 +1,230 @@
+"""Repetition-based header/footer detection for PDF structure comparison.
+
+Scans configurable vertical regions at the top/bottom of each page, identifies
+text lines that repeat across multiple pages (with digit normalization for page
+numbers), and removes them from the DocumentStructure before comparison.
+
+This module is a pure-function domain service with no side effects, no Robot
+Framework dependency, and no I/O.
+"""
+
+from __future__ import annotations
+
+import re
+from collections import defaultdict
+from dataclasses import dataclass
+from typing import Dict, FrozenSet, List, Set
+
+from DocTest.PdfStructureModels import (
+    DocumentStructure,
+    PageStructure,
+    TextBlock,
+    TextLine,
+)
+
+__all__ = [
+    "HeaderFooterConfig",
+    "DetectionResult",
+    "detect_repeating_headers_footers",
+    "strip_detected_headers_footers",
+    "filter_headers_footers",
+]
+
+_DIGIT_RUN_RE = re.compile(r"\d+")
+
+
+@dataclass(frozen=True)
+class HeaderFooterConfig:
+    """Immutable settings for finding headers/footers by cross-page repetition."""
+
+    header_scan_height: float = 0.0  # depth of the top scan band; <= 0 turns it off
+    footer_scan_height: float = 0.0  # depth of the bottom scan band; <= 0 turns it off
+    repeat_threshold: int = 2  # minimum number of distinct pages a line must occur on
+
+    @property
+    def enabled(self) -> bool:
+        """Tell whether either scan band has a positive height."""
+        return any(h > 0 for h in (self.header_scan_height, self.footer_scan_height))
+
+
+@dataclass(frozen=True)
+class DetectionResult:
+    """Frozen pair of normalized-key sets flagged as repeating headers and footers."""
+
+    header_keys: FrozenSet[str]
+    footer_keys: FrozenSet[str]
+
+    @property
+    def has_detections(self) -> bool:
+        return any((self.header_keys, self.footer_keys))
+
+
+def _normalize_for_grouping(text: str) -> str:
+    """Collapse every run of digits in *text* to a single '#'.
+
+    Lines that differ only in their numbers then share one grouping key:
+        "Page 1 of 5"  -> "Page # of #"
+        "- 3 -"        -> "- # -"
+        "ACME Corp"    -> "ACME Corp"   (no digits, unchanged)
+    """
+    return re.sub(_DIGIT_RUN_RE, "#", text)
+
+
+def detect_repeating_headers_footers(
+    structure: DocumentStructure,
+    config: HeaderFooterConfig,
+) -> DetectionResult:
+    """Find line texts that recur on several pages inside the scan bands.
+
+    Every non-empty line is reduced to its digit-normalized key.  A key counts
+    as a header when lines whose top edge (``bbox[1]``) lies above
+    ``config.header_scan_height`` carry it on at least ``config.repeat_threshold``
+    distinct page numbers; footers are judged the same way using the bottom
+    edge (``bbox[3]``) against ``page.height - config.footer_scan_height``.
+
+    Args:
+        structure: Document whose pages, blocks and lines are scanned.
+        config: Scan-band heights and the repetition threshold.
+
+    Returns:
+        DetectionResult holding the header keys and footer keys; both sets
+        are empty when ``config.enabled`` is False.
+    """
+    if not config.enabled:
+        return DetectionResult(header_keys=frozenset(), footer_keys=frozenset())
+
+    scan_header = config.header_scan_height > 0
+    scan_footer = config.footer_scan_height > 0
+    pages_by_header_key: Dict[str, Set[int]] = defaultdict(set)
+    pages_by_footer_key: Dict[str, Set[int]] = defaultdict(set)
+
+    for page in structure.pages:
+        footer_top = page.height - config.footer_scan_height
+        page_lines = (ln for blk in page.blocks for ln in blk.lines)
+        for ln in page_lines:
+            if not ln.text:
+                continue
+            group_key = _normalize_for_grouping(ln.text)
+            # Top edge inside the header band?
+            if scan_header and ln.bbox[1] < config.header_scan_height:
+                pages_by_header_key[group_key].add(page.page_number)
+            # Bottom edge inside the footer band?
+            if scan_footer and ln.bbox[3] > footer_top:
+                pages_by_footer_key[group_key].add(page.page_number)
+
+    def _frequent(pages_by_key: Dict[str, Set[int]]) -> FrozenSet[str]:
+        needed = config.repeat_threshold
+        return frozenset(k for k, seen in pages_by_key.items() if len(seen) >= needed)
+
+    return DetectionResult(
+        header_keys=_frequent(pages_by_header_key),
+        footer_keys=_frequent(pages_by_footer_key),
+    )
+
+
+def strip_detected_headers_footers(
+    structure: DocumentStructure,
+    detection: DetectionResult,
+    config: HeaderFooterConfig,
+) -> DocumentStructure:
+    """Build a DocumentStructure without the detected header/footer lines.
+
+    A line is dropped only when both conditions hold: its digit-normalized
+    text is one of the detected keys, and it sits inside the matching scan
+    band.  The same text further down the page body is therefore kept.
+    Surviving lines are re-created with indexes renumbered from 0 on every
+    page; blocks left without lines are omitted, while kept blocks retain
+    their original index and bbox.
+
+    Args:
+        structure: Document to filter; it is not modified.
+        detection: Outcome of detect_repeating_headers_footers().
+        config: Config used for detection; supplies the scan-band bounds.
+
+    Returns:
+        A new DocumentStructure carrying ``structure.config``, or *structure*
+        itself when *detection* holds no keys.
+    """
+    if not detection.has_detections:
+        return structure
+
+    header_limit = config.header_scan_height
+    footer_depth = config.footer_scan_height
+
+    def _is_header(bbox, key: str) -> bool:
+        # Top edge inside the header band and text flagged as a header.
+        return header_limit > 0 and bbox[1] < header_limit and key in detection.header_keys
+
+    def _is_footer(bbox, key: str, footer_top: float) -> bool:
+        # Bottom edge inside the footer band and text flagged as a footer.
+        return footer_depth > 0 and bbox[3] > footer_top and key in detection.footer_keys
+
+    result_pages: List[PageStructure] = []
+    for page in structure.pages:
+        footer_top = page.height - footer_depth
+        kept_blocks: List[TextBlock] = []
+        line_counter = 0  # running index over the lines kept on this page
+
+        for block in page.blocks:
+            kept_lines: List[TextLine] = []
+
+            for line in block.lines:
+                content = line.text or ""
+                key = _normalize_for_grouping(content)
+                if _is_header(line.bbox, key) or _is_footer(line.bbox, key, footer_top):
+                    continue
+                kept_lines.append(
+                    TextLine(
+                        index=line_counter,
+                        text=content,
+                        bbox=line.bbox,
+                        fonts=set(line.fonts),
+                        spans=list(line.spans),
+                    )
+                )
+                line_counter += 1
+
+            if not kept_lines:
+                continue
+            kept_blocks.append(
+                TextBlock(
+                    index=block.index,
+                    bbox=block.bbox,
+                    lines=kept_lines,
+                )
+            )
+
+        result_pages.append(
+            PageStructure(
+                page_number=page.page_number,
+                width=page.width,
+                height=page.height,
+                blocks=kept_blocks,
+            )
+        )
+
+    return DocumentStructure(pages=result_pages, config=structure.config)
+
+
+def filter_headers_footers(
+    structure: DocumentStructure,
+    config: HeaderFooterConfig,
+) -> DocumentStructure:
+    """Detect repeating headers/footers and strip them in a single step.
+
+    Runs the two public stages back to back:
+        detection = detect_repeating_headers_footers(structure, config)
+        return strip_detected_headers_footers(structure, detection, config)
+
+    Args:
+        structure: Document to clean up.
+        config: Scan-band heights and repetition threshold.
+
+    Returns:
+        The input object itself when ``config.enabled`` is False; otherwise
+        whatever strip_detected_headers_footers() returns for the detection.
+    """
+    if config.enabled:
+        found = detect_repeating_headers_footers(structure, config)
+        return strip_detected_headers_footers(structure, found, config)
+    return structure
diff --git a/DocTest/PdfStructureComparator.py b/DocTest/PdfStructureComparator.py
index 54db9cd..1cc3a05 100644
--- a/DocTest/PdfStructureComparator.py
+++ b/DocTest/PdfStructureComparator.py
@@ -4,16 +4,18 @@
 from dataclasses import dataclass, field
 from typing import Dict, Iterable, List, Optional, Sequence, Tuple
 
-from DocTest.PdfStructureModels import DocumentStructure, PageStructure, TextLine
+from DocTest.PdfStructureModels import DocumentStructure, PageStructure, TextLine, WordToken
 
 
 __all__ = [
     "StructureTolerance",
     "LineDifference",
     "DocumentTextDifference",
+    "DocumentWordDifference",
     "StructureComparisonResult",
     "compare_document_structures",
     "compare_document_text_only",
+    "compare_document_words",
 ]
 
 
@@ -54,6 +56,20 @@ class DocumentTextDifference:
     cand_index: Optional[int] = None
 
 
+@dataclass
+class DocumentWordDifference:
+    """One word-level mismatch found by the page-agnostic word comparison."""
+
+    diff_type: str  # "missing_words", "extra_words", "word_mismatch"
+    message: str  # human-readable description of the difference
+    ref_words: Optional[List[str]] = None  # affected reference words; None when that side is empty
+    cand_words: Optional[List[str]] = None  # affected candidate words; None when that side is empty
+    ref_start_index: Optional[int] = None  # first affected position on the reference side
+    ref_end_index: Optional[int] = None  # end of the reference range (exclusive, slice-style)
+    cand_start_index: Optional[int] = None  # first affected position on the candidate side
+    cand_end_index: Optional[int] = None  # end of the candidate range (exclusive, slice-style)
+
+
 @dataclass
 class StructureComparisonResult:
     """Aggregate differences found during structure comparison."""
@@ -61,6 +77,7 @@ class StructureComparisonResult:
     passed: bool = True
     page_differences: Dict[int, List[LineDifference]] = field(default_factory=dict)
     document_differences: List[DocumentTextDifference] = field(default_factory=list)
+    word_differences: List[DocumentWordDifference] = field(default_factory=list)
     summary: List[str] = field(default_factory=list)
 
     def add_difference(self, diff: LineDifference):
@@ -72,13 +89,18 @@ def add_document_difference(self, diff: DocumentTextDifference):
         self.passed = False
         self.document_differences.append(diff)
 
+    def add_word_difference(self, diff: DocumentWordDifference):
+        """Add a document-level word difference."""
+        self.passed = False
+        self.word_differences.append(diff)
+
     def extend_summary(self, message: str):
         self.summary.append(message)
 
     def difference_count(self) -> int:
-        """Return total count of all differences (page-level and document-level)."""
+        """Return total count of all differences (page-level, document-level, and word-level)."""
         page_diff_count = sum(len(diffs) for diffs in self.page_differences.values())
-        return page_diff_count + len(self.document_differences)
+        return page_diff_count + len(self.document_differences) + len(self.word_differences)
 
 
 def compare_document_structures(
@@ -229,6 +251,210 @@ def compare_document_text_only(
     return result
 
 
+def _compare_words_unordered(
+    ref_words: List[str],
+    ref_originals: List[str],
+    cand_words: List[str],
+    cand_originals: List[str],
+) -> StructureComparisonResult:
+    """Compare two word lists as multisets, disregarding word order.
+
+    Only word frequencies matter here, so content that merely moved (for
+    example because text reflowed onto another page) produces no difference.
+
+    Args:
+        ref_words: Reference words in the form used for matching.
+        ref_originals: Report spellings, aligned index by index with ``ref_words``.
+        cand_words: Candidate words in the form used for matching.
+        cand_originals: Report spellings, aligned index by index with ``cand_words``.
+
+    Returns:
+        A StructureComparisonResult with at most one ``missing_words`` record
+        (surplus in the reference) and at most one ``extra_words`` record
+        (surplus in the candidate).
+    """
+    from collections import Counter
+
+    def _surplus_originals(normalized, originals, surplus) -> List[str]:
+        # Walk the words in document order and take the first occurrences of
+        # every over-represented word, keeping its report spelling.
+        budget = dict(surplus)
+        picked: List[str] = []
+        for norm_word, orig_word in zip(normalized, originals):
+            if budget.get(norm_word, 0) > 0:
+                picked.append(orig_word)
+                budget[norm_word] -= 1
+        return picked
+
+    def _preview(picked: List[str]) -> str:
+        # Show the first ten words, note how many were left out, cap the length.
+        head = " ".join(picked[:10])
+        if len(picked) > 10:
+            head += f" ... (+{len(picked) - 10} more)"
+        return _truncate_text(head, 120)
+
+    result = StructureComparisonResult()
+    ref_counts = Counter(ref_words)
+    cand_counts = Counter(cand_words)
+
+    # Counter subtraction keeps positive counts only, i.e. the surplus per side.
+    missing = _surplus_originals(ref_words, ref_originals, ref_counts - cand_counts)
+    if missing:
+        result.add_word_difference(
+            DocumentWordDifference(
+                diff_type="missing_words",
+                message=f"Words in reference not found in candidate (unordered): '{_preview(missing)}'",
+                ref_words=missing,
+                ref_start_index=0,
+                ref_end_index=len(missing),
+            )
+        )
+
+    extra = _surplus_originals(cand_words, cand_originals, cand_counts - ref_counts)
+    if extra:
+        result.add_word_difference(
+            DocumentWordDifference(
+                diff_type="extra_words",
+                message=f"Extra words in candidate not found in reference (unordered): '{_preview(extra)}'",
+                cand_words=extra,
+                cand_start_index=0,
+                cand_end_index=len(extra),
+            )
+        )
+
+    return result
+
+
+def compare_document_words(
+    reference: DocumentStructure,
+    candidate: DocumentStructure,
+    *,
+    case_sensitive: bool = True,
+    normalize_ligatures: bool = False,
+    normalize_word_boundaries: bool = False,
+    compare_order: str = "ordered",
+) -> StructureComparisonResult:
+    """Diff two documents word by word, independent of line and page breaks.
+
+    Both documents are flattened into word lists.  In ordered mode every
+    non-equal SequenceMatcher opcode becomes one difference record:
+    ``replace`` -> ``word_mismatch``, ``delete`` -> ``missing_words`` and
+    ``insert`` -> ``extra_words``.  Index ranges stored on a record are
+    half-open (``start`` inclusive, ``end`` exclusive), whereas the positions
+    quoted in its message are inclusive.
+
+    Args:
+        reference: The reference document structure.
+        candidate: The candidate document structure to compare.
+        case_sensitive: When False, words are lower-cased for matching;
+            reported words keep the spelling returned by the flattening step.
+        normalize_ligatures: Forwarded to ``flatten_document_words`` as
+            ``normalize_ligatures_in_words``.
+        normalize_word_boundaries: Forwarded to ``flatten_document_words``
+            under the same name.
+        compare_order: ``"unordered"`` selects the frequency-only comparison
+            in ``_compare_words_unordered``; any other value, including the
+            default ``"ordered"``, selects the SequenceMatcher comparison.
+
+    Returns:
+        A StructureComparisonResult whose ``word_differences`` list the
+        mismatches found.
+    """
+    from DocTest.PdfStructureModels import flatten_document_words
+
+    flatten_options = {
+        "normalize_word_boundaries": normalize_word_boundaries,
+        "normalize_ligatures_in_words": normalize_ligatures,
+    }
+    # The token lists carrying provenance are not needed by this comparison.
+    ref_originals, _ref_tokens = flatten_document_words(
+        reference,
+        **flatten_options,
+    )
+    cand_originals, _cand_tokens = flatten_document_words(
+        candidate,
+        **flatten_options,
+    )
+
+    # Match on the (optionally lower-cased) words, report the original ones.
+    if case_sensitive:
+        ref_words, cand_words = list(ref_originals), list(cand_originals)
+    else:
+        ref_words = [word.lower() for word in ref_originals]
+        cand_words = [word.lower() for word in cand_originals]
+
+    if compare_order == "unordered":
+        return _compare_words_unordered(
+            ref_words,
+            ref_originals,
+            cand_words,
+            cand_originals,
+        )
+
+    def _quoted(words: Optional[List[str]]) -> str:
+        # Space-joined preview of a slice, shortened to 80 characters.
+        return _truncate_text(" ".join(words) if words else "", 80)
+
+    result = StructureComparisonResult()
+    matcher = difflib.SequenceMatcher(a=ref_words, b=cand_words, autojunk=False)
+
+    for tag, i1, i2, j1, j2 in matcher.get_opcodes():
+        # An empty side is reported as None rather than as an empty list.
+        ref_slice = ref_originals[i1:i2] if i2 > i1 else None
+        cand_slice = cand_originals[j1:j2] if j2 > j1 else None
+
+        if tag == "replace":
+            difference = DocumentWordDifference(
+                diff_type="word_mismatch",
+                message=(
+                    f"Word mismatch at positions {i1}-{i2 - 1}: "
+                    f"reference='{_quoted(ref_slice)}', "
+                    f"candidate='{_quoted(cand_slice)}'"
+                ),
+                ref_words=ref_slice,
+                cand_words=cand_slice,
+                ref_start_index=i1,
+                ref_end_index=i2,
+                cand_start_index=j1,
+                cand_end_index=j2,
+            )
+        elif tag == "delete":
+            difference = DocumentWordDifference(
+                diff_type="missing_words",
+                message=(
+                    f"Words missing in candidate at positions {i1}-{i2 - 1}: "
+                    f"'{_quoted(ref_slice)}'"
+                ),
+                ref_words=ref_slice,
+                ref_start_index=i1,
+                ref_end_index=i2,
+            )
+        elif tag == "insert":
+            difference = DocumentWordDifference(
+                diff_type="extra_words",
+                message=(
+                    f"Extra words in candidate at positions {j1}-{j2 - 1}: "
+                    f"'{_quoted(cand_slice)}'"
+                ),
+                cand_words=cand_slice,
+                cand_start_index=j1,
+                cand_end_index=j2,
+            )
+        else:
+            # "equal" runs carry no difference.
+            continue
+        result.add_word_difference(difference)
+
+    return result
+
+
+def _truncate_text(text: str, max_length: int) -> str:
+    """Return *text* cut to at most ``max_length`` chars; '...' marks a cut when it fits."""
+    if len(text) <= max_length:
+        return text
+    return text[: max_length - 3] + "..." if max_length >= 3 else text[: max(max_length, 0)]
+
+
 def _compare_page(
     ref_page: PageStructure,
     cand_page: PageStructure,
diff --git a/DocTest/PdfStructureModels.py b/DocTest/PdfStructureModels.py
index 8bfa6ef..153ae61 100644
--- a/DocTest/PdfStructureModels.py
+++ b/DocTest/PdfStructureModels.py
@@ -13,11 +13,14 @@
     "PageStructure",
     "DocumentStructure",
     "StructureExtractionConfig",
+    "WordToken",
     "strip_font_subset",
     "collapse_whitespace",
     "round_bbox",
     "build_page_structure",
+    "build_page_structure_from_words",
     "flatten_document_text",
+    "flatten_document_words",
 ]
 
 
@@ -80,6 +83,7 @@ class StructureExtractionConfig:
     round_precision: Optional[int] = 3
     normalize_ligatures: bool = False
     character_replacements: Optional[Dict[str, str]] = None
+    spatial_word_sorting: bool = False
 
     def __hash__(self) -> int:  # Allow usage as dictionary key for caching.
         # Convert character_replacements dict to a hashable tuple of sorted items
@@ -98,6 +102,7 @@ def __hash__(self) -> int:  # Allow usage as dictionary key for caching.
                 self.round_precision,
                 self.normalize_ligatures,
                 replacements_hash,
+                self.spatial_word_sorting,
             )
         )
 
@@ -114,6 +119,15 @@ def page_count(self) -> int:
         return len(self.pages)
 
 
+@dataclass(frozen=True)
+class WordToken:
+    """A single word token extracted from a document, with provenance metadata."""
+    text: str  # the word itself
+    source_page: int  # page_number of the page the word was read from
+    source_line_index: int  # line counter running across the whole document, not per page
+    word_index: int  # running word counter assigned by flatten_document_words
+
+
 def flatten_document_text(structure: DocumentStructure) -> List[str]:
     """Extract all text lines from a document in reading order, ignoring page boundaries.
 
@@ -137,6 +151,77 @@ def flatten_document_text(structure: DocumentStructure) -> List[str]:
     return texts
 
 
+def flatten_document_words(
+    structure: DocumentStructure,
+    *,
+    normalize_word_boundaries: bool = False,
+    normalize_ligatures_in_words: bool = False,
+) -> Tuple[List[str], List[WordToken]]:
+    """Flatten a document into its whitespace-separated words, in reading order.
+
+    Pages, blocks and lines are visited in their stored order and every line
+    text is split with ``str.split()``.  Because page and line breaks leave
+    no trace in the word list, a comparison built on it is resilient to
+    text reflow.
+
+    Args:
+        structure: A DocumentStructure containing pages with text blocks and lines.
+        normalize_word_boundaries: When True, the lists are post-processed by
+            ``merge_split_words`` from ``DocTest.TextNormalization``.
+        normalize_ligatures_in_words: When True, every word and every token
+            text is passed through ``normalize_ligatures`` from the same module.
+
+    Returns:
+        ``(words, tokens)``: the flat list of word strings and the list of
+        WordToken records describing where the words came from.  Ligature
+        normalization runs before boundary merging.
+    """
+    words: List[str] = []
+    tokens: List[WordToken] = []
+    line_counter = 0  # counts every line of the document, including empty ones
+
+    page_lines = (
+        (page.page_number, line)
+        for page in structure.pages
+        for block in page.blocks
+        for line in block.lines
+    )
+    for page_number, line in page_lines:
+        # A line without text yields no words but still uses up a line index.
+        for piece in (line.text.split() if line.text else ()):
+            tokens.append(
+                WordToken(
+                    text=piece,
+                    source_page=page_number,
+                    source_line_index=line_counter,
+                    word_index=len(words),  # position in the flat list so far
+                )
+            )
+            words.append(piece)
+        line_counter += 1
+
+    if normalize_ligatures_in_words:
+        # Imported lazily, only when the option is used.
+        from DocTest.TextNormalization import normalize_ligatures
+        words = [normalize_ligatures(word) for word in words]
+        tokens = [
+            WordToken(
+                text=normalize_ligatures(token.text),
+                source_page=token.source_page,
+                source_line_index=token.source_line_index,
+                word_index=token.word_index,
+            )
+            for token in tokens
+        ]
+
+    if normalize_word_boundaries:
+        # Merge words split across line boundaries (done by merge_split_words).
+        from DocTest.TextNormalization import merge_split_words
+        words, tokens = merge_split_words(words, tokens)
+
+    return words, tokens
+
+
 def strip_font_subset(font_name: Optional[str]) -> Optional[str]:
     """Drop random subset prefixes inserted by PDF generators."""
 
@@ -282,3 +367,151 @@ def build_page_structure(
         height=height,
         blocks=blocks,
     )
+
+
+def build_page_structure_from_words(
+    page_number: int,
+    pdf_text_words: Optional[List],
+    config: Optional[StructureExtractionConfig] = None,
+    *,
+    page_width: float = 0.0,
+    page_height: float = 0.0,
+    dpi: Optional[int] = None,
+    image_shape: Optional[Tuple[int, int, int]] = None,
+) -> PageStructure:
+    """Build a ``PageStructure`` from PyMuPDF ``get_text("words")`` output.
+
+    This bypasses block-level extraction entirely, grouping individual word
+    bounding boxes into lines using adaptive Y-proximity.  The result does
+    not depend on how the PDF generator split the text into blocks.
+
+    Each word tuple from PyMuPDF has the form::
+
+        (x0, y0, x1, y1, "word", block_no, line_no, word_no)
+
+    A word joins a line when its vertical midpoint is within half of
+    min(word height, smallest word height in that line) of the line's mean
+    midpoint.  Words are then ordered by ``x0`` within a line, lines by mean
+    midpoint, and each line becomes its own single-line ``TextBlock``.
+
+    Args:
+        page_number: Page index copied onto the result (presumably zero-based; confirm with callers).
+        pdf_text_words: List of word tuples from ``page.get_text("words")``.
+        config: Normalization settings (whitespace, ligatures, etc.).
+        page_width: Page width in points.
+        page_height: Page height in points.
+        dpi: Optional DPI for computing page dimensions from ``image_shape``.
+        image_shape: ``(height, width, channels)`` array shape, used with *dpi*
+            to derive BOTH dimensions when ``page_width`` or ``page_height`` is zero.
+
+    Returns:
+        A ``PageStructure`` with one block per reconstructed text line.
+    """
+    config = config or StructureExtractionConfig()
+
+    width = page_width
+    height = page_height
+    if (width == 0.0 or height == 0.0) and image_shape and dpi:
+        px_height, px_width = image_shape[:2]  # both dimensions are replaced below
+        width = px_width * 72.0 / dpi  # pixels -> points (72 points per inch)
+        height = px_height * 72.0 / dpi
+
+    if not pdf_text_words:
+        return PageStructure(
+            page_number=page_number,
+            width=width,
+            height=height,
+            blocks=[],
+        )
+
+    # --- Group words into visual lines by Y-proximity ---
+    # Sort by vertical midpoint first, then horizontal position.
+    sorted_words = sorted(pdf_text_words, key=lambda w: ((w[1] + w[3]) / 2.0, w[0]))
+
+    lines: List[List] = []  # Each element: list of word tuples
+    line_y_mid: List[float] = []  # Running mean of the word Y midpoints per line
+    line_min_height: List[float] = []  # Cached minimum word height per line
+
+    for word in sorted_words:
+        w_y0, w_y1 = float(word[1]), float(word[3])
+        w_mid = (w_y0 + w_y1) / 2.0
+        w_height = max(w_y1 - w_y0, 1.0)  # floored at 1.0 so the tolerance is never zero
+
+        # Search backward from most recent line (words are Y-sorted, so the
+        # most recent line is the most likely match).  Break early once we
+        # move past the tolerance range.
+        merged = False
+        max_possible_tolerance = w_height * 0.5  # upper bound of every per-line tolerance below
+        for idx in range(len(lines) - 1, -1, -1):
+            ly_mid = line_y_mid[idx]
+            delta = abs(w_mid - ly_mid)
+            if delta > max_possible_tolerance and w_mid > ly_mid:
+                break  # Past tolerance; earlier lines are assumed to be even further away.
+            tolerance = min(line_min_height[idx], w_height) * 0.5
+            if delta <= tolerance:
+                lines[idx].append(word)
+                n = len(lines[idx])
+                line_y_mid[idx] = ly_mid + (w_mid - ly_mid) / n  # incremental mean update
+                if w_height < line_min_height[idx]:
+                    line_min_height[idx] = w_height
+                merged = True
+                break
+        if not merged:
+            lines.append([word])
+            line_y_mid.append(w_mid)
+            line_min_height.append(w_height)
+
+    # Sort lines top-to-bottom by mean midpoint; words are sorted per line below.
+    indexed_lines = sorted(enumerate(lines), key=lambda pair: line_y_mid[pair[0]])
+
+    blocks: List[TextBlock] = []
+    global_line_index = 0
+    block_index = 0
+
+    for _orig_idx, line_words in indexed_lines:
+        line_words_sorted = sorted(line_words, key=lambda w: float(w[0]))
+
+        # Build text from words, applying normalization; words that sanitize to "" are skipped.
+        text_parts: List[str] = []
+        for w in line_words_sorted:
+            raw = str(w[4])
+            normalized = _sanitize_span_text(raw, config)
+            if normalized:
+                text_parts.append(normalized)
+
+        line_text = config.whitespace_replacement.join(text_parts) if text_parts else ""
+        if config.strip_line_edges:
+            line_text = line_text.strip()
+        if config.drop_empty_lines and not line_text:
+            continue
+
+        # Compute line bbox as union of all word bboxes (skipped words included).
+        x0 = min(float(w[0]) for w in line_words_sorted)
+        y0 = min(float(w[1]) for w in line_words_sorted)
+        x1 = max(float(w[2]) for w in line_words_sorted)
+        y1 = max(float(w[3]) for w in line_words_sorted)
+        bbox = round_bbox((x0, y0, x1, y1), config.round_precision)
+
+        text_line = TextLine(
+            index=global_line_index,
+            text=line_text,
+            bbox=bbox,
+            fonts=set(),
+            spans=[TextSpan(text=line_text, font=None, size=0.0)],  # one synthetic span, no font data
+        )
+        blocks.append(
+            TextBlock(
+                index=block_index,
+                bbox=bbox,
+                lines=[text_line],
+            )
+        )
+        global_line_index += 1
+        block_index += 1
+
+    return PageStructure(
+        page_number=page_number,
+        width=width,
+        height=height,
+        blocks=blocks,
+    )
diff --git a/DocTest/PdfTest.py b/DocTest/PdfTest.py
index d617fd7..8f07624 100644
--- a/DocTest/PdfTest.py
+++ b/DocTest/PdfTest.py
@@ -15,7 +15,9 @@
     StructureTolerance,
     compare_document_structures,
     compare_document_text_only,
+    compare_document_words,
 )
+from DocTest.HeaderFooterDetector import HeaderFooterConfig, filter_headers_footers
 from DocTest.PdfStructureModels import (
     DocumentStructure,
     PageStructure,
@@ -226,6 +228,10 @@ def compare_pdf_documents(self, reference_document, candidate_document, **kwargs
         mask_value = kwargs.pop('mask', None)
         text_mask_patterns_arg = kwargs.pop('text_mask_patterns', None)
         ignore_ligatures = _as_bool(kwargs.pop('ignore_ligatures', False))
+        normalize_word_boundaries = _as_bool(kwargs.pop('normalize_word_boundaries', False), False)
+        compare_order = kwargs.pop('compare_order', 'ordered')
+        if compare_order not in ('ordered', 'unordered'):
+            compare_order = 'ordered'
         check_pdf_text = _as_bool(kwargs.pop('check_pdf_text', False))
 
         # Parse character_replacements from kwargs or use instance default
@@ -267,9 +273,19 @@ def compare_pdf_documents(self, reference_document, candidate_document, **kwargs
 
         # New parameters for controlling structure comparison behavior
         ignore_page_boundaries = _as_bool(kwargs.pop('ignore_page_boundaries', False), False)
+        compare_word_level = _as_bool(kwargs.pop('compare_word_level', True), True)
         check_geometry = _as_bool(kwargs.pop('check_geometry', True), True)
         check_block_count = _as_bool(kwargs.pop('check_block_count', True), True)
 
+        header_scan_height = _as_float(kwargs.pop('header_scan_height', 0), 0)
+        footer_scan_height = _as_float(kwargs.pop('footer_scan_height', 0), 0)
+        header_repeat_threshold = int(_as_float(kwargs.pop('header_repeat_threshold', 2), 2))
+        header_footer_config = HeaderFooterConfig(
+            header_scan_height=header_scan_height,
+            footer_scan_height=footer_scan_height,
+            repeat_threshold=header_repeat_threshold,
+        )
+
         # When ignoring page boundaries, disable geometry and block count checks
         if ignore_page_boundaries:
             check_geometry = False
@@ -456,29 +472,29 @@ def _record_diff(facet: str, description: str, diff_payload: Any):
                     candidate_representation=candidate_repr,
                     text_mask_patterns=compiled_text_patterns,
                     ignore_page_boundaries=ignore_page_boundaries,
+                    compare_word_level=compare_word_level,
                     check_geometry=check_geometry,
                     check_block_count=check_block_count,
+                    header_footer_config=header_footer_config,
+                    normalize_word_boundaries=normalize_word_boundaries,
+                    compare_order=compare_order,
                 )
                 if not structure_result.passed:
                     differences_detected = True
                     summary = getattr(structure_result, "summary", None)
                     page_diffs = getattr(structure_result, "page_differences", None)
                     doc_diffs = getattr(structure_result, "document_differences", None)
-                    details_parts: List[str] = []
-                    if summary:
-                        details_parts.extend(str(item) for item in summary)
-                    if page_diffs:
-                        for page, diffs in page_diffs.items():
-                            for diff in diffs:
-                                details_parts.append(f"Page {page}: {diff.message}")
-                    if doc_diffs:
-                        for diff in doc_diffs:
-                            details_parts.append(f"Document: {diff.message}")
+                    try:
+                        from DocTest.StructureReportBuilder import build_structure_report_plain_text
+                        plain_report = build_structure_report_plain_text(structure_result)
+                        detail_text = plain_report if plain_report else "Structure comparison differences detected."
+                    except Exception:
+                        detail_text = "Structure comparison differences detected."
                     llm_differences.append(
                         {
                             "facet": "structure",
                             "description": "PDF structural comparison failed.",
-                            "details": "\n".join(details_parts) if details_parts else "Structure comparison differences detected.",
+                            "details": detail_text,
                         }
                     )
 
@@ -557,8 +573,11 @@ def compare_pdf_structure(self, reference_document, candidate_document, **kwargs
             - ``text_mask_patterns``: regex or list of regex strings to skip lines during comparison.
             - ``ignore_ligatures`` (bool, default ``False``): normalise common ligatures (``ﬁ`` → ``fi``) prior to comparison.
             - ``ignore_page_boundaries`` (bool, default ``False``): ignore page breaks and compare text content in reading order across the entire document. When enabled, geometry and block structure are not checked. Useful when font/size changes cause text to reflow across pages.
+            - ``normalize_word_boundaries`` (bool, default ``False``): merge words split across line boundaries by connector characters (``/``, ``-``, ``\\``). Recommended when using ``ignore_page_boundaries``.
+            - ``compare_order`` (str, default ``"ordered"``): comparison strategy for word-level comparison. ``"ordered"`` uses sequence-sensitive matching; ``"unordered"`` uses bag-of-words frequency comparison that ignores word order, useful when text reflows across pages.
             - ``check_geometry`` (bool, default ``True``): when ``False``, skip line position/size comparison. Useful for comparing content when layout may differ. Automatically set to ``False`` when ``ignore_page_boundaries`` is ``True``.
             - ``check_block_count`` (bool, default ``True``): when ``False``, skip block count validation per page. Automatically set to ``False`` when ``ignore_page_boundaries`` is ``True``.
+            - ``spatial_word_sorting`` (bool, default ``False``): when ``True``, build page structure from individual word bounding boxes instead of text blocks. This bypasses block fragmentation differences caused by different PDF generators and produces consistent word ordering. Recommended when ``ignore_page_boundaries`` is ``True``.
 
         Examples:
         | `Compare Pdf Structure`    reference.pdf    candidate.pdf
@@ -566,6 +585,7 @@ def compare_pdf_structure(self, reference_document, candidate_document, **kwargs
         | `Compare Pdf Structure`    reference.pdf    candidate.pdf    mask=${CURDIR}${/}mask.json    text_mask_patterns=\\d{4}-\\d{4}    ignore_ligatures=${True}
         | `Compare Pdf Structure`    reference.pdf    candidate.pdf    ignore_page_boundaries=${True}
         | `Compare Pdf Structure`    reference.pdf    candidate.pdf    check_geometry=${False}    check_block_count=${False}
+        | `Compare Pdf Structure`    reference.pdf    candidate.pdf    ignore_page_boundaries=${True}    spatial_word_sorting=${True}
         | `Run Keyword And Expect Error`    The compared PDF structure is different.    Compare Pdf Structure    reference.pdf    candidate_with_changed_text.pdf
 
         """
@@ -589,6 +609,10 @@ def compare_pdf_structure(self, reference_document, candidate_document, **kwargs
         mask_value = kwargs.get('mask')
         text_mask_patterns_arg = kwargs.get('text_mask_patterns')
         ignore_ligatures = _as_bool(kwargs.get('ignore_ligatures', False), False)
+        normalize_word_boundaries = _as_bool(kwargs.get('normalize_word_boundaries', False), False)
+        compare_order = kwargs.get('compare_order', 'ordered')
+        if compare_order not in ('ordered', 'unordered'):
+            compare_order = 'ordered'
 
         # Parse character_replacements from kwargs or use instance default
         char_replacements_arg = kwargs.get('character_replacements')
@@ -598,8 +622,19 @@ def compare_pdf_structure(self, reference_document, candidate_document, **kwargs
 
         # New parameters for controlling comparison behavior
         ignore_page_boundaries = _as_bool(kwargs.get('ignore_page_boundaries', False), False)
+        compare_word_level = _as_bool(kwargs.get('compare_word_level', True), True)
         check_geometry = _as_bool(kwargs.get('check_geometry', True), True)
         check_block_count = _as_bool(kwargs.get('check_block_count', True), True)
+        spatial_word_sorting = _as_bool(kwargs.get('spatial_word_sorting', False), False)
+
+        header_scan_height = _as_float(kwargs.get('header_scan_height', 0), 0)
+        footer_scan_height = _as_float(kwargs.get('footer_scan_height', 0), 0)
+        header_repeat_threshold = int(_as_float(kwargs.get('header_repeat_threshold', 2), 2))
+        header_footer_config = HeaderFooterConfig(
+            header_scan_height=header_scan_height,
+            footer_scan_height=footer_scan_height,
+            repeat_threshold=header_repeat_threshold,
+        )
 
         # When ignoring page boundaries, disable geometry and block count checks
         if ignore_page_boundaries:
@@ -615,6 +650,7 @@ def compare_pdf_structure(self, reference_document, candidate_document, **kwargs
             round_precision=round_precision,
             normalize_ligatures=ignore_ligatures,
             character_replacements=char_replacements,
+            spatial_word_sorting=spatial_word_sorting,
         )
         tolerance = StructureTolerance(
             position=position_tolerance,
@@ -655,8 +691,12 @@ def compare_pdf_structure(self, reference_document, candidate_document, **kwargs
                 candidate_representation=candidate_repr,
                 text_mask_patterns=compiled_text_patterns,
                 ignore_page_boundaries=ignore_page_boundaries,
+                compare_word_level=compare_word_level,
                 check_geometry=check_geometry,
                 check_block_count=check_block_count,
+                header_footer_config=header_footer_config,
+                normalize_word_boundaries=normalize_word_boundaries,
+                compare_order=compare_order,
             )
         finally:
             reference_repr.close()
@@ -969,8 +1009,12 @@ def _perform_structure_comparison(
         candidate_representation: Optional[DocumentRepresentation] = None,
         text_mask_patterns: Optional[List[Pattern[str]]] = None,
         ignore_page_boundaries: bool = False,
+        compare_word_level: bool = True,
         check_geometry: bool = True,
         check_block_count: bool = True,
+        header_footer_config: Optional["HeaderFooterConfig"] = None,
+        normalize_word_boundaries: bool = False,
+        compare_order: str = "ordered",
     ):
         release_reference = False
         release_candidate = False
@@ -985,17 +1029,31 @@ def _perform_structure_comparison(
             reference_structure = reference_representation.get_pdf_structure(config=extraction_config)
             candidate_structure = candidate_representation.get_pdf_structure(config=extraction_config)
 
+            # Repetition-based header/footer detection
+            if header_footer_config and header_footer_config.enabled:
+                reference_structure = filter_headers_footers(reference_structure, header_footer_config)
+                candidate_structure = filter_headers_footers(candidate_structure, header_footer_config)
+
             if text_mask_patterns:
                 reference_structure = self._prune_structure_lines(reference_structure, text_mask_patterns)
                 candidate_structure = self._prune_structure_lines(candidate_structure, text_mask_patterns)
 
             if ignore_page_boundaries:
-                # Use text-only comparison that ignores page boundaries
-                result = compare_document_text_only(
-                    reference=reference_structure,
-                    candidate=candidate_structure,
-                    case_sensitive=case_sensitive,
-                )
+                if compare_word_level:
+                    result = compare_document_words(
+                        reference=reference_structure,
+                        candidate=candidate_structure,
+                        case_sensitive=case_sensitive,
+                        normalize_ligatures=extraction_config.normalize_ligatures,
+                        normalize_word_boundaries=normalize_word_boundaries,
+                        compare_order=compare_order,
+                    )
+                else:
+                    result = compare_document_text_only(
+                        reference=reference_structure,
+                        candidate=candidate_structure,
+                        case_sensitive=case_sensitive,
+                    )
             else:
                 # Use standard page-by-page comparison
                 result = compare_document_structures(
@@ -1006,7 +1064,45 @@ def _perform_structure_comparison(
                     check_geometry=check_geometry,
                     check_block_count=check_block_count,
                 )
-            self._log_structure_result(result, ignore_page_boundaries=ignore_page_boundaries)
+            # Capture text lists for context display in the report
+            ref_texts = None
+            cand_texts = None
+            try:
+                from DocTest.PdfStructureModels import flatten_document_text
+                ref_texts = flatten_document_text(reference_structure)
+                cand_texts = flatten_document_text(candidate_structure)
+            except Exception:
+                pass
+
+            exclusions = []
+            if text_mask_patterns:
+                exclusions.extend(f"text_mask: {p.pattern}" for p in text_mask_patterns)
+            if header_footer_config and header_footer_config.enabled:
+                if header_footer_config.header_scan_height > 0:
+                    exclusions.append(f"header_filter: {header_footer_config.header_scan_height}pt")
+                if header_footer_config.footer_scan_height > 0:
+                    exclusions.append(f"footer_filter: {header_footer_config.footer_scan_height}pt")
+            # Only report disabled checks when explicitly set by the user,
+            # not when auto-disabled by ignore_page_boundaries
+            if not ignore_page_boundaries:
+                if not check_geometry:
+                    exclusions.append("check_geometry: False")
+                if not check_block_count:
+                    exclusions.append("check_block_count: False")
+            if normalize_word_boundaries:
+                exclusions.append("normalize_word_boundaries: True")
+            if compare_order == "unordered":
+                exclusions.append("compare_order: unordered")
+
+            self._log_structure_result(
+                result,
+                ignore_page_boundaries=ignore_page_boundaries,
+                reference_name=Path(reference_document).name,
+                candidate_name=Path(candidate_document).name,
+                reference_texts=ref_texts,
+                candidate_texts=cand_texts,
+                exclusions_applied=exclusions,
+            )
             return result
         finally:
             if release_reference:
@@ -1060,12 +1156,23 @@ def _prune_structure_lines(
             )
         return DocumentStructure(pages=filtered_pages, config=structure.config)
 
-    def _log_structure_result(self, result, ignore_page_boundaries: bool = False):
-        """Log comparison results with single summary WARN and detail INFO messages.
+    def _log_structure_result(
+        self,
+        result,
+        *,
+        ignore_page_boundaries: bool = False,
+        reference_name: str = "",
+        candidate_name: str = "",
+        reference_texts: Optional[List[str]] = None,
+        candidate_texts: Optional[List[str]] = None,
+        exclusions_applied: Optional[List[str]] = None,
+    ):
+        """Log comparison results with single summary WARN, HTML report INFO, and detail DEBUG.
 
         Robot Framework displays WARN messages at the top of log.html. To avoid
-        cluttering that section, we emit a single summary warning and log all
-        individual differences as INFO (visible only within keyword output).
+        cluttering that section, we emit a single summary warning. All differences
+        are rendered as a single consolidated HTML report at INFO level. Individual
+        per-difference output is preserved at DEBUG level for troubleshooting.
         """
         if result.passed:
             logger.info("[PDF Structure] Documents match within configured tolerances.")
@@ -1075,15 +1182,34 @@ def _log_structure_result(self, result, ignore_page_boundaries: bool = False):
         diff_count = result.difference_count()
         mode = "text-only (ignoring page boundaries)" if ignore_page_boundaries else "structure"
 
-        # Single summary warning (appears at top of log.html)
+        # Single summary warning (appears at top of log.html)
         logger.warn(f"[PDF Structure] Comparison failed: {diff_count} difference(s) found in {mode} comparison.")
 
-        # Log summary entries as INFO
+        # --- Consolidated HTML report at INFO level ---
+        try:
+            from DocTest.StructureReportBuilder import ReportMetadata, build_structure_report
+            metadata = ReportMetadata(
+                reference_name=reference_name or "(unknown)",
+                candidate_name=candidate_name or "(unknown)",
+                comparison_mode=mode,
+                exclusions_applied=exclusions_applied or [],
+            )
+            html_report = build_structure_report(
+                result,
+                metadata=metadata,
+                reference_texts=reference_texts,
+                candidate_texts=candidate_texts,
+            )
+            if html_report:
+                logger.info(html_report, html=True)
+        except Exception:
+            pass  # Degrade gracefully if report builder fails
+
+        # --- Per-difference output at DEBUG level ---
         if result.summary:
             for entry in result.summary:
-                logger.info(f"[PDF Structure] {entry}")
+                logger.debug(f"[PDF Structure] {entry}")
 
-        # Log page differences as INFO
         if result.page_differences:
             for page in sorted(result.page_differences.keys()):
                 for diff in result.page_differences[page]:
@@ -1095,12 +1221,11 @@ def _log_structure_result(self, result, ignore_page_boundaries: bool = False):
                         details.append(f"candidate line={diff.candidate_index}")
                     if details:
                         message = f"{message} ({', '.join(details)})"
-                    logger.info(message)
+                    logger.debug(message)
                     if diff.deltas:
                         pretty = ", ".join(f"{axis}={value:.3f}" for axis, value in diff.deltas.items())
                         logger.debug(f"[PDF Structure] Page {page} deltas: {pretty}")
 
-        # Log document-level differences as INFO (for text-only mode)
         if result.document_differences:
             for diff in result.document_differences:
                 message = f"[PDF Text] {diff.message}"
@@ -1111,7 +1236,20 @@ def _log_structure_result(self, result, ignore_page_boundaries: bool = False):
                     details.append(f"candidate position={diff.cand_index}")
                 if details:
                     message = f"{message} ({', '.join(details)})"
-                logger.info(message)
+                logger.debug(message)
+
+        # Log word-level differences at DEBUG
+        if hasattr(result, 'word_differences') and result.word_differences:
+            for diff in result.word_differences:
+                message = f"[PDF Words] {diff.message}"
+                details = []
+                if diff.ref_start_index is not None:
+                    details.append(f"ref positions {diff.ref_start_index}-{diff.ref_end_index}")
+                if diff.cand_start_index is not None:
+                    details.append(f"cand positions {diff.cand_start_index}-{diff.cand_end_index}")
+                if details:
+                    message = f"{message} ({', '.join(details)})"
+                logger.debug(message)
 
     def _ensure_local_document(self, document):
         return download_file_from_url(document) if is_url(document) else document
diff --git a/DocTest/StructureReportBuilder.py b/DocTest/StructureReportBuilder.py
new file mode 100644
index 0000000..6e892d2
--- /dev/null
+++ b/DocTest/StructureReportBuilder.py
@@ -0,0 +1,558 @@
+"""Consolidated HTML report builder for PDF structure comparison results.
+
+Transforms a StructureComparisonResult into a single HTML fragment suitable
+for rendering inside Robot Framework's log.html via logger.info(msg, html=True).
+"""
+
+from __future__ import annotations
+
+import html as html_module
+from dataclasses import dataclass, field
+from typing import Any, Dict, List, Optional, Sequence, Tuple, Union
+
+from DocTest.PdfStructureComparator import (
+    DocumentTextDifference,
+    DocumentWordDifference,
+    LineDifference,
+    StructureComparisonResult,
+)
+
+__all__ = [
+    "build_structure_report",
+    "build_structure_report_plain_text",
+    "ReportMetadata",
+]
+
+DEFAULT_CONTEXT_LINES = 3  # default for build_structure_report(context_lines=...)
+MAX_TEXT_DISPLAY_LENGTH = 120  # default max_length used by _truncate()
+MAX_HUNKS_BEFORE_COLLAPSE = 20  # NOTE(review): not referenced in the visible part of this module -- confirm usage
+
+
+@dataclass
+class ReportMetadata:
+    """Values build_structure_report() prints in the block under the report title."""
+    reference_name: str = ""  # rendered after the "Reference:" label
+    candidate_name: str = ""  # rendered after the "Candidate:" label
+    comparison_mode: str = ""  # rendered after the "Mode:" label
+    page_count_ref: Optional[int] = None  # "Pages:" is shown only if either count is not None
+    page_count_cand: Optional[int] = None  # a falsy count (None or 0) is printed as "?"
+    exclusions_applied: List[str] = field(default_factory=list)  # comma-joined after "Exclusions:" when non-empty
+
+
+@dataclass
+class ReportSummary:
+    """Per-category difference counters filled in by _compute_summary()."""
+    total_differences: int = 0  # sum of the missing/extra/mismatch/geometry/other counts
+    missing_count: int = 0  # diffs classified as "missing"
+    extra_count: int = 0  # diffs classified as "extra"
+    mismatch_count: int = 0  # diffs classified as "mismatch"
+    geometry_count: int = 0  # diffs classified as "geometry"
+    other_count: int = 0  # diffs that fit none of the categories above
+    hunk_count: int = 0  # NOTE(review): never assigned in the visible code -- confirm who sets it
+
+
+def _escape(text: str) -> str:
+    return html_module.escape(str(text), quote=True)  # str() tolerates non-str input; quote=True also escapes " and '
+
+
+def _truncate(text: str, max_length: int = MAX_TEXT_DISPLAY_LENGTH) -> str:
+    """Return text cut to at most max_length chars; "..." marks the cut when max_length >= 3."""
+    clipped = text[: max_length - 3] + "..." if max_length >= 3 else text[: max(max_length, 0)]
+    return text if len(text) <= max_length else clipped
+
+
+def _classify_diff_type(diff_type: str) -> str:
+    """Return the report category for a raw diff_type value.
+
+    Unknown values fall back to "other".
+    """
+    buckets = {
+        "missing": ("missing_line", "missing_text", "missing_page", "missing_words"),
+        "extra": ("extra_line", "extra_text", "extra_page", "extra_words"),
+        "mismatch": ("text_mismatch", "word_mismatch"),
+        "geometry": ("geometry_mismatch",),
+    }
+    return next((cat for cat, names in buckets.items() if diff_type in names), "other")
+
+
+def _get_diff_display(diff: Any) -> Tuple[str, str, Optional[str], Optional[str]]:
+    """Return (category, message, ref_text, cand_text) for a difference object.
+
+    Line and document-text differences expose their texts directly; word
+    differences have their word lists joined with single spaces (an empty
+    or missing list yields None). Any other object yields None for both texts.
+    """
+    category = _classify_diff_type(diff.diff_type)
+    message = diff.message
+
+    if isinstance(diff, (LineDifference, DocumentTextDifference)):
+        # Both types carry ready-made ref_text / cand_text attributes.
+        texts = (diff.ref_text, diff.cand_text)
+    elif isinstance(diff, DocumentWordDifference):
+        texts = tuple(" ".join(words) if words else None
+                      for words in (diff.ref_words, diff.cand_words))
+    else:
+        texts = (None, None)
+    return (category, message, *texts)
+
+
+_CATEGORY_STYLES = {
+    # Keyed by the category from _classify_diff_type(): (background, text_color, symbol) — chosen for WCAG AA contrast
+    "missing": ("#f8d7da", "#721c24", "-"),
+    "extra": ("#d4edda", "#155724", "+"),
+    "mismatch": ("#fff3cd", "#856404", "~"),
+    "geometry": ("#e2e3e5", "#383d41", "\u0394"),  # delta symbol
+    "other": ("#e2e3e5", "#383d41", "!"),
+}
+
+
+def _compute_summary(result: StructureComparisonResult) -> ReportSummary:
+    """Compute aggregate statistics from a comparison result.
+
+    Every difference in ``page_differences``, ``document_differences`` and
+    (when the attribute exists) ``word_differences`` is classified with
+    ``_classify_diff_type`` and tallied into the matching ``*_count`` field
+    of a fresh ``ReportSummary``; ``total_differences`` is the sum of those
+    five counters. ``hunk_count`` is left at its default.
+
+    Containers that are ``None`` or missing are treated as empty, so a
+    partially populated result cannot make report generation fail.
+    """
+    summary = ReportSummary()
+
+    # Flatten the three difference containers into one list; ``or`` guards
+    # against attributes that are present but set to None.
+    all_diffs: List[Any] = []
+    for page_diffs in (getattr(result, "page_differences", None) or {}).values():
+        all_diffs.extend(page_diffs or [])
+    all_diffs.extend(getattr(result, "document_differences", None) or [])
+    all_diffs.extend(getattr(result, "word_differences", None) or [])
+
+    for diff in all_diffs:
+        # Categories map 1:1 onto ``<category>_count`` fields; geometry is
+        # counted as geometry no matter which container the diff came from.
+        counter = f"{_classify_diff_type(diff.diff_type)}_count"
+        setattr(summary, counter, getattr(summary, counter) + 1)
+
+    summary.total_differences = (
+        summary.missing_count + summary.extra_count +
+        summary.mismatch_count + summary.geometry_count + summary.other_count
+    )
+    return summary
+
+
+def _render_diff_html(diff: Any) -> str:
+    """Return one color-coded HTML <div> describing a single difference."""
+    category, message, ref_text, cand_text = _get_diff_display(diff)
+    bg, fg, symbol = _CATEGORY_STYLES.get(category, ("#e2e3e5", "#383d41", "?"))
+    marker = f'<b>{_escape(symbol)}</b>'
+
+    def quoted(text: str) -> str:
+        # Truncate first, then escape, and wrap in HTML-encoded double quotes.
+        return f'&quot;{_escape(_truncate(text))}&quot;'
+
+    if category == "mismatch" and ref_text and cand_text:
+        body = f'{marker} ref: {quoted(ref_text)}<br/>&nbsp;&nbsp;cand: {quoted(cand_text)}'
+    elif category == "missing" and ref_text:
+        body = f'{marker} {quoted(ref_text)}'
+    elif category == "extra" and cand_text:
+        body = f'{marker} {quoted(cand_text)}'
+    elif category == "geometry":
+        deltas = getattr(diff, 'deltas', None)
+        suffix = ""
+        if deltas:
+            suffix = " (" + ", ".join(f"{k}={v:.3f}" for k, v in deltas.items()) + ")"
+        body = f'{marker} {quoted(ref_text or cand_text or "")}{_escape(suffix)}'
+    else:
+        body = f'{marker} {_escape(_truncate(message))}'
+
+    return f'<div style="background:{bg};color:{fg};padding:1px 4px;margin:1px 0;">{body}</div>'
+
+
+def _render_diff_plain(diff: Any) -> str:
+    """Return the plain-text (non-HTML) line(s) for a single difference."""
+    category, message, ref_text, cand_text = _get_diff_display(diff)
+    symbol = _CATEGORY_STYLES.get(category, ("", "", "?"))[2]
+
+    # Guard clauses: the first matching case wins; anything else shows the message.
+    if category == "mismatch" and ref_text and cand_text:
+        return f'  {symbol} ref: "{_truncate(ref_text)}"\n    cand: "{_truncate(cand_text)}"'
+    if category == "missing" and ref_text:
+        return f'  {symbol} "{_truncate(ref_text)}"'
+    if category == "extra" and cand_text:
+        return f'  {symbol} "{_truncate(cand_text)}"'
+    return f'  {symbol} {_truncate(message)}'
+
+
+def _collect_all_diffs(result: StructureComparisonResult) -> List[Tuple[Any, str]]:
+    """Collect (difference, location label) pairs for the overview table."""
+    # Order: pages ascending, then document-level, then word-level diffs.
+    # ``or`` fallbacks make missing or None containers behave as empty ones.
+    items: List[Tuple[Any, str]] = []
+    page_differences = getattr(result, "page_differences", None) or {}
+    for page_num in sorted(page_differences):
+        for d in page_differences[page_num] or []:
+            loc = f"Page {page_num}"
+            if isinstance(d, LineDifference):
+                idx = d.reference_index if d.reference_index is not None else d.candidate_index
+                if idx is not None:
+                    loc += f", line {idx}"
+            items.append((d, loc))
+    for d in getattr(result, "document_differences", None) or []:
+        idx = d.ref_index if d.ref_index is not None else d.cand_index
+        items.append((d, f"line {idx}" if idx is not None else "document"))
+    for d in getattr(result, "word_differences", None) or []:
+        idx = d.ref_start_index if d.ref_start_index is not None else d.cand_start_index
+        items.append((d, f"word {idx}" if idx is not None else "document"))
+    return items
+
+
+def _get_diff_index(diff: Any) -> int:
+    """Return the position used to order a difference (999999 when it has none)."""
+    if isinstance(diff, LineDifference):
+        candidates = (diff.reference_index, diff.candidate_index)
+    elif isinstance(diff, DocumentTextDifference):
+        candidates = (diff.ref_index, diff.cand_index)
+    elif isinstance(diff, DocumentWordDifference):
+        candidates = (diff.ref_start_index, diff.cand_start_index)
+    else:
+        candidates = ()
+    # Reference-side index wins; the candidate-side one is the fallback.
+    return next((idx for idx in candidates if idx is not None), 999999)
+
+
+def _group_into_hunks(
+    differences: Sequence[Any],
+    context_lines: int,
+    source_texts: Optional[List[str]] = None,
+) -> List[dict]:
+    """Group differences that lie close together into hunks with context.
+
+    Differences are ordered by ``_get_diff_index``. A difference joins the
+    current hunk when its index is at most ``2 * context_lines + 1`` past
+    the hunk's last index, i.e. when the context windows of the two would
+    touch or overlap; otherwise a new hunk is started.
+
+    Args:
+        differences: Difference objects accepted by ``_get_diff_index``.
+        context_lines: Number of unchanged entries to attach on each side
+            of a hunk.
+        source_texts: Sequence the diff indices point into. When empty or
+            ``None`` both context lists stay empty.
+
+    Returns:
+        A list of dicts with the keys ``start_index``, ``end_index``,
+        ``differences``, ``context_before`` and ``context_after``, in
+        ascending index order. An empty input yields an empty list.
+    """
+    if not differences:
+        return []
+
+    def _make_hunk(start_index: int, end_index: int, members: List[Any]) -> dict:
+        # Single place that builds a hunk, so the hunks closed inside the
+        # loop and the final one cannot drift apart.
+        ctx_before: List[str] = []
+        ctx_after: List[str] = []
+        if source_texts:
+            first = max(0, start_index - context_lines)
+            ctx_before = source_texts[first:start_index]
+            last = min(len(source_texts), end_index + context_lines + 1)
+            ctx_after = source_texts[end_index + 1:last]
+        return {
+            "start_index": start_index,
+            "end_index": end_index,
+            "differences": members,
+            "context_before": ctx_before,
+            "context_after": ctx_after,
+        }
+
+    sorted_diffs = sorted(differences, key=_get_diff_index)
+    merge_threshold = 2 * context_lines + 1
+
+    hunks: List[dict] = []
+    current_diffs = [sorted_diffs[0]]
+    current_start = current_end = _get_diff_index(sorted_diffs[0])
+
+    for diff in sorted_diffs[1:]:
+        idx = _get_diff_index(diff)
+        if idx - current_end <= merge_threshold:
+            current_diffs.append(diff)
+            current_end = max(current_end, idx)
+        else:
+            hunks.append(_make_hunk(current_start, current_end, current_diffs))
+            current_diffs, current_start, current_end = [diff], idx, idx
+    hunks.append(_make_hunk(current_start, current_end, current_diffs))
+    return hunks
+
+
+def build_structure_report(
+    result: StructureComparisonResult,
+    *,
+    metadata: Optional[ReportMetadata] = None,
+    context_lines: int = DEFAULT_CONTEXT_LINES,
+    reference_texts: Optional[List[str]] = None,
+    candidate_texts: Optional[List[str]] = None,
+) -> str:
+    """Build a consolidated HTML report from a structure comparison result.
+
+    Args:
+        result: Comparison outcome to render.
+        metadata: Optional header data (names, mode, page counts, exclusions).
+        context_lines: Context size forwarded to the hunk grouping.
+        reference_texts: Texts the hunk grouping takes context from.
+        candidate_texts: Accepted for call compatibility; not used here.
+
+    Returns:
+        HTML for ``logger.info(msg, html=True)``; ``""`` if ``result.passed``.
+    """
+    if result.passed:
+        return ""
+
+    summary = _compute_summary(result)
+    parts = []
+
+    # Outer container — explicit bg+color so report is self-contained in both light/dark mode
+    parts.append('<div style="font-family:monospace;font-size:12px;border:1px solid #adb5bd;'
+                 'border-radius:4px;margin:4px 0;max-width:100%;overflow-x:auto;'
+                 'background:#fff;color:#212529;">')
+    # Title
+    parts.append('<div style="background:#343a40;color:#fff;padding:8px 12px;border-bottom:1px solid #adb5bd;'
+                 'font-weight:bold;font-size:13px;">PDF Structure Comparison Report</div>')
+
+    # Metadata
+    if metadata:
+        parts.append('<div style="padding:6px 12px;border-bottom:1px solid #dee2e6;font-size:11px;color:#212529;">')
+        parts.append(f'<div style="word-break:break-all;"><b>Reference:</b> {_escape(metadata.reference_name)}</div>')
+        parts.append(f'<div style="word-break:break-all;"><b>Candidate:</b> {_escape(metadata.candidate_name)}</div>')
+        mode_str = _escape(metadata.comparison_mode)
+        page_str = ""
+        if metadata.page_count_ref is not None or metadata.page_count_cand is not None:
+            # Only an unknown (None) count is shown as "?"; a genuine 0 stays 0.
+            page_str = (f' | <b>Pages:</b> {"?" if metadata.page_count_ref is None else metadata.page_count_ref} ref'
+                        f' / {"?" if metadata.page_count_cand is None else metadata.page_count_cand} cand')
+        parts.append(f'<div><b>Mode:</b> {mode_str}{page_str}</div>')
+        if metadata.exclusions_applied:
+            exc_str = ", ".join(_escape(e) for e in metadata.exclusions_applied)
+            parts.append(f'<div><b>Exclusions:</b> {exc_str}</div>')
+        parts.append('</div>')
+
+    # Summary
+    parts.append('<div style="padding:8px 12px;border-bottom:1px solid #adb5bd;background:#f8f9fa;color:#212529;">')
+    parts.append(f'<div><b>{summary.total_differences}</b> difference(s)</div>')
+    parts.append('<div style="margin-top:4px;">')
+    if summary.missing_count:
+        parts.append(f'<span style="background:#f8d7da;color:#721c24;padding:2px 6px;border-radius:2px;margin-right:4px;">{summary.missing_count} missing</span>')
+    if summary.extra_count:
+        parts.append(f'<span style="background:#d4edda;color:#155724;padding:2px 6px;border-radius:2px;margin-right:4px;">{summary.extra_count} extra</span>')
+    if summary.mismatch_count:
+        parts.append(f'<span style="background:#fff3cd;color:#856404;padding:2px 6px;border-radius:2px;margin-right:4px;">{summary.mismatch_count} mismatch</span>')
+    if summary.geometry_count:
+        parts.append(f'<span style="background:#e2e3e5;color:#383d41;padding:2px 6px;border-radius:2px;margin-right:4px;">{summary.geometry_count} geometry</span>')
+    if summary.other_count:
+        parts.append(f'<span style="background:#e2e3e5;color:#383d41;padding:2px 6px;border-radius:2px;">{summary.other_count} other</span>')
+    parts.append('</div></div>')
+
+    # Differences overview table
+    all_diffs_for_table = _collect_all_diffs(result)
+    if all_diffs_for_table:
+        parts.append('<div style="padding:8px 12px;border-bottom:1px solid #adb5bd;">')
+        parts.append('<table style="width:100%;border-collapse:collapse;font-size:11px;">')
+        parts.append('<tr style="background:#495057;color:#fff;text-align:left;">'
+                     '<th style="padding:4px 6px;border:1px solid #6c757d;">#</th>'
+                     '<th style="padding:4px 6px;border:1px solid #6c757d;">Type</th>'
+                     '<th style="padding:4px 6px;border:1px solid #6c757d;">Reference</th>'
+                     '<th style="padding:4px 6px;border:1px solid #6c757d;">Candidate</th>'
+                     '<th style="padding:4px 6px;border:1px solid #6c757d;">Location</th></tr>')
+        for row_idx, (diff, location) in enumerate(all_diffs_for_table, 1):
+            category, _, ref_text, cand_text = _get_diff_display(diff)
+            bg, fg, symbol = _CATEGORY_STYLES.get(category, ("#e2e3e5", "#383d41", "?"))
+            ref_cell = _escape(_truncate(ref_text, 60)) if ref_text else "&mdash;"
+            cand_cell = _escape(_truncate(cand_text, 60)) if cand_text else "&mdash;"
+            parts.append(
+                f'<tr style="background:{bg};color:{fg};">'
+                f'<td style="padding:3px 6px;border:1px solid #bbb;">{row_idx}</td>'
+                f'<td style="padding:3px 6px;border:1px solid #bbb;">{_escape(symbol)} {_escape(category)}</td>'
+                f'<td style="padding:3px 6px;border:1px solid #bbb;word-break:break-all;">{ref_cell}</td>'
+                f'<td style="padding:3px 6px;border:1px solid #bbb;word-break:break-all;">{cand_cell}</td>'
+                f'<td style="padding:3px 6px;border:1px solid #bbb;">{_escape(location)}</td></tr>')
+        parts.append('</table></div>')
+
+    # Content sections (hunk detail)
+    parts.append('<div style="padding:4px 12px;">')
+    total_hunks = 0
+
+    # Page-level differences
+    if result.page_differences:
+        for page_num, diffs in sorted(result.page_differences.items()):
+            hunks = _group_into_hunks(diffs, context_lines, reference_texts)
+            total_hunks += len(hunks)
+            parts.append(f'<div style="font-weight:bold;margin:8px 0 4px;border-bottom:1px solid #dee2e6;padding-bottom:4px;color:#212529;">'
+                         f'Page {page_num} &mdash; {len(hunks)} hunk(s), {len(diffs)} difference(s)</div>')
+            for i, hunk in enumerate(hunks):
+                # Cap each section by position (as the plain-text report does) so the note is accurate.
+                if i >= MAX_HUNKS_BEFORE_COLLAPSE:
+                    parts.append(f'<div style="color:#6c757d;font-style:italic;margin:4px 0;">... {len(hunks) - i} more hunk(s) not shown</div>')
+                    break
+                _render_hunk_to_parts(parts, hunk, i + 1, index_label="line")
+
+    # Document-level differences
+    if result.document_differences:
+        hunks = _group_into_hunks(result.document_differences, context_lines, reference_texts)
+        total_hunks += len(hunks)
+        parts.append(f'<div style="font-weight:bold;margin:8px 0 4px;border-bottom:1px solid #dee2e6;padding-bottom:4px;color:#212529;">'
+                     f'Document (text-only) &mdash; {len(hunks)} hunk(s), {len(result.document_differences)} difference(s)</div>')
+        for i, hunk in enumerate(hunks):
+            if i >= MAX_HUNKS_BEFORE_COLLAPSE:
+                parts.append(f'<div style="color:#6c757d;font-style:italic;margin:4px 0;">... {len(hunks) - i} more hunk(s) not shown</div>')
+                break
+            _render_hunk_to_parts(parts, hunk, i + 1, index_label="line")
+
+    # Word-level differences
+    if hasattr(result, 'word_differences') and result.word_differences:
+        hunks = _group_into_hunks(result.word_differences, context_lines, reference_texts)
+        total_hunks += len(hunks)
+        parts.append(f'<div style="font-weight:bold;margin:8px 0 4px;border-bottom:1px solid #dee2e6;padding-bottom:4px;color:#212529;">'
+                     f'Document (word-level) &mdash; {len(hunks)} hunk(s), {len(result.word_differences)} difference(s)</div>')
+        for i, hunk in enumerate(hunks):
+            if i >= MAX_HUNKS_BEFORE_COLLAPSE:
+                parts.append(f'<div style="color:#6c757d;font-style:italic;margin:4px 0;">... {len(hunks) - i} more hunk(s) not shown</div>')
+                break
+            _render_hunk_to_parts(parts, hunk, i + 1, index_label="word")
+
+    # Summary line
+    if result.summary:
+        parts.append('<div style="margin-top:8px;padding-top:4px;border-top:1px solid #dee2e6;color:#495057;font-size:11px;">')
+        parts.extend(f'<div>{_escape(str(entry))}</div>' for entry in result.summary)
+        parts.append('</div>')
+
+    parts.extend(('</div>', '</div>'))  # close content, then outer container
+    summary.hunk_count = total_hunks
+    return "\n".join(parts)
+
+
+def _render_hunk_to_parts(parts: List[str], hunk: dict, hunk_number: int, index_label: str = "line") -> None:
+    """Append the HTML for a single hunk to ``parts``.
+
+    Emits the hunk header, optional leading context, one entry per
+    difference and optional trailing context, wrapped in one ``<div>``.
+    """
+    first, last = hunk["start_index"], hunk["end_index"]
+    # A one-entry hunk is labelled in the singular, a range in the plural.
+    label = f"{index_label} {first}" if first == last else f"{index_label}s {first}&ndash;{last}"
+
+    def context_row(texts) -> str:
+        joined = " | ".join(_truncate(t, 40) for t in texts)
+        return f'<div style="color:#6c757d;font-size:11px;padding:1px 0;">... {_escape(joined)} ...</div>'
+
+    parts.append('<div style="margin:4px 0;padding:4px 8px;border-left:3px solid #6c757d;background:#f8f9fa;">')
+    parts.append(f'<div style="font-size:10px;color:#6c757d;margin-bottom:2px;">Hunk {hunk_number} ({label})</div>')
+
+    leading = hunk["context_before"]
+    if leading:
+        parts.append(context_row(leading))
+
+    parts.extend(_render_diff_html(diff) for diff in hunk["differences"])
+
+    trailing = hunk["context_after"]
+    if trailing:
+        parts.append(context_row(trailing))
+    parts.append('</div>')
+
+
+def build_structure_report_plain_text(
+    result: StructureComparisonResult,
+    *,
+    metadata: Optional[ReportMetadata] = None,
+    context_lines: int = DEFAULT_CONTEXT_LINES,
+    reference_texts: Optional[List[str]] = None,
+    candidate_texts: Optional[List[str]] = None,
+) -> str:
+    """Render the comparison result as a plain-text report.
+
+    The text consists of a banner, optional metadata, the per-category
+    counts and one section per page, for the document text and for the
+    word level; a section lists at most ``MAX_HUNKS_BEFORE_COLLAPSE`` hunks.
+
+    Args:
+        result: Comparison outcome to render.
+        metadata: Optional names, mode and exclusions for the header.
+        context_lines: Context size forwarded to the hunk grouping.
+        reference_texts: Texts the hunk grouping takes context from.
+        candidate_texts: Not used by this function.
+
+    Returns:
+        The report lines joined by newlines; ``""`` if ``result.passed``.
+    """
+    if result.passed:
+        return ""
+
+    counts = _compute_summary(result)
+    banner = "=" * 60
+    rule = "-" * 60
+    out: List[str] = [banner, "PDF Structure Comparison Report", banner]
+
+    def add_section(heading: str, section_diffs, index_label: str) -> None:
+        # One heading line followed by the first hunks of this section.
+        section_hunks = _group_into_hunks(section_diffs, context_lines, reference_texts)
+        out.append(f"\n{heading} -- {len(section_hunks)} hunk(s), {len(section_diffs)} difference(s)")
+        for position, hunk in enumerate(section_hunks):
+            if position >= MAX_HUNKS_BEFORE_COLLAPSE:
+                out.append(f"  ... {len(section_hunks) - position} more hunk(s) not shown")
+                break
+            _render_hunk_plain(out, hunk, position + 1, index_label=index_label)
+
+    if metadata:
+        out.append(f"Reference: {metadata.reference_name}")
+        out.append(f"Candidate: {metadata.candidate_name}")
+        out.append(f"Mode: {metadata.comparison_mode}")
+        if metadata.exclusions_applied:
+            out.append(f"Exclusions: {', '.join(metadata.exclusions_applied)}")
+
+    out.append(rule)
+    out.append(f"{counts.total_differences} difference(s): "
+               f"{counts.missing_count} missing, {counts.extra_count} extra, "
+               f"{counts.mismatch_count} mismatch, {counts.geometry_count} geometry, "
+               f"{counts.other_count} other")
+    out.append(rule)
+
+    # Page-level
+    if result.page_differences:
+        for page_num in sorted(result.page_differences.keys()):
+            add_section(f"Page {page_num}", result.page_differences[page_num], "line")
+
+    # Document-level
+    if result.document_differences:
+        add_section("Document (text-only)", result.document_differences, "line")
+
+    # Word-level
+    if hasattr(result, 'word_differences') and result.word_differences:
+        add_section("Document (word-level)", result.word_differences, "word")
+
+    if result.summary:
+        out.append("")
+        out.extend(f"Note: {entry}" for entry in result.summary)
+
+    out.append(banner)
+    return "\n".join(out)
+
+
+def _render_hunk_plain(lines: List[str], hunk: dict, hunk_number: int, index_label: str = "line") -> None:
+    """Append the plain-text rendering of one hunk to ``lines``."""
+    first, last = hunk["start_index"], hunk["end_index"]
+    # Singular label for a one-entry hunk, plural range otherwise.
+    span = f"{index_label} {first}" if first == last else f"{index_label}s {first}-{last}"
+    lines.append(f"  Hunk {hunk_number} ({span})")
+
+    def context_row(texts) -> str:
+        # Context texts go through ``_truncate(..., 40)`` before being joined.
+        return "  ... " + " | ".join(_truncate(t, 40) for t in texts) + " ..."
+
+    leading = hunk["context_before"]
+    if leading:
+        lines.append(context_row(leading))
+
+    lines.extend(_render_diff_plain(diff) for diff in hunk["differences"])
+
+    trailing = hunk["context_after"]
+    if trailing:
+        lines.append(context_row(trailing))
diff --git a/DocTest/TextNormalization.py b/DocTest/TextNormalization.py
index 0153841..f8c9edf 100644
--- a/DocTest/TextNormalization.py
+++ b/DocTest/TextNormalization.py
@@ -1,6 +1,6 @@
 from __future__ import annotations
 
-from typing import Dict, Optional
+from typing import Dict, List, Optional, Set, Tuple
 
 
 _LIGATURE_MAP: Dict[str, str] = {
@@ -22,6 +22,71 @@ def normalize_ligatures(text: str) -> str:
     return "".join(_LIGATURE_MAP.get(char, char) for char in text)
 
 
+_WORD_BOUNDARY_CONNECTORS: Set[str] = frozenset("/\\-")
+
+
+def merge_split_words(
+    words: List[str],
+    tokens: "List[WordToken]",
+    connectors: Set[str] | None = None,
+) -> "Tuple[List[str], List[WordToken]]":
+    """Merge word tokens that were split across PDF line boundaries.
+
+    When text reflows across lines in a PDF, words containing connector
+    characters (like ``/``, ``-``, ``\\``) can be split into separate tokens.
+    For example, ``JS2_D48/F16/H8`` may become ``["JS2_D48/F16/", "H8"]``
+    when the line break falls after the ``/``.
+
+    A token is appended to the previous word only if that word ends with a
+    connector and its most recent fragment lies on a different line. The
+    merged token is rebuilt with the class and provenance of the first
+    fragment.
+
+    Args:
+        words: Flat list of word strings.
+        tokens: Provenance objects, one per entry of ``words``.
+        connectors: Set of characters that indicate a word was split.
+            Defaults to ``_WORD_BOUNDARY_CONNECTORS`` (``/``, ``\\``, ``-``).
+
+    Returns:
+        Tuple of (merged_words, merged_tokens) as new lists.
+    """
+    if len(words) <= 1:
+        return list(words), list(tokens)
+
+    if connectors is None:
+        connectors = _WORD_BOUNDARY_CONNECTORS
+
+    merged_words: List[str] = [words[0]]
+    merged_tokens: "List[WordToken]" = [tokens[0]]
+    # Line of the newest fragment in merged_words[-1]; the merged token itself
+    # keeps the first fragment's line, which would glue on same-line followers.
+    last_line_index = tokens[0].source_line_index
+
+    for i in range(1, len(words)):
+        prev_token = merged_tokens[-1]
+        curr_token = tokens[i]
+        prev_word = merged_words[-1]
+        # Merge only across a line break and never onto a bare connector
+        # (a lone "-" is punctuation, not the head of a split word).
+        if (last_line_index != curr_token.source_line_index
+                and len(prev_word) > 1
+                and prev_word[-1] in connectors):
+            merged_words[-1] = prev_word + words[i]
+            merged_tokens[-1] = type(prev_token)(
+                text=merged_words[-1],
+                source_page=prev_token.source_page,
+                source_line_index=prev_token.source_line_index,
+                word_index=prev_token.word_index,
+            )
+        else:
+            merged_words.append(words[i])
+            merged_tokens.append(curr_token)
+        last_line_index = curr_token.source_line_index
+
+    return merged_words, merged_tokens
+
+
 def apply_character_replacements(
     text: str,
     replacements: Optional[Dict[str, str]] = None,
diff --git a/utest/test_compare_document_words.py b/utest/test_compare_document_words.py
new file mode 100644
index 0000000..f579e1a
--- /dev/null
+++ b/utest/test_compare_document_words.py
@@ -0,0 +1,268 @@
+"""Unit tests for compare_document_words() -- ADR-001 Word-Level Token Comparison."""
+
+import pytest
+
+from DocTest.PdfStructureComparator import (
+    DocumentWordDifference,
+    StructureComparisonResult,
+    compare_document_words,
+)
+from DocTest.PdfStructureModels import (
+    DocumentStructure,
+    PageStructure,
+    StructureExtractionConfig,
+    TextBlock,
+    TextLine,
+)
+
+
+# ---------------------------------------------------------------------------
+# Helper
+# ---------------------------------------------------------------------------
+
+
+def _make_doc(*page_texts):
+    """Build a DocumentStructure whose pages hold the given line texts.
+
+    Every positional argument is the list of line strings of one page, so
+    ``_make_doc(["line1", "line2"], ["line3"])`` yields two pages. Each
+    page gets exactly one block containing all of its lines.
+    """
+    extraction_config = StructureExtractionConfig()
+    def single_block_page(number, texts):
+        # Every line gets the same fixed bounding box.
+        page_lines = [
+            TextLine(index=line_no, text=line_text, bbox=(0.0, 0.0, 100.0, 10.0))
+            for line_no, line_text in enumerate(texts)
+        ]
+        only_block = TextBlock(index=0, bbox=(0.0, 0.0, 100.0, 100.0), lines=page_lines)
+        return PageStructure(
+            page_number=number, width=612.0, height=792.0, blocks=[only_block]
+        )
+
+    built = [single_block_page(n, texts) for n, texts in enumerate(page_texts)]
+    return DocumentStructure(pages=built, config=extraction_config)
+
+
+# ---------------------------------------------------------------------------
+# Tests
+# ---------------------------------------------------------------------------
+
+
+def test_identical_content_same_lines():
+    """Two documents with the same single line compare as equal."""
+    line = "the quick brown fox"
+    reference, candidate = _make_doc([line]), _make_doc([line])
+    outcome = compare_document_words(reference, candidate)
+    assert outcome.passed
+    assert outcome.word_differences == []
+
+
+def test_identical_content_different_lines():
+    """Identical word sequence with different line breaks must pass.
+
+    This is the central reflow-tolerance check: only the position of the
+    line break differs between reference and candidate.
+    """
+    reference = _make_doc(["the quick brown", "fox jumps"])
+    candidate = _make_doc(["the quick", "brown fox jumps"])
+    outcome = compare_document_words(reference, candidate)
+    assert outcome.passed
+    assert outcome.word_differences == []
+
+
+def test_identical_content_different_pages():
+    """A page break inside the same word sequence does not cause a diff."""
+    two_pages = _make_doc(["hello world"], ["foo bar"])
+    one_page = _make_doc(["hello world foo bar"])
+    outcome = compare_document_words(two_pages, one_page)
+    assert outcome.passed
+    assert outcome.word_differences == []
+
+
+def test_single_word_replacement():
+    """Replacing 'fox' with 'cat' is reported as a word_mismatch."""
+    reference = _make_doc(["the quick fox"])
+    candidate = _make_doc(["the quick cat"])
+    outcome = compare_document_words(reference, candidate)
+    assert not outcome.passed
+    assert len(outcome.word_differences) >= 1
+    mismatches = [
+        entry for entry in outcome.word_differences
+        if entry.diff_type == "word_mismatch"
+    ]
+    assert len(mismatches) >= 1
+    assert "fox" in mismatches[0].ref_words
+    assert "cat" in mismatches[0].cand_words
+
+
+def test_single_word_insertion():
+    """A word present only in the candidate is reported as extra_words."""
+    reference = _make_doc(["the fox"])
+    candidate = _make_doc(["the quick fox"])
+    outcome = compare_document_words(reference, candidate)
+    assert not outcome.passed
+    extras = [
+        entry for entry in outcome.word_differences
+        if entry.diff_type == "extra_words"
+    ]
+    assert len(extras) >= 1
+    assert "quick" in extras[0].cand_words
+
+
+def test_single_word_deletion():
+    """A word present only in the reference is reported as missing_words."""
+    reference = _make_doc(["the quick fox"])
+    candidate = _make_doc(["the fox"])
+    outcome = compare_document_words(reference, candidate)
+    assert not outcome.passed
+    missing = [
+        entry for entry in outcome.word_differences
+        if entry.diff_type == "missing_words"
+    ]
+    assert len(missing) >= 1
+    assert "quick" in missing[0].ref_words
+
+
+def test_multi_word_replacement():
+    """Two adjacent replaced words end up in the same mismatch entry."""
+    reference = _make_doc(["the quick brown fox"])
+    candidate = _make_doc(["the slow red fox"])
+    outcome = compare_document_words(reference, candidate)
+    assert not outcome.passed
+    mismatches = [
+        entry for entry in outcome.word_differences
+        if entry.diff_type == "word_mismatch"
+    ]
+    assert len(mismatches) >= 1
+    # Both replaced words must be found in the first mismatch entry.
+    grouped = mismatches[0]
+    assert grouped.ref_words is not None
+    assert grouped.cand_words is not None
+    for removed, added in (("quick", "slow"), ("brown", "red")):
+        assert removed in grouped.ref_words
+        assert added in grouped.cand_words
+
+
+def test_case_sensitive_default():
+    """By default 'Hello' and 'hello' count as different words."""
+    reference = _make_doc(["Hello World"])
+    candidate = _make_doc(["hello World"])
+    outcome = compare_document_words(reference, candidate)
+    assert not outcome.passed
+    assert len(outcome.word_differences) >= 1
+
+
+def test_case_insensitive():
+    """With case_sensitive=False, differing letter case is not a diff."""
+    reference = _make_doc(["Hello WORLD"])
+    candidate = _make_doc(["hello world"])
+    outcome = compare_document_words(reference, candidate, case_sensitive=False)
+    assert outcome.passed
+    assert outcome.word_differences == []
+
+
+def test_both_empty_documents():
+    """Two documents without any pages compare as equal."""
+    reference = _make_doc()
+    candidate = _make_doc()
+    outcome = compare_document_words(reference, candidate)
+    assert outcome.passed
+    assert outcome.word_differences == []
+
+
+def test_one_empty_one_not():
+    """Text in the reference but an empty candidate produces differences."""
+    reference = _make_doc(["hello world"])
+    candidate = _make_doc()
+    outcome = compare_document_words(reference, candidate)
+    assert not outcome.passed
+    assert len(outcome.word_differences) >= 1
+
+
+def test_difference_count_includes_word_diffs():
+    """difference_count() is at least the number of word differences."""
+    reference = _make_doc(["the quick fox"])
+    candidate = _make_doc(["the slow fox"])
+    outcome = compare_document_words(reference, candidate)
+    assert outcome.difference_count() >= 1
+    assert outcome.difference_count() >= len(outcome.word_differences)
+
+
+def test_word_differences_have_correct_indices():
+    """The index ranges of a difference are half-open word positions."""
+    reference = _make_doc(["a b c d e"])
+    candidate = _make_doc(["a b x d e"])  # third word differs: 'c' -> 'x'
+    outcome = compare_document_words(reference, candidate)
+    assert not outcome.passed
+    assert len(outcome.word_differences) >= 1
+
+    first = outcome.word_differences[0]
+    # All four range bounds must be populated for a replacement.
+    assert first.ref_start_index is not None
+    assert first.ref_end_index is not None
+    assert first.cand_start_index is not None
+    assert first.cand_end_index is not None
+    # 'c' is word 2 of the reference, so its range is [2, 3).
+    assert first.ref_start_index == 2
+    assert first.ref_end_index == 3
+    # 'x' is word 2 of the candidate, so its range is [2, 3) as well.
+    assert first.cand_start_index == 2
+    assert first.cand_end_index == 3
+
+
+def test_reflow_across_lines_and_pages():
+    """The same words pass even when line and page breaks move.
+
+    Reference layout:
+        page 0: ["The quick brown fox", "jumps over the"]
+        page 1: ["lazy dog"]
+
+    Candidate layout:
+        page 0: ["The quick", "brown fox jumps"]
+        page 1: ["over the lazy dog"]
+
+    Both layouts flatten to the identical word sequence.
+    """
+    reference_pages = (
+        ["The quick brown fox", "jumps over the"],
+        ["lazy dog"],
+    )
+    candidate_pages = (
+        ["The quick", "brown fox jumps"],
+        ["over the lazy dog"],
+    )
+    outcome = compare_document_words(_make_doc(*reference_pages), _make_doc(*candidate_pages))
+    assert outcome.passed
+    assert outcome.word_differences == []
+
+
+def test_result_is_structure_comparison_result():
+    """The return value is a StructureComparisonResult instance."""
+    reference = _make_doc(["hello"])
+    candidate = _make_doc(["hello"])
+    outcome = compare_document_words(reference, candidate)
+    assert isinstance(outcome, StructureComparisonResult)
+
+
+def test_word_difference_has_message():
+    """Every reported word difference carries a non-empty string message."""
+    reference = _make_doc(["hello world"])
+    candidate = _make_doc(["hello earth"])
+    outcome = compare_document_words(reference, candidate)
+    assert not outcome.passed
+    for entry in outcome.word_differences:
+        assert isinstance(entry.message, str)
+        assert len(entry.message) > 0
+
+
+def test_empty_ref_nonempty_cand():
+    """An empty reference against real text yields extra_words entries."""
+    reference = _make_doc()
+    candidate = _make_doc(["hello world"])
+    outcome = compare_document_words(reference, candidate)
+    assert not outcome.passed
+    # At least one entry must be flagged as surplus candidate words.
+    assert any(
+        entry.diff_type == "extra_words" for entry in outcome.word_differences
+    )
diff --git a/utest/test_flatten_document_words.py b/utest/test_flatten_document_words.py
new file mode 100644
index 0000000..f977d84
--- /dev/null
+++ b/utest/test_flatten_document_words.py
@@ -0,0 +1,164 @@
+"""Unit tests for flatten_document_words() -- ADR-001 Word-Level Token Comparison."""
+
+import pytest
+
+from DocTest.PdfStructureModels import (
+    DocumentStructure,
+    PageStructure,
+    StructureExtractionConfig,
+    TextBlock,
+    TextLine,
+    WordToken,
+    flatten_document_words,
+)
+
+
+# ---------------------------------------------------------------------------
+# Helper
+# ---------------------------------------------------------------------------
+
+
+def _make_doc(*page_texts):
+    """Build a DocumentStructure whose pages hold the given line texts.
+
+    Every positional argument is the list of line strings of one page, so
+    ``_make_doc(["line1", "line2"], ["line3"])`` yields two pages. Each
+    page gets exactly one block containing all of its lines.
+    """
+    extraction_config = StructureExtractionConfig()
+    def single_block_page(number, texts):
+        # Every line gets the same fixed bounding box.
+        page_lines = [
+            TextLine(index=line_no, text=line_text, bbox=(0.0, 0.0, 100.0, 10.0))
+            for line_no, line_text in enumerate(texts)
+        ]
+        only_block = TextBlock(index=0, bbox=(0.0, 0.0, 100.0, 100.0), lines=page_lines)
+        return PageStructure(
+            page_number=number, width=612.0, height=792.0, blocks=[only_block]
+        )
+
+    built = [single_block_page(n, texts) for n, texts in enumerate(page_texts)]
+    return DocumentStructure(pages=built, config=extraction_config)
+
+
+# ---------------------------------------------------------------------------
+# Tests
+# ---------------------------------------------------------------------------
+
+
+def test_empty_document():
+    """A document without pages flattens to two empty lists."""
+    empty = _make_doc()
+    flat_words, flat_tokens = flatten_document_words(empty)
+    assert flat_words == []
+    assert flat_tokens == []
+
+
+def test_single_line_single_word():
+    """A lone word yields one word string and one matching token."""
+    document = _make_doc(["hello"])
+    flat_words, flat_tokens = flatten_document_words(document)
+    assert flat_words == ["hello"]
+    assert len(flat_tokens) == 1
+    assert flat_tokens[0].text == "hello"
+
+
+def test_single_line_multiple_words():
+    """A two-word line yields two words and two matching tokens."""
+    document = _make_doc(["hello world"])
+    flat_words, flat_tokens = flatten_document_words(document)
+    assert flat_words == ["hello", "world"]
+    assert len(flat_tokens) == 2
+    first_token, second_token = flat_tokens
+    assert (first_token.text, second_token.text) == ("hello", "world")
+
+
+def test_multiple_lines():
+    """Words of consecutive lines are returned in reading order."""
+    document = _make_doc(["foo bar", "baz"])
+    flat_words, flat_tokens = flatten_document_words(document)
+    assert flat_words == ["foo", "bar", "baz"]
+    assert len(flat_tokens) == 3
+
+
+def test_multiple_pages():
+    """Words of the second page follow those of the first page."""
+    document = _make_doc(["alpha beta"], ["gamma"])
+    flat_words, flat_tokens = flatten_document_words(document)
+    assert flat_words == ["alpha", "beta", "gamma"]
+    assert len(flat_tokens) == 3
+
+
+def test_empty_lines_skipped():
+    """A line whose text is empty contributes no words or tokens."""
+    document = _make_doc(["hello", "", "world"])
+    flat_words, flat_tokens = flatten_document_words(document)
+    assert flat_words == ["hello", "world"]
+    assert len(flat_tokens) == 2
+
+
+def test_whitespace_only_lines_skipped():
+    """A line made only of spaces contributes no words or tokens."""
+    document = _make_doc(["hello", "   ", "world"])
+    flat_words, flat_tokens = flatten_document_words(document)
+    assert flat_words == ["hello", "world"]
+    assert len(flat_tokens) == 2
+
+
+def test_provenance_metadata_correct():
+    """Each token records its page, its line index and its word index."""
+    document = _make_doc(["a b"], ["c"])
+    _, flat_tokens = flatten_document_words(document)
+    token_a, token_b, token_c = flat_tokens[:3]
+    # 'a': page 0, line index 0, word index 0.
+    assert token_a.text == "a"
+    assert token_a.source_page == 0
+    assert token_a.source_line_index == 0
+    assert token_a.word_index == 0
+
+    # 'b': same page and line as 'a', next word index.
+    assert token_b.text == "b"
+    assert token_b.source_page == 0
+    assert token_b.source_line_index == 0
+    assert token_b.word_index == 1
+
+    # 'c': page 1; line index and word index keep counting (1 and 2).
+    assert token_c.text == "c"
+    assert token_c.source_page == 1
+    assert token_c.source_line_index == 1
+    assert token_c.word_index == 2
+
+
+def test_multiple_spaces_normalized():
+    """Runs of spaces between two words do not create extra words."""
+    document = _make_doc(["hello   world"])
+    flat_words, flat_tokens = flatten_document_words(document)
+    assert flat_words == ["hello", "world"]
+    assert len(flat_tokens) == 2
+
+
+def test_word_index_is_global():
+    """word_index keeps counting across lines and pages."""
+    document = _make_doc(["a b", "c"], ["d e f"])
+    flat_words, flat_tokens = flatten_document_words(document)
+    assert flat_words == ["a", "b", "c", "d", "e", "f"]
+
+    # Six words in total, so the indices must be exactly 0..5 in order.
+    observed = [token.word_index for token in flat_tokens]
+    assert observed == list(range(6))
+
+
+def test_word_token_is_frozen():
+    """Assigning to a WordToken attribute raises AttributeError."""
+    frozen = WordToken(text="hello", source_page=0, source_line_index=0, word_index=0)
+    with pytest.raises(AttributeError):
+        frozen.text = "changed"
+
+
+def test_words_and_tokens_have_same_length():
+    """Words and tokens line up one-to-one and carry the same text."""
+    document = _make_doc(["the quick brown fox", "jumps over"], ["the lazy dog"])
+    flat_words, flat_tokens = flatten_document_words(document)
+    assert len(flat_words) == len(flat_tokens)
+    for position, word in enumerate(flat_words):
+        assert word == flat_tokens[position].text
diff --git a/utest/test_header_footer_detection.py b/utest/test_header_footer_detection.py
new file mode 100644
index 0000000..ec210b3
--- /dev/null
+++ b/utest/test_header_footer_detection.py
@@ -0,0 +1,539 @@
+"""Unit tests for HeaderFooterDetector module (ADR-002).
+
+Tests cover repetition-based detection of headers/footers, stripping of
+detected lines, digit normalization for page numbers, and the convenience
+filter_headers_footers function.
+"""
+
+import pytest
+
+from DocTest.HeaderFooterDetector import (
+    DetectionResult,
+    HeaderFooterConfig,
+    _normalize_for_grouping,
+    detect_repeating_headers_footers,
+    filter_headers_footers,
+    strip_detected_headers_footers,
+)
+from DocTest.PdfStructureModels import (
+    DocumentStructure,
+    PageStructure,
+    StructureExtractionConfig,
+    TextBlock,
+    TextLine,
+    flatten_document_text,
+)
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+
+def _make_page(page_number, lines_data, width=612, height=792):
+    """Create a PageStructure from line data.
+
+    Args:
+        page_number: The 1-based page number.
+        lines_data: list of (text, y_top, y_bottom) tuples.
+            Each line gets bbox = (0, y_top, width, y_bottom).
+        width: Page width in PDF points.
+        height: Page height in PDF points.
+
+    Returns:
+        A PageStructure suitable for testing.
+    """
+    text_lines = []
+    for i, (text, y_top, y_bottom) in enumerate(lines_data):
+        text_lines.append(
+            TextLine(
+                index=i,
+                text=text,
+                bbox=(0.0, float(y_top), float(width), float(y_bottom)),
+            )
+        )
+    block = TextBlock(index=0, bbox=(0, 0, width, height), lines=text_lines)
+    return PageStructure(
+        page_number=page_number, width=width, height=height, blocks=[block]
+    )
+
+
+def _make_doc(*pages):
+    """Create a DocumentStructure from PageStructure objects."""
+    config = StructureExtractionConfig()
+    return DocumentStructure(pages=list(pages), config=config)
+
+
+# ---------------------------------------------------------------------------
+# Normalization helper tests
+# ---------------------------------------------------------------------------
+
+
+class TestNormalizeForGrouping:
+    """Tests for the _normalize_for_grouping helper."""
+
+    def test_replaces_single_digit(self):
+        assert _normalize_for_grouping("Page 1") == "Page #"
+
+    def test_replaces_multiple_digit_runs(self):
+        assert _normalize_for_grouping("Page 1 of 5") == "Page # of #"
+
+    def test_no_digits_unchanged(self):
+        assert _normalize_for_grouping("ACME Corp") == "ACME Corp"
+
+    def test_multi_digit_run(self):
+        assert _normalize_for_grouping("2024-01-15") == "#-#-#"
+
+    def test_empty_string(self):
+        assert _normalize_for_grouping("") == ""
+
+    def test_standalone_page_number(self):
+        assert _normalize_for_grouping("42") == "#"
+
+
+# ---------------------------------------------------------------------------
+# Config tests
+# ---------------------------------------------------------------------------
+
+
+class TestHeaderFooterConfig:
+    """Tests for HeaderFooterConfig properties."""
+
+    def test_detection_disabled_when_scan_height_zero(self):
+        """Both scan heights 0 means detection is disabled."""
+        config = HeaderFooterConfig(header_scan_height=0, footer_scan_height=0)
+        assert config.enabled is False
+
+    def test_config_enabled_with_header_only(self):
+        """Detection is enabled when only header_scan_height > 0."""
+        config = HeaderFooterConfig(header_scan_height=50, footer_scan_height=0)
+        assert config.enabled is True
+
+    def test_config_enabled_with_footer_only(self):
+        """Detection is enabled when only footer_scan_height > 0."""
+        config = HeaderFooterConfig(header_scan_height=0, footer_scan_height=50)
+        assert config.enabled is True
+
+    def test_config_enabled_with_both(self):
+        """Detection is enabled when both scan heights > 0."""
+        config = HeaderFooterConfig(header_scan_height=50, footer_scan_height=50)
+        assert config.enabled is True
+
+
+# ---------------------------------------------------------------------------
+# Detection tests
+# ---------------------------------------------------------------------------
+
+
+class TestDetectRepeatingHeadersFooters:
+    """Tests for detect_repeating_headers_footers."""
+
+    def test_disabled_config_returns_empty_result(self):
+        """When config.enabled is False, detection returns empty result."""
+        doc = _make_doc(
+            _make_page(1, [("ACME Corp", 10, 25), ("Body text", 100, 115)]),
+            _make_page(2, [("ACME Corp", 10, 25), ("More text", 100, 115)]),
+        )
+        config = HeaderFooterConfig(header_scan_height=0, footer_scan_height=0)
+        result = detect_repeating_headers_footers(doc, config)
+        assert result.header_keys == frozenset()
+        assert result.footer_keys == frozenset()
+        assert result.has_detections is False
+
+    def test_detects_identical_header_on_all_pages(self):
+        """Identical text in header region on all pages is detected."""
+        doc = _make_doc(
+            _make_page(1, [("ACME Corp", 10, 25), ("Body page 1", 100, 115)]),
+            _make_page(2, [("ACME Corp", 10, 25), ("Body page 2", 100, 115)]),
+            _make_page(3, [("ACME Corp", 10, 25), ("Body page 3", 100, 115)]),
+        )
+        config = HeaderFooterConfig(header_scan_height=50, repeat_threshold=2)
+        result = detect_repeating_headers_footers(doc, config)
+        assert "ACME Corp" in result.header_keys
+        assert result.has_detections is True
+
+    def test_does_not_detect_non_repeating_text(self):
+        """Unique text in header region across pages is not detected."""
+        doc = _make_doc(
+            _make_page(1, [("Title A", 10, 25), ("Body 1", 100, 115)]),
+            _make_page(2, [("Title B", 10, 25), ("Body 2", 100, 115)]),
+            _make_page(3, [("Title C", 10, 25), ("Body 3", 100, 115)]),
+        )
+        config = HeaderFooterConfig(header_scan_height=50, repeat_threshold=2)
+        result = detect_repeating_headers_footers(doc, config)
+        assert result.header_keys == frozenset()
+        assert result.has_detections is False
+
+    def test_detects_header_with_page_numbers(self):
+        """Page-number variants normalize to the same key and are detected."""
+        doc = _make_doc(
+            _make_page(1, [("Page 1 of 5", 10, 25), ("Body 1", 100, 115)]),
+            _make_page(2, [("Page 2 of 5", 10, 25), ("Body 2", 100, 115)]),
+            _make_page(3, [("Page 3 of 5", 10, 25), ("Body 3", 100, 115)]),
+        )
+        config = HeaderFooterConfig(header_scan_height=50, repeat_threshold=2)
+        result = detect_repeating_headers_footers(doc, config)
+        assert "Page # of #" in result.header_keys
+
+    def test_respects_repeat_threshold_below(self):
+        """Text repeating on fewer pages than threshold is not detected."""
+        doc = _make_doc(
+            _make_page(1, [("Header", 10, 25), ("Body 1", 100, 115)]),
+            _make_page(2, [("Header", 10, 25), ("Body 2", 100, 115)]),
+            _make_page(3, [("Header", 10, 25), ("Body 3", 100, 115)]),
+            _make_page(4, [("Unique", 10, 25), ("Body 4", 100, 115)]),
+            _make_page(5, [("Unique2", 10, 25), ("Body 5", 100, 115)]),
+        )
+        config = HeaderFooterConfig(header_scan_height=50, repeat_threshold=4)
+        result = detect_repeating_headers_footers(doc, config)
+        # "Header" only on 3 pages, threshold is 4
+        assert "Header" not in result.header_keys
+
+    def test_respects_repeat_threshold_at_boundary(self):
+        """Text repeating on exactly threshold pages is detected."""
+        doc = _make_doc(
+            _make_page(1, [("Header", 10, 25), ("Body 1", 100, 115)]),
+            _make_page(2, [("Header", 10, 25), ("Body 2", 100, 115)]),
+            _make_page(3, [("Header", 10, 25), ("Body 3", 100, 115)]),
+            _make_page(4, [("Unique", 10, 25), ("Body 4", 100, 115)]),
+            _make_page(5, [("Unique2", 10, 25), ("Body 5", 100, 115)]),
+        )
+        config = HeaderFooterConfig(header_scan_height=50, repeat_threshold=3)
+        result = detect_repeating_headers_footers(doc, config)
+        assert "Header" in result.header_keys
+
+    def test_single_page_no_detection(self):
+        """Single page document never reaches threshold=2."""
+        doc = _make_doc(
+            _make_page(1, [("Header", 10, 25), ("Body", 100, 115)]),
+        )
+        config = HeaderFooterConfig(header_scan_height=50, repeat_threshold=2)
+        result = detect_repeating_headers_footers(doc, config)
+        assert result.header_keys == frozenset()
+        assert result.has_detections is False
+
+    def test_footer_detection(self):
+        """Text in footer region repeating across pages is detected."""
+        # Page height = 792, footer_scan_height = 50 -> boundary at 742
+        # Lines at y_bottom=770 are past 742 -> in footer region
+        doc = _make_doc(
+            _make_page(1, [("Body 1", 100, 115), ("Copyright 2024", 755, 770)]),
+            _make_page(2, [("Body 2", 100, 115), ("Copyright 2024", 755, 770)]),
+            _make_page(3, [("Body 3", 100, 115), ("Copyright 2024", 755, 770)]),
+        )
+        config = HeaderFooterConfig(footer_scan_height=50, repeat_threshold=2)
+        result = detect_repeating_headers_footers(doc, config)
+        assert "Copyright #" in result.footer_keys
+        assert result.has_detections is True
+
+    def test_header_and_footer_simultaneously(self):
+        """Both header and footer can be detected independently."""
+        doc = _make_doc(
+            _make_page(1, [("ACME Corp", 10, 25), ("Body 1", 400, 415), ("Page 1", 760, 775)]),
+            _make_page(2, [("ACME Corp", 10, 25), ("Body 2", 400, 415), ("Page 2", 760, 775)]),
+            _make_page(3, [("ACME Corp", 10, 25), ("Body 3", 400, 415), ("Page 3", 760, 775)]),
+        )
+        config = HeaderFooterConfig(
+            header_scan_height=50, footer_scan_height=50, repeat_threshold=2
+        )
+        result = detect_repeating_headers_footers(doc, config)
+        assert "ACME Corp" in result.header_keys
+        assert "Page #" in result.footer_keys
+
+    def test_standalone_page_number_detection(self):
+        """Standalone page numbers like '1', '2', '3' normalize to '#'."""
+        doc = _make_doc(
+            _make_page(1, [("1", 10, 25), ("Body 1", 100, 115)]),
+            _make_page(2, [("2", 10, 25), ("Body 2", 100, 115)]),
+            _make_page(3, [("3", 10, 25), ("Body 3", 100, 115)]),
+        )
+        config = HeaderFooterConfig(header_scan_height=50, repeat_threshold=2)
+        result = detect_repeating_headers_footers(doc, config)
+        assert "#" in result.header_keys
+
+    def test_threshold_greater_than_page_count(self):
+        """When threshold exceeds page count, nothing can be detected."""
+        doc = _make_doc(
+            _make_page(1, [("Header", 10, 25), ("Body 1", 100, 115)]),
+            _make_page(2, [("Header", 10, 25), ("Body 2", 100, 115)]),
+            _make_page(3, [("Header", 10, 25), ("Body 3", 100, 115)]),
+        )
+        config = HeaderFooterConfig(header_scan_height=50, repeat_threshold=5)
+        result = detect_repeating_headers_footers(doc, config)
+        assert result.header_keys == frozenset()
+        assert result.has_detections is False
+
+    def test_line_outside_scan_region_not_counted(self):
+        """A line whose y_top lies past header_scan_height is not a header candidate."""
+        doc = _make_doc(
+            _make_page(1, [("ACME Corp", 60, 75), ("Body 1", 100, 115)]),
+            _make_page(2, [("ACME Corp", 60, 75), ("Body 2", 100, 115)]),
+            _make_page(3, [("ACME Corp", 60, 75), ("Body 3", 100, 115)]),
+        )
+        config = HeaderFooterConfig(header_scan_height=50, repeat_threshold=2)
+        result = detect_repeating_headers_footers(doc, config)
+        # y_top=60 >= header_scan_height=50, so not in header region
+        assert "ACME Corp" not in result.header_keys
+
+
+# ---------------------------------------------------------------------------
+# Stripping tests
+# ---------------------------------------------------------------------------
+
+
+class TestStripDetectedHeadersFooters:
+    """Tests for strip_detected_headers_footers."""
+
+    def test_strips_detected_headers_preserves_body(self):
+        """Detected header lines are removed; body lines remain."""
+        doc = _make_doc(
+            _make_page(1, [("ACME Corp", 10, 25), ("Body line 1", 100, 115), ("Body line 2", 200, 215)]),
+            _make_page(2, [("ACME Corp", 10, 25), ("Body line 3", 100, 115)]),
+        )
+        config = HeaderFooterConfig(header_scan_height=50, repeat_threshold=2)
+        detection = detect_repeating_headers_footers(doc, config)
+        result = strip_detected_headers_footers(doc, detection, config)
+
+        # All body lines preserved
+        all_texts = flatten_document_text(result)
+        assert "Body line 1" in all_texts
+        assert "Body line 2" in all_texts
+        assert "Body line 3" in all_texts
+        # Header removed
+        assert "ACME Corp" not in all_texts
+
+    def test_body_text_matching_header_not_stripped(self):
+        """A body-region line survives stripping even if its text matches a header key."""
+        doc = _make_doc(
+            _make_page(1, [("ACME Corp", 10, 25), ("ACME Corp", 400, 415)]),
+            _make_page(2, [("ACME Corp", 10, 25), ("Other body", 400, 415)]),
+            _make_page(3, [("ACME Corp", 10, 25), ("More body", 400, 415)]),
+        )
+        config = HeaderFooterConfig(header_scan_height=50, repeat_threshold=2)
+        detection = detect_repeating_headers_footers(doc, config)
+        result = strip_detected_headers_footers(doc, detection, config)
+
+        # Page 1: header "ACME Corp" at y=10 removed, body "ACME Corp" at y=400 preserved
+        page1_texts = []
+        for block in result.pages[0].blocks:
+            for line in block.lines:
+                page1_texts.append(line.text)
+        assert page1_texts == ["ACME Corp"]  # Exactly one copy left: the body-region line
+
+    def test_strips_page_number_variants(self):
+        """Different page-number variants sharing the same key are all stripped."""
+        doc = _make_doc(
+            _make_page(1, [("Page 1 of 5", 10, 25), ("Body A", 100, 115)]),
+            _make_page(2, [("Page 2 of 5", 10, 25), ("Body B", 100, 115)]),
+        )
+        config = HeaderFooterConfig(header_scan_height=50, repeat_threshold=2)
+        detection = detect_repeating_headers_footers(doc, config)
+        result = strip_detected_headers_footers(doc, detection, config)
+
+        all_texts = flatten_document_text(result)
+        assert "Page 1 of 5" not in all_texts
+        assert "Page 2 of 5" not in all_texts
+        assert "Body A" in all_texts
+        assert "Body B" in all_texts
+
+    def test_re_indexing_after_strip(self):
+        """After stripping, remaining lines have contiguous indices starting at 0."""
+        doc = _make_doc(
+            _make_page(1, [
+                ("Header", 10, 25),
+                ("Line A", 100, 115),
+                ("Line B", 200, 215),
+                ("Line C", 300, 315),
+            ]),
+            _make_page(2, [
+                ("Header", 10, 25),
+                ("Line D", 100, 115),
+            ]),
+        )
+        config = HeaderFooterConfig(header_scan_height=50, repeat_threshold=2)
+        detection = detect_repeating_headers_footers(doc, config)
+        result = strip_detected_headers_footers(doc, detection, config)
+
+        # Page 1 should have lines indexed 0, 1, 2
+        page1_indices = [
+            line.index for block in result.pages[0].blocks for line in block.lines
+        ]
+        assert page1_indices == [0, 1, 2]
+
+        # Page 2 should have line indexed 0
+        page2_indices = [
+            line.index for block in result.pages[1].blocks for line in block.lines
+        ]
+        assert page2_indices == [0]
+
+    def test_empty_blocks_removed_after_strip(self):
+        """A block whose only line is a header gets removed entirely."""
+        # Create a page with two blocks: one with only a header, one with body
+        header_line = TextLine(index=0, text="Header", bbox=(0.0, 10.0, 612.0, 25.0))
+        body_line = TextLine(index=1, text="Body text", bbox=(0.0, 100.0, 612.0, 115.0))
+        header_block = TextBlock(index=0, bbox=(0, 0, 612, 30), lines=[header_line])
+        body_block = TextBlock(index=1, bbox=(0, 90, 612, 120), lines=[body_line])
+        page1 = PageStructure(page_number=1, width=612, height=792, blocks=[header_block, body_block])
+
+        header_line2 = TextLine(index=0, text="Header", bbox=(0.0, 10.0, 612.0, 25.0))
+        body_line2 = TextLine(index=1, text="More text", bbox=(0.0, 100.0, 612.0, 115.0))
+        header_block2 = TextBlock(index=0, bbox=(0, 0, 612, 30), lines=[header_line2])
+        body_block2 = TextBlock(index=1, bbox=(0, 90, 612, 120), lines=[body_line2])
+        page2 = PageStructure(page_number=2, width=612, height=792, blocks=[header_block2, body_block2])
+
+        doc = _make_doc(page1, page2)
+        config = HeaderFooterConfig(header_scan_height=50, repeat_threshold=2)
+        detection = detect_repeating_headers_footers(doc, config)
+        result = strip_detected_headers_footers(doc, detection, config)
+
+        # Each page should have only 1 block (body_block), header_block removed
+        for page in result.pages:
+            assert len(page.blocks) == 1
+            assert page.blocks[0].lines[0].text != "Header"
+
+    def test_strips_footer_preserves_header_region(self):
+        """Footer stripping does not affect header-region text."""
+        doc = _make_doc(
+            _make_page(1, [("Title", 10, 25), ("Body", 400, 415), ("Footer", 760, 775)]),
+            _make_page(2, [("Title", 10, 25), ("Body 2", 400, 415), ("Footer", 760, 775)]),
+            _make_page(3, [("Title", 10, 25), ("Body 3", 400, 415), ("Footer", 760, 775)]),
+        )
+        # Only detect footer, not header
+        config = HeaderFooterConfig(header_scan_height=0, footer_scan_height=50, repeat_threshold=2)
+        detection = detect_repeating_headers_footers(doc, config)
+        result = strip_detected_headers_footers(doc, detection, config)
+
+        all_texts = flatten_document_text(result)
+        # Footer should be removed
+        assert "Footer" not in all_texts
+        # Header-region text preserved (not scanned as header since header_scan_height=0)
+        assert all_texts.count("Title") == 3
+
+    def test_no_detections_returns_original_structure(self):
+        """When detection has no results, strip returns the original structure."""
+        doc = _make_doc(
+            _make_page(1, [("Unique A", 10, 25), ("Body", 100, 115)]),
+            _make_page(2, [("Unique B", 10, 25), ("Body 2", 100, 115)]),
+        )
+        detection = DetectionResult(header_keys=frozenset(), footer_keys=frozenset())
+        config = HeaderFooterConfig(header_scan_height=50, repeat_threshold=2)
+        result = strip_detected_headers_footers(doc, detection, config)
+        assert result is doc  # Identity check: same object returned
+
+
+# ---------------------------------------------------------------------------
+# Convenience function tests
+# ---------------------------------------------------------------------------
+
+
+class TestFilterHeadersFooters:
+    """Tests for the filter_headers_footers convenience function."""
+
+    def test_filter_headers_footers_end_to_end(self):
+        """filter_headers_footers yields the same flattened text as detect + strip."""
+        doc = _make_doc(
+            _make_page(1, [("ACME Corp", 10, 25), ("Body 1", 100, 115), ("Page 1", 760, 775)]),
+            _make_page(2, [("ACME Corp", 10, 25), ("Body 2", 100, 115), ("Page 2", 760, 775)]),
+            _make_page(3, [("ACME Corp", 10, 25), ("Body 3", 100, 115), ("Page 3", 760, 775)]),
+        )
+        config = HeaderFooterConfig(
+            header_scan_height=50, footer_scan_height=50, repeat_threshold=2
+        )
+
+        # Manual two-step
+        detection = detect_repeating_headers_footers(doc, config)
+        expected = strip_detected_headers_footers(doc, detection, config)
+
+        # Convenience one-step
+        actual = filter_headers_footers(doc, config)
+
+        # Compare text content
+        assert flatten_document_text(actual) == flatten_document_text(expected)
+
+    def test_filter_disabled_returns_same_object(self):
+        """When config.enabled is False, the exact same object is returned."""
+        doc = _make_doc(
+            _make_page(1, [("Header", 10, 25), ("Body", 100, 115)]),
+            _make_page(2, [("Header", 10, 25), ("Body 2", 100, 115)]),
+        )
+        config = HeaderFooterConfig(header_scan_height=0, footer_scan_height=0)
+        result = filter_headers_footers(doc, config)
+        assert result is doc
+
+
+# ---------------------------------------------------------------------------
+# Key scenario: page without header content preserved
+# ---------------------------------------------------------------------------
+
+
+class TestPageWithoutHeaderContentPreserved:
+    """The key scenario: a page that lacks the repeating header must not
+    have its body text incorrectly removed."""
+
+    def test_page_without_header_content_preserved(self):
+        """Pages 1 and 2 have 'HEADER' but page 3 starts with different body
+        text at the same y-position. That body text must NOT be removed."""
+        doc = _make_doc(
+            _make_page(1, [
+                ("HEADER", 10, 25),
+                ("Body page 1", 100, 115),
+            ]),
+            _make_page(2, [
+                ("HEADER", 10, 25),
+                ("Body page 2", 100, 115),
+            ]),
+            _make_page(3, [
+                # No header line -- body text starts at y=10, same as header
+                ("Important content", 10, 25),
+                ("Body page 3", 100, 115),
+            ]),
+        )
+        config = HeaderFooterConfig(header_scan_height=50, repeat_threshold=2)
+        detection = detect_repeating_headers_footers(doc, config)
+
+        # "HEADER" detected as header key
+        assert "HEADER" in detection.header_keys
+
+        result = strip_detected_headers_footers(doc, detection, config)
+        all_texts = flatten_document_text(result)
+
+        # "HEADER" removed from pages 1 and 2
+        assert "HEADER" not in all_texts
+        # "Important content" on page 3 preserved (different key)
+        assert "Important content" in all_texts
+        # All body text preserved
+        assert "Body page 1" in all_texts
+        assert "Body page 2" in all_texts
+        assert "Body page 3" in all_texts
+
+    def test_page_without_footer_content_preserved(self):
+        """Symmetric case: a page missing the footer has its body text at
+        the bottom preserved."""
+        doc = _make_doc(
+            _make_page(1, [
+                ("Body 1", 100, 115),
+                ("FOOTER", 760, 775),
+            ]),
+            _make_page(2, [
+                ("Body 2", 100, 115),
+                ("FOOTER", 760, 775),
+            ]),
+            _make_page(3, [
+                ("Body 3", 100, 115),
+                # Different text in footer region
+                ("Final remarks", 760, 775),
+            ]),
+        )
+        config = HeaderFooterConfig(footer_scan_height=50, repeat_threshold=2)
+        detection = detect_repeating_headers_footers(doc, config)
+
+        assert "FOOTER" in detection.footer_keys
+
+        result = strip_detected_headers_footers(doc, detection, config)
+        all_texts = flatten_document_text(result)
+
+        assert "FOOTER" not in all_texts
+        assert "Final remarks" in all_texts
+        assert "Body 1" in all_texts
+        assert "Body 2" in all_texts
+        assert "Body 3" in all_texts
diff --git a/utest/test_spatial_word_sorting.py b/utest/test_spatial_word_sorting.py
new file mode 100644
index 0000000..f716a23
--- /dev/null
+++ b/utest/test_spatial_word_sorting.py
@@ -0,0 +1,611 @@
+"""Unit tests for build_page_structure_from_words() and the spatial_word_sorting config flag."""
+
+import pytest
+
+from DocTest.PdfStructureModels import (
+    DocumentStructure,
+    PageStructure,
+    StructureExtractionConfig,
+    TextBlock,
+    TextLine,
+    build_page_structure,
+    build_page_structure_from_words,
+    flatten_document_words,
+)
+
+
+# ---------------------------------------------------------------------------
+# Helper
+# ---------------------------------------------------------------------------
+
+
+def _make_word_tuple(text, x0, y0, x1, y1, block_no=0, line_no=0, word_no=0):
+    """Return a tuple in PyMuPDF ``get_text('words')`` format.
+
+    Format: (x0, y0, x1, y1, "word", block_no, line_no, word_no)
+    """
+    return (x0, y0, x1, y1, text, block_no, line_no, word_no)
+
+
+# ---------------------------------------------------------------------------
+# 1. Empty / None inputs
+# ---------------------------------------------------------------------------
+
+
+def test_empty_words_list():
+    """Empty input returns PageStructure with no blocks."""
+    page = build_page_structure_from_words(0, [], page_width=612.0, page_height=792.0)
+    assert isinstance(page, PageStructure)
+    assert page.page_number == 0
+    assert page.blocks == []
+    assert page.width == 612.0
+    assert page.height == 792.0
+
+
+def test_none_words_list():
+    """None input returns PageStructure with no blocks."""
+    page = build_page_structure_from_words(0, None, page_width=612.0, page_height=792.0)
+    assert isinstance(page, PageStructure)
+    assert page.blocks == []
+
+
+# ---------------------------------------------------------------------------
+# 2. Single word
+# ---------------------------------------------------------------------------
+
+
+def test_single_word():
+    """One word produces one block with one line."""
+    words = [_make_word_tuple("hello", 10.0, 100.0, 50.0, 112.0)]
+    page = build_page_structure_from_words(0, words, page_width=612.0, page_height=792.0)
+
+    assert len(page.blocks) == 1
+    block = page.blocks[0]
+    assert block.line_count == 1
+    assert block.lines[0].text == "hello"
+    assert len(block.lines[0].spans) == 1
+    assert block.lines[0].spans[0].text == "hello"
+
+
+# ---------------------------------------------------------------------------
+# 3. Single line, multiple words
+# ---------------------------------------------------------------------------
+
+
+def test_single_line_multiple_words():
+    """Multiple words at the same Y position produce one line, sorted by x0."""
+    words = [
+        _make_word_tuple("world", 60.0, 100.0, 110.0, 112.0, word_no=1),
+        _make_word_tuple("hello", 10.0, 100.0, 50.0, 112.0, word_no=0),
+        _make_word_tuple("!", 120.0, 100.0, 130.0, 112.0, word_no=2),
+    ]
+    page = build_page_structure_from_words(0, words, page_width=612.0, page_height=792.0)
+
+    assert len(page.blocks) == 1
+    assert page.blocks[0].lines[0].text == "hello world !"
+
+
+# ---------------------------------------------------------------------------
+# 4. Multiple lines
+# ---------------------------------------------------------------------------
+
+
+def test_multiple_lines():
+    """Words at different Y positions produce separate lines sorted top-to-bottom."""
+    words = [
+        # Second line (y ~ 200)
+        _make_word_tuple("second", 10.0, 200.0, 80.0, 212.0),
+        # First line (y ~ 100)
+        _make_word_tuple("first", 10.0, 100.0, 60.0, 112.0),
+        # Third line (y ~ 300)
+        _make_word_tuple("third", 10.0, 300.0, 70.0, 312.0),
+    ]
+    page = build_page_structure_from_words(0, words, page_width=612.0, page_height=792.0)
+
+    assert len(page.blocks) == 3
+    texts = [b.lines[0].text for b in page.blocks]
+    assert texts == ["first", "second", "third"]
+
+
+# ---------------------------------------------------------------------------
+# 5. Multi-column layout
+# ---------------------------------------------------------------------------
+
+
+def test_multi_column_layout():
+    """Three columns at the same Y range produce words interleaved by Y row.
+
+    This is the key scenario: words from different columns that share the
+    same vertical position should be grouped into the same line, ordered
+    left-to-right.
+    """
+    # Row 1 (y=100..112): three columns
+    words = [
+        _make_word_tuple("C1R1", 10.0, 100.0, 60.0, 112.0),
+        _make_word_tuple("C2R1", 210.0, 100.0, 260.0, 112.0),
+        _make_word_tuple("C3R1", 410.0, 100.0, 460.0, 112.0),
+        # Row 2 (y=130..142): three columns
+        _make_word_tuple("C1R2", 10.0, 130.0, 60.0, 142.0),
+        _make_word_tuple("C2R2", 210.0, 130.0, 260.0, 142.0),
+        _make_word_tuple("C3R2", 410.0, 130.0, 460.0, 142.0),
+    ]
+    page = build_page_structure_from_words(0, words, page_width=612.0, page_height=792.0)
+
+    assert len(page.blocks) == 2
+    assert page.blocks[0].lines[0].text == "C1R1 C2R1 C3R1"
+    assert page.blocks[1].lines[0].text == "C1R2 C2R2 C3R2"
+
+
+# ---------------------------------------------------------------------------
+# 6. Mixed font sizes (adaptive tolerance)
+# ---------------------------------------------------------------------------
+
+
+def test_mixed_font_sizes():
+    """Words with different heights at similar Y are grouped using adaptive tolerance.
+
+    Tolerance is min(min_height, word_height) * 0.5.  Words that are close
+    enough vertically should be merged into one line.
+    """
+    # Two words with different heights but nearby Y midpoints.
+    # Word A: height 12, midpoint = 106
+    # Word B: height 20, midpoint = 110
+    # min_height = 12, tolerance = 12 * 0.5 = 6.0
+    # |106 - 110| = 4.0 < 6.0 => same line
+    words = [
+        _make_word_tuple("small", 10.0, 100.0, 60.0, 112.0),   # height=12, mid=106
+        _make_word_tuple("big", 70.0, 100.0, 140.0, 120.0),    # height=20, mid=110
+    ]
+    page = build_page_structure_from_words(0, words, page_width=612.0, page_height=792.0)
+
+    assert len(page.blocks) == 1
+    assert page.blocks[0].lines[0].text == "small big"
+
+
+def test_mixed_font_sizes_separate_lines():
+    """Words whose midpoints differ more than the adaptive tolerance form separate lines."""
+    # Word A: height 10, midpoint = 105
+    # Word B: height 10, midpoint = 120
+    # tolerance = 10 * 0.5 = 5.0
+    # |105 - 120| = 15.0 > 5.0 => different lines
+    words = [
+        _make_word_tuple("line1", 10.0, 100.0, 60.0, 110.0),   # mid=105
+        _make_word_tuple("line2", 10.0, 115.0, 60.0, 125.0),   # mid=120
+    ]
+    page = build_page_structure_from_words(0, words, page_width=612.0, page_height=792.0)
+
+    assert len(page.blocks) == 2
+
+
+# ---------------------------------------------------------------------------
+# 7. Text normalization
+# ---------------------------------------------------------------------------
+
+
+def test_text_normalization_applied():
+    """Whitespace collapsing, ligature normalization, and edge stripping all work."""
+    config = StructureExtractionConfig(
+        collapse_whitespace=True,
+        strip_line_edges=True,
+        normalize_ligatures=True,
+    )
+    # "\ufb01" is the fi ligature
+    words = [
+        _make_word_tuple("  hello  ", 10.0, 100.0, 60.0, 112.0),
+        _make_word_tuple("\ufb01nd", 70.0, 100.0, 120.0, 112.0),
+    ]
+    page = build_page_structure_from_words(
+        0, words, config=config, page_width=612.0, page_height=792.0
+    )
+
+    assert len(page.blocks) == 1
+    assert page.blocks[0].lines[0].text == "hello find"
+
+
+# ---------------------------------------------------------------------------
+# 8. Config hash includes spatial_word_sorting
+# ---------------------------------------------------------------------------
+
+
+def test_config_hash_includes_spatial():
+    """Two configs differing only in spatial_word_sorting hash differently."""
+    c1 = StructureExtractionConfig(spatial_word_sorting=False)
+    c2 = StructureExtractionConfig(spatial_word_sorting=True)
+    assert hash(c1) != hash(c2)
+
+
+def test_config_hash_same_when_equal():
+    """Configs with identical settings hash the same."""
+    c1 = StructureExtractionConfig(spatial_word_sorting=True)
+    c2 = StructureExtractionConfig(spatial_word_sorting=True)
+    assert hash(c1) == hash(c2)
+
+
+# ---------------------------------------------------------------------------
+# 9. Page dimensions from explicit args
+# ---------------------------------------------------------------------------
+
+
+def test_page_dimensions_from_args():
+    """page_width and page_height params are used directly."""
+    page = build_page_structure_from_words(
+        0, [], page_width=500.0, page_height=700.0
+    )
+    assert page.width == 500.0
+    assert page.height == 700.0
+
+
+# ---------------------------------------------------------------------------
+# 10. Page dimensions from image_shape + dpi
+# ---------------------------------------------------------------------------
+
+
+def test_page_dimensions_from_image_shape():
+    """When page_width and page_height are 0, falls back to the image_shape + dpi calculation."""
+    # image_shape: (height_px, width_px, channels)
+    # width = 720 * 72 / 72 = 720.0
+    # height = 1080 * 72 / 72 = 1080.0
+    page = build_page_structure_from_words(
+        0,
+        [],
+        page_width=0.0,
+        page_height=0.0,
+        dpi=72,
+        image_shape=(1080, 720, 3),
+    )
+    assert page.width == 720.0
+    assert page.height == 1080.0
+
+
+def test_page_dimensions_from_image_shape_with_higher_dpi():
+    """Verify the DPI scaling formula: page_pt = px * 72 / dpi."""
+    # 1440 px wide at 144 DPI => 1440 * 72 / 144 = 720 pt; 2160 px high => 2160 * 72 / 144 = 1080 pt
+    page = build_page_structure_from_words(
+        0,
+        [],
+        page_width=0.0,
+        page_height=0.0,
+        dpi=144,
+        image_shape=(2160, 1440, 3),
+    )
+    assert page.width == 720.0
+    assert page.height == 1080.0
+
+
+# ---------------------------------------------------------------------------
+# 11. Drop empty lines
+# ---------------------------------------------------------------------------
+
+
+def test_drop_empty_lines():
+    """Empty words after normalization are dropped when drop_empty_lines=True."""
+    config = StructureExtractionConfig(drop_empty_lines=True, strip_line_edges=True)
+    words = [
+        _make_word_tuple("   ", 10.0, 100.0, 60.0, 112.0),   # becomes empty after strip
+        _make_word_tuple("real", 10.0, 200.0, 60.0, 212.0),
+    ]
+    page = build_page_structure_from_words(
+        0, words, config=config, page_width=612.0, page_height=792.0
+    )
+
+    assert len(page.blocks) == 1
+    assert page.blocks[0].lines[0].text == "real"
+
+
+def test_keep_empty_lines_when_disabled():
+    """When drop_empty_lines=False, whitespace-only words still produce lines."""
+    config = StructureExtractionConfig(
+        drop_empty_lines=False,
+        collapse_whitespace=False,
+        strip_line_edges=False,
+    )
+    words = [
+        _make_word_tuple("   ", 10.0, 100.0, 60.0, 112.0),
+        _make_word_tuple("real", 10.0, 200.0, 60.0, 212.0),
+    ]
+    page = build_page_structure_from_words(
+        0, words, config=config, page_width=612.0, page_height=792.0
+    )
+
+    assert len(page.blocks) == 2
+
+
+# ---------------------------------------------------------------------------
+# 12. Bbox is union of word bboxes
+# ---------------------------------------------------------------------------
+
+
+def test_bbox_is_union_of_word_bboxes():
+    """Line bbox is the union of all word bboxes in that line."""
+    words = [
+        _make_word_tuple("left", 10.0, 100.0, 50.0, 112.0),
+        _make_word_tuple("right", 200.0, 98.0, 260.0, 115.0),
+    ]
+    page = build_page_structure_from_words(
+        0, words, config=StructureExtractionConfig(round_precision=None),
+        page_width=612.0, page_height=792.0,
+    )
+
+    assert len(page.blocks) == 1
+    bbox = page.blocks[0].lines[0].bbox
+    # x0 = min(10.0, 200.0) = 10.0
+    assert bbox[0] == 10.0
+    # y0 = min(100.0, 98.0) = 98.0
+    assert bbox[1] == 98.0
+    # x1 = max(50.0, 260.0) = 260.0
+    assert bbox[2] == 260.0
+    # y1 = max(112.0, 115.0) = 115.0
+    assert bbox[3] == 115.0
+
+
+# ---------------------------------------------------------------------------
+# 13. Round precision applied
+# ---------------------------------------------------------------------------
+
+
+def test_round_precision_applied():
+    """Bboxes are rounded per config.round_precision."""
+    words = [
+        _make_word_tuple("word", 10.12345, 100.6789, 50.99999, 112.11111),
+    ]
+    config = StructureExtractionConfig(round_precision=2)
+    page = build_page_structure_from_words(
+        0, words, config=config, page_width=612.0, page_height=792.0,
+    )
+
+    bbox = page.blocks[0].lines[0].bbox
+    assert bbox == (10.12, 100.68, 51.0, 112.11)
+
+
+def test_round_precision_none_no_rounding():
+    """When round_precision is None, coordinates are not rounded."""
+    words = [
+        _make_word_tuple("word", 10.12345, 100.6789, 50.99999, 112.11111),
+    ]
+    config = StructureExtractionConfig(round_precision=None)
+    page = build_page_structure_from_words(
+        0, words, config=config, page_width=612.0, page_height=792.0,
+    )
+
+    bbox = page.blocks[0].lines[0].bbox
+    assert bbox == (10.12345, 100.6789, 50.99999, 112.11111)
+
+
+# ---------------------------------------------------------------------------
+# 14. Words sorted left to right within a line
+# ---------------------------------------------------------------------------
+
+
+def test_words_sorted_left_to_right_within_line():
+    """Even if words are added out of order, they come out sorted by x0."""
+    words = [
+        _make_word_tuple("C", 200.0, 100.0, 220.0, 112.0),
+        _make_word_tuple("A", 10.0, 100.0, 30.0, 112.0),
+        _make_word_tuple("B", 100.0, 100.0, 120.0, 112.0),
+    ]
+    page = build_page_structure_from_words(0, words, page_width=612.0, page_height=792.0)
+
+    assert len(page.blocks) == 1
+    assert page.blocks[0].lines[0].text == "A B C"
+
+
+# ---------------------------------------------------------------------------
+# 15. Spatial vs block: same simple text
+# ---------------------------------------------------------------------------
+
+
+def test_spatial_vs_block_same_simple_text():
+    """For a simple single-column document, build_page_structure and
+    build_page_structure_from_words produce the same word sequence when flattened.
+    """
+    # Simulate a simple PDF dict for build_page_structure
+    pdf_dict = {
+        "width": 612.0,
+        "height": 792.0,
+        "blocks": [
+            {
+                "type": 0,
+                "bbox": (10.0, 100.0, 200.0, 145.0),
+                "lines": [
+                    {
+                        "bbox": (10.0, 100.0, 200.0, 112.0),
+                        "spans": [
+                            {"text": "hello world", "font": "Arial", "size": 12.0}
+                        ],
+                    },
+                    {
+                        "bbox": (10.0, 130.0, 200.0, 142.0),
+                        "spans": [
+                            {"text": "foo bar", "font": "Arial", "size": 12.0}
+                        ],
+                    },
+                ],
+            }
+        ],
+    }
+
+    # Simulate equivalent word tuples for build_page_structure_from_words
+    word_tuples = [
+        _make_word_tuple("hello", 10.0, 100.0, 50.0, 112.0, 0, 0, 0),
+        _make_word_tuple("world", 55.0, 100.0, 100.0, 112.0, 0, 0, 1),
+        _make_word_tuple("foo", 10.0, 130.0, 40.0, 142.0, 0, 1, 0),
+        _make_word_tuple("bar", 45.0, 130.0, 80.0, 142.0, 0, 1, 1),
+    ]
+
+    config = StructureExtractionConfig()
+    page_block = build_page_structure(0, pdf_dict, config=config)
+    page_spatial = build_page_structure_from_words(
+        0, word_tuples, config=config, page_width=612.0, page_height=792.0,
+    )
+
+    # Extract words from both
+    def _extract_words(page):
+        words = []
+        for block in page.blocks:
+            for line in block.lines:
+                words.extend(line.text.split())
+        return words
+
+    block_words = _extract_words(page_block)
+    spatial_words = _extract_words(page_spatial)
+    assert block_words == spatial_words
+
+
+# ---------------------------------------------------------------------------
+# 16. Integration with flatten_document_words
+# ---------------------------------------------------------------------------
+
+
+def test_integration_with_flatten_document_words():
+    """Build a DocumentStructure from spatial pages and verify flatten_document_words works."""
+    words_page1 = [
+        _make_word_tuple("page", 10.0, 100.0, 50.0, 112.0),
+        _make_word_tuple("one", 55.0, 100.0, 90.0, 112.0),
+    ]
+    words_page2 = [
+        _make_word_tuple("page", 10.0, 100.0, 50.0, 112.0),
+        _make_word_tuple("two", 55.0, 100.0, 90.0, 112.0),
+    ]
+
+    config = StructureExtractionConfig()
+    page1 = build_page_structure_from_words(
+        0, words_page1, config=config, page_width=612.0, page_height=792.0,
+    )
+    page2 = build_page_structure_from_words(
+        1, words_page2, config=config, page_width=612.0, page_height=792.0,
+    )
+
+    doc = DocumentStructure(pages=[page1, page2], config=config)
+
+    flat_words, tokens = flatten_document_words(doc)
+    assert flat_words == ["page", "one", "page", "two"]
+    assert len(tokens) == 4
+    assert tokens[0].source_page == 0
+    assert tokens[2].source_page == 1
+    assert tokens[0].word_index == 0
+    assert tokens[3].word_index == 3
+
+
+# ---------------------------------------------------------------------------
+# 17. Character replacements applied
+# ---------------------------------------------------------------------------
+
+
+def test_character_replacements_applied():
+    """Character replacements are applied to word text during normalization."""
+    config = StructureExtractionConfig(
+        character_replacements={"\u00A0": " ", "\u2013": "-"},
+    )
+    # Non-breaking space within a word, en-dash in another
+    words = [
+        _make_word_tuple("hello\u00A0world", 10.0, 100.0, 100.0, 112.0),
+        _make_word_tuple("2020\u20132021", 110.0, 100.0, 200.0, 112.0),
+    ]
+    page = build_page_structure_from_words(
+        0, words, config=config, page_width=612.0, page_height=792.0,
+    )
+
+    assert len(page.blocks) == 1
+    line_text = page.blocks[0].lines[0].text
+    # The NBSP is replaced by a plain space and the en-dash by a hyphen.
+    # The exact joined line text depends on how whitespace is collapsed and
+    # words are joined, so only absence/substring checks are asserted here.
+    assert "\u00A0" not in line_text
+    assert "\u2013" not in line_text
+    assert "2020-2021" in line_text
+
+
+# ---------------------------------------------------------------------------
+# Additional edge cases
+# ---------------------------------------------------------------------------
+
+
+def test_block_index_and_line_index_increment():
+    """Block index and global line index are sequential."""
+    words = [
+        _make_word_tuple("line1", 10.0, 100.0, 60.0, 112.0),
+        _make_word_tuple("line2", 10.0, 200.0, 60.0, 212.0),
+        _make_word_tuple("line3", 10.0, 300.0, 60.0, 312.0),
+    ]
+    page = build_page_structure_from_words(0, words, page_width=612.0, page_height=792.0)
+
+    assert len(page.blocks) == 3
+    for i, block in enumerate(page.blocks):
+        assert block.index == i
+        assert block.lines[0].index == i
+
+
+def test_page_number_is_preserved():
+    """The page_number argument is stored in the result."""
+    page = build_page_structure_from_words(42, [], page_width=612.0, page_height=792.0)
+    assert page.page_number == 42
+
+
+def test_block_bbox_equals_line_bbox():
+    """Since each block has exactly one line, the block bbox should match the line bbox."""
+    words = [
+        _make_word_tuple("hello", 10.0, 100.0, 50.0, 112.0),
+        _make_word_tuple("world", 55.0, 100.0, 100.0, 112.0),
+    ]
+    page = build_page_structure_from_words(0, words, page_width=612.0, page_height=792.0)
+
+    assert len(page.blocks) == 1
+    assert page.blocks[0].bbox == page.blocks[0].lines[0].bbox
+
+
+def test_line_count_property():
+    """PageStructure.line_count aggregates across all blocks."""
+    words = [
+        _make_word_tuple("a", 10.0, 100.0, 30.0, 112.0),
+        _make_word_tuple("b", 10.0, 200.0, 30.0, 212.0),
+        _make_word_tuple("c", 10.0, 300.0, 30.0, 312.0),
+    ]
+    page = build_page_structure_from_words(0, words, page_width=612.0, page_height=792.0)
+
+    assert page.line_count == 3
+
+
+def test_spans_contain_full_line_text():
+    """Each line has exactly one span whose text matches the line text."""
+    words = [
+        _make_word_tuple("alpha", 10.0, 100.0, 60.0, 112.0),
+        _make_word_tuple("beta", 70.0, 100.0, 120.0, 112.0),
+    ]
+    page = build_page_structure_from_words(0, words, page_width=612.0, page_height=792.0)
+
+    line = page.blocks[0].lines[0]
+    assert len(line.spans) == 1
+    assert line.spans[0].text == line.text
+    assert line.spans[0].font is None
+    assert line.spans[0].size == 0.0
+
+
+def test_fonts_set_is_empty():
+    """Spatial word extraction does not have font info, so fonts set is empty."""
+    words = [_make_word_tuple("test", 10.0, 100.0, 50.0, 112.0)]
+    page = build_page_structure_from_words(0, words, page_width=612.0, page_height=792.0)
+
+    assert page.blocks[0].lines[0].fonts == set()
+
+
+def test_whitespace_replacement_used():
+    """The whitespace_replacement from config is used to join words."""
+    config = StructureExtractionConfig(whitespace_replacement="|")
+    words = [
+        _make_word_tuple("a", 10.0, 100.0, 30.0, 112.0),
+        _make_word_tuple("b", 40.0, 100.0, 60.0, 112.0),
+    ]
+    page = build_page_structure_from_words(
+        0, words, config=config, page_width=612.0, page_height=792.0,
+    )
+
+    assert page.blocks[0].lines[0].text == "a|b"
+
+
+def test_default_config_used_when_none():
+    """When config is None, a default StructureExtractionConfig is used."""
+    words = [_make_word_tuple("hello", 10.0, 100.0, 50.0, 112.0)]
+    page = build_page_structure_from_words(0, words, config=None, page_width=612.0, page_height=792.0)
+
+    assert len(page.blocks) == 1
+    assert page.blocks[0].lines[0].text == "hello"
diff --git a/utest/test_structure_report.py b/utest/test_structure_report.py
new file mode 100644
index 0000000..0b8ef8e
--- /dev/null
+++ b/utest/test_structure_report.py
@@ -0,0 +1,875 @@
+"""Comprehensive unit tests for DocTest.StructureReportBuilder (ADR-003).
+
+Tests cover:
+  - Passing results returning empty strings
+  - Single difference types (missing, extra, mismatch, geometry)
+  - Hunk grouping (adjacent, separated, merge boundary)
+  - Context lines with/without reference_texts
+  - Summary statistics
+  - Document-level and word-level differences
+  - Text truncation
+  - HTML escaping (XSS safety)
+  - Large results with hunk collapse
+  - Metadata rendering
+  - Plain-text report structure
+  - Internal helpers (_classify_diff_type, _group_into_hunks, _escape, _truncate)
+"""
+
+import pytest
+
+from DocTest.PdfStructureComparator import (
+    DocumentTextDifference,
+    DocumentWordDifference,
+    LineDifference,
+    StructureComparisonResult,
+)
+from DocTest.StructureReportBuilder import (
+    MAX_HUNKS_BEFORE_COLLAPSE,
+    MAX_TEXT_DISPLAY_LENGTH,
+    ReportMetadata,
+    ReportSummary,
+    _classify_diff_type,
+    _collect_all_diffs,
+    _compute_summary,
+    _escape,
+    _group_into_hunks,
+    _truncate,
+    build_structure_report,
+    build_structure_report_plain_text,
+)
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+
+def _make_passing_result():
+    """Return a StructureComparisonResult with passed=True."""
+    return StructureComparisonResult()
+
+
+def _make_result_with_page_diffs(diffs, page=1):
+    """Return a StructureComparisonResult with the given LineDifferences added (``page`` is unused)."""
+    result = StructureComparisonResult()
+    for d in diffs:
+        result.add_difference(d)
+    return result
+
+
+def _make_line_diff(diff_type, *, page=1, ref_text=None, cand_text=None,
+                    deltas=None, reference_index=None, candidate_index=None,
+                    message=None):
+    """Convenience factory for LineDifference."""
+    if message is None:
+        message = f"Synthetic {diff_type}"
+    return LineDifference(
+        page=page,
+        diff_type=diff_type,
+        message=message,
+        ref_text=ref_text,
+        cand_text=cand_text,
+        deltas=deltas,
+        reference_index=reference_index,
+        candidate_index=candidate_index,
+    )
+
+
+# ===========================================================================
+# 1 & 2 - Passing result returns empty string
+# ===========================================================================
+
+
+class TestPassingResult:
+
+    def test_html_report_empty_for_passing_result(self):
+        result = _make_passing_result()
+        assert result.passed is True
+        html = build_structure_report(result)
+        assert html == ""
+
+    def test_plain_report_empty_for_passing_result(self):
+        result = _make_passing_result()
+        plain = build_structure_report_plain_text(result)
+        assert plain == ""
+
+
+# ===========================================================================
+# 3-6 - Single differences
+# ===========================================================================
+
+
+class TestSingleDifferences:
+
+    def test_html_report_single_missing_line(self):
+        diff = _make_line_diff(
+            "missing_line",
+            ref_text="vanished line",
+            reference_index=0,
+        )
+        result = _make_result_with_page_diffs([diff])
+        html = build_structure_report(result)
+
+        assert "#f8d7da" in html, "Missing line should use red background #f8d7da"
+        assert "<b>-</b>" in html, "Missing line should display '-' symbol"
+        assert "vanished line" in html
+
+    def test_html_report_single_extra_line(self):
+        diff = _make_line_diff(
+            "extra_line",
+            cand_text="new line appeared",
+            candidate_index=0,
+        )
+        result = _make_result_with_page_diffs([diff])
+        html = build_structure_report(result)
+
+        assert "#d4edda" in html, "Extra line should use green background #d4edda"
+        assert "<b>+</b>" in html, "Extra line should display '+' symbol"
+        assert "new line appeared" in html
+
+    def test_html_report_single_text_mismatch(self):
+        diff = _make_line_diff(
+            "text_mismatch",
+            ref_text="foo",
+            cand_text="bar",
+            reference_index=0,
+            candidate_index=0,
+        )
+        result = _make_result_with_page_diffs([diff])
+        html = build_structure_report(result)
+
+        assert "#fff3cd" in html, "Text mismatch should use yellow background #fff3cd"
+        assert "ref:" in html, "Text mismatch should show 'ref:' label"
+        assert "cand:" in html, "Text mismatch should show 'cand:' label"
+        assert "foo" in html
+        assert "bar" in html
+
+    def test_html_report_single_geometry_mismatch(self):
+        diff = _make_line_diff(
+            "geometry_mismatch",
+            ref_text="shifted text",
+            deltas={"left": 5.0},
+            reference_index=0,
+        )
+        result = _make_result_with_page_diffs([diff])
+        html = build_structure_report(result)
+
+        assert "#e2e3e5" in html, "Geometry mismatch should use grey background #e2e3e5"
+        # The delta symbol (U+0394) may be emitted raw or as an HTML entity
+        assert "\u0394" in html or "&#916;" in html or "&Delta;" in html, \
+            "Geometry mismatch should display delta symbol"
+
+
+# ===========================================================================
+# 7-9 - Grouping / Hunks
+# ===========================================================================
+
+
+class TestHunkGrouping:
+
+    def test_adjacent_diffs_grouped_into_one_hunk(self):
+        """5 consecutive LineDifferences at indices 10-14 produce 1 hunk."""
+        diffs = [
+            _make_line_diff("missing_line", ref_text=f"line {i}",
+                            reference_index=i)
+            for i in range(10, 15)
+        ]
+        result = _make_result_with_page_diffs(diffs)
+        html = build_structure_report(result)
+
+        assert "Hunk 1" in html
+        assert "Hunk 2" not in html
+
+    def test_separated_diffs_produce_separate_hunks(self):
+        """Diffs at reference index 5 and candidate index 50 produce two separate hunks."""
+        diff_a = _make_line_diff("missing_line", ref_text="early",
+                                 reference_index=5)
+        diff_b = _make_line_diff("extra_line", cand_text="late",
+                                 candidate_index=50)
+        result = _make_result_with_page_diffs([diff_a, diff_b])
+        html = build_structure_report(result)
+
+        assert "Hunk 1" in html
+        assert "Hunk 2" in html
+
+    def test_gap_at_merge_boundary(self):
+        """context_lines=3: merge_threshold = 2*3+1 = 7.
+
+        Diffs at index 10 and 17 (gap=7) -> merged into 1 hunk.
+        Diffs at index 10 and 18 (gap=8) -> 2 separate hunks.
+        """
+        # Gap = 7 => 1 hunk
+        d1 = _make_line_diff("missing_line", ref_text="a", reference_index=10)
+        d2 = _make_line_diff("missing_line", ref_text="b", reference_index=17)
+        result_merged = _make_result_with_page_diffs([d1, d2])
+        html_merged = build_structure_report(result_merged, context_lines=3)
+        assert "Hunk 1" in html_merged
+        assert "Hunk 2" not in html_merged
+
+        # Gap = 8 => 2 hunks
+        d3 = _make_line_diff("missing_line", ref_text="a", reference_index=10)
+        d4 = _make_line_diff("missing_line", ref_text="b", reference_index=18)
+        result_split = _make_result_with_page_diffs([d3, d4])
+        html_split = build_structure_report(result_split, context_lines=3)
+        assert "Hunk 1" in html_split
+        assert "Hunk 2" in html_split
+
+
+# ===========================================================================
+# 10-11 - Context
+# ===========================================================================
+
+
+class TestContext:
+
+    def test_context_shown_when_texts_provided(self):
+        """When reference_texts is provided, context words appear in HTML."""
+        ref_texts = [f"word_{i}" for i in range(20)]
+        diff = _make_line_diff("missing_line", ref_text="word_10",
+                               reference_index=10)
+        result = _make_result_with_page_diffs([diff])
+        html = build_structure_report(result, reference_texts=ref_texts,
+                                      context_lines=3)
+
+        # Context before should include words near index 10
+        assert "word_7" in html or "word_8" in html or "word_9" in html, \
+            "Context before the diff should be visible"
+        # Context after
+        assert "word_11" in html or "word_12" in html or "word_13" in html, \
+            "Context after the diff should be visible"
+
+    def test_no_context_when_texts_not_provided(self):
+        """Without reference_texts, no context divs with '...' appear."""
+        diff = _make_line_diff("missing_line", ref_text="gone",
+                               reference_index=10)
+        result = _make_result_with_page_diffs([diff])
+        html = build_structure_report(result, reference_texts=None)
+
+        # Check for the "color:#999" style (presumably the context-row styling)
+        # rather than for "...": an ellipsis could also come from truncation,
+        # so it would not reliably indicate a context div.
+        assert "color:#999" not in html
+
+
+# ===========================================================================
+# 12-13 - Summary statistics
+# ===========================================================================
+
+
+class TestSummaryStatistics:
+
+    def test_summary_counts_correct(self):
+        """Mix of diff types yields correct ReportSummary counts."""
+        result = StructureComparisonResult()
+        result.add_difference(_make_line_diff("missing_line", ref_text="a",
+                                              reference_index=0))
+        result.add_difference(_make_line_diff("missing_line", ref_text="b",
+                                              reference_index=1))
+        result.add_difference(_make_line_diff("extra_line", cand_text="c",
+                                              candidate_index=2))
+        result.add_difference(_make_line_diff("text_mismatch", ref_text="d",
+                                              cand_text="e",
+                                              reference_index=3))
+        result.add_difference(_make_line_diff("geometry_mismatch",
+                                              ref_text="f",
+                                              deltas={"left": 1.0},
+                                              reference_index=4))
+
+        summary = _compute_summary(result)
+
+        assert summary.missing_count == 2
+        assert summary.extra_count == 1
+        assert summary.mismatch_count == 1
+        assert summary.geometry_count == 1
+        assert summary.other_count == 0
+        assert summary.total_differences == 5
+
+    def test_summary_includes_word_diffs(self):
+        """Word differences are counted in summary statistics."""
+        result = StructureComparisonResult()
+        result.add_word_difference(DocumentWordDifference(
+            diff_type="missing_words",
+            message="words gone",
+            ref_words=["hello"],
+            ref_start_index=0,
+            ref_end_index=1,
+        ))
+        result.add_word_difference(DocumentWordDifference(
+            diff_type="extra_words",
+            message="words added",
+            cand_words=["world"],
+            cand_start_index=0,
+            cand_end_index=1,
+        ))
+        result.add_word_difference(DocumentWordDifference(
+            diff_type="word_mismatch",
+            message="words changed",
+            ref_words=["old"],
+            cand_words=["new"],
+            ref_start_index=5,
+            ref_end_index=6,
+            cand_start_index=5,
+            cand_end_index=6,
+        ))
+
+        summary = _compute_summary(result)
+
+        assert summary.missing_count == 1
+        assert summary.extra_count == 1
+        assert summary.mismatch_count == 1
+        assert summary.total_differences == 3
+
+
+# ===========================================================================
+# 14-15 - Document-level and word-level
+# ===========================================================================
+
+
+class TestDocumentAndWordLevel:
+
+    def test_document_level_diffs_in_report(self):
+        """DocumentTextDifference items produce 'Document (text-only)' section."""
+        result = StructureComparisonResult()
+        result.add_document_difference(DocumentTextDifference(
+            diff_type="missing_text",
+            message="Text missing: hello",
+            ref_text="hello",
+            ref_index=0,
+        ))
+
+        html = build_structure_report(result)
+        assert "Document (text-only)" in html
+
+    def test_word_level_diffs_in_report(self):
+        """DocumentWordDifference items produce 'Document (word-level)' section."""
+        result = StructureComparisonResult()
+        result.add_word_difference(DocumentWordDifference(
+            diff_type="word_mismatch",
+            message="Word changed",
+            ref_words=["alpha"],
+            cand_words=["beta"],
+            ref_start_index=0,
+            ref_end_index=1,
+            cand_start_index=0,
+            cand_end_index=1,
+        ))
+
+        html = build_structure_report(result)
+        assert "Document (word-level)" in html
+
+
+# ===========================================================================
+# 16 - Truncation
+# ===========================================================================
+
+
+class TestTruncation:
+
+    def test_long_text_truncated(self):
+        """Diff with 500-char ref_text is truncated in HTML output."""
+        long_text = "x" * 500
+        diff = _make_line_diff("missing_line", ref_text=long_text,
+                               reference_index=0)
+        result = _make_result_with_page_diffs([diff])
+        html = build_structure_report(result)
+
+        # The full 500-char text should NOT appear in the report
+        assert long_text not in html
+        # Instead, the truncated version ending in "..." should appear
+        assert "..." in html
+        # The displayed text is the first MAX_TEXT_DISPLAY_LENGTH - 3 characters
+        # of the original followed by "..." (MAX_TEXT_DISPLAY_LENGTH in total)
+        truncated = long_text[:MAX_TEXT_DISPLAY_LENGTH - 3] + "..."
+        assert _escape(truncated) in html
+
+
+# ===========================================================================
+# 17 - HTML safety
+# ===========================================================================
+
+
+class TestHTMLSafety:
+
+    def test_html_special_chars_escaped(self):
+        """XSS payload in diff text is escaped, not rendered raw."""
+        xss = "<script>alert('xss')</script>"
+        diff = _make_line_diff("missing_line", ref_text=xss,
+                               reference_index=0)
+        result = _make_result_with_page_diffs([diff])
+        html = build_structure_report(result)
+
+        assert "<script>" not in html, "Raw <script> tag must not appear"
+        assert "&lt;script&gt;" in html, "Escaped script tag should appear"
+
+
+# ===========================================================================
+# 18 - Large results
+# ===========================================================================
+
+
+class TestLargeResults:
+
+    def test_many_hunks_collapsed(self):
+        """More than MAX_HUNKS_BEFORE_COLLAPSE widely separated diffs trigger a collapse notice."""
+        # Create diffs at indices 0, 1000, 2000, ..., separated enough to be
+        # distinct hunks.  We need > MAX_HUNKS_BEFORE_COLLAPSE hunks.
+        count = MAX_HUNKS_BEFORE_COLLAPSE + 10
+        diffs = [
+            _make_line_diff("missing_line", ref_text=f"line_{i}",
+                            reference_index=i * 1000)
+            for i in range(count)
+        ]
+        result = _make_result_with_page_diffs(diffs)
+        html = build_structure_report(result)
+
+        assert "more hunk" in html.lower(), \
+            "Report should mention collapsed hunks when count exceeds limit"
+
+
+# ===========================================================================
+# 19-21 - Metadata
+# ===========================================================================
+
+
+class TestMetadata:
+
+    def test_metadata_in_html_report(self):
+        """Provided ReportMetadata appears in the HTML report."""
+        meta = ReportMetadata(
+            reference_name="ref.pdf",
+            candidate_name="cand.pdf",
+            comparison_mode="structure",
+        )
+        diff = _make_line_diff("missing_line", ref_text="a",
+                               reference_index=0)
+        result = _make_result_with_page_diffs([diff])
+        html = build_structure_report(result, metadata=meta)
+
+        assert "ref.pdf" in html
+        assert "cand.pdf" in html
+        assert "structure" in html
+
+    def test_metadata_with_exclusions(self):
+        """ReportMetadata with exclusions lists them in the report."""
+        meta = ReportMetadata(
+            reference_name="r.pdf",
+            candidate_name="c.pdf",
+            comparison_mode="full",
+            exclusions_applied=["header", "footer"],
+        )
+        diff = _make_line_diff("extra_line", cand_text="x",
+                               candidate_index=0)
+        result = _make_result_with_page_diffs([diff])
+        html = build_structure_report(result, metadata=meta)
+
+        assert "header" in html
+        assert "footer" in html
+        assert "Exclusions" in html
+
+    def test_no_metadata_still_works(self):
+        """metadata=None does not cause errors and still generates a report."""
+        diff = _make_line_diff("missing_line", ref_text="a",
+                               reference_index=0)
+        result = _make_result_with_page_diffs([diff])
+        html = build_structure_report(result, metadata=None)
+
+        assert len(html) > 0
+        assert "PDF Structure Comparison Report" in html
+
+
+# ===========================================================================
+# 22-23 - Plain text report
+# ===========================================================================
+
+
+class TestPlainTextReport:
+
+    def test_plain_text_has_structure(self):
+        """Plain text report contains section delimiters."""
+        diff = _make_line_diff("missing_line", ref_text="gone",
+                               reference_index=0)
+        result = _make_result_with_page_diffs([diff])
+        plain = build_structure_report_plain_text(result)
+
+        assert "=" * 60 in plain, "Plain text should have '===' section headers"
+        assert "-" * 60 in plain, "Plain text should have '---' separators"
+        assert "PDF Structure Comparison Report" in plain
+
+    def test_plain_text_has_diff_symbols(self):
+        """Plain text report uses -, +, ~ symbols for diff categories."""
+        result = StructureComparisonResult()
+        result.add_difference(_make_line_diff(
+            "missing_line", ref_text="removed", reference_index=0))
+        result.add_difference(_make_line_diff(
+            "extra_line", cand_text="added", candidate_index=1))
+        result.add_difference(_make_line_diff(
+            "text_mismatch", ref_text="old", cand_text="new",
+            reference_index=2, candidate_index=2))
+
+        plain = build_structure_report_plain_text(result)
+
+        # Check for the diff marker symbols
+        assert '  - "removed"' in plain, "Missing line should use '-' symbol"
+        assert '  + "added"' in plain, "Extra line should use '+' symbol"
+        assert '  ~ ref:' in plain, "Mismatch should use '~' symbol"
+
+
+# ===========================================================================
+# 24 - _classify_diff_type helper
+# ===========================================================================
+
+
+class TestClassifyDiffType:
+
+    @pytest.mark.parametrize("diff_type,expected", [
+        ("missing_line", "missing"),
+        ("missing_text", "missing"),
+        ("missing_page", "missing"),
+        ("missing_words", "missing"),
+        ("extra_line", "extra"),
+        ("extra_text", "extra"),
+        ("extra_page", "extra"),
+        ("extra_words", "extra"),
+        ("text_mismatch", "mismatch"),
+        ("word_mismatch", "mismatch"),
+        ("geometry_mismatch", "geometry"),
+        ("block_count_mismatch", "other"),
+        ("unknown_type", "other"),
+    ])
+    def test_classify_diff_types(self, diff_type, expected):
+        assert _classify_diff_type(diff_type) == expected
+
+
+# ===========================================================================
+# 25-26 - _group_into_hunks helper
+# ===========================================================================
+
+
+class TestGroupIntoHunks:
+
+    def test_group_into_hunks_empty(self):
+        """Empty list of differences produces empty hunk list."""
+        hunks = _group_into_hunks([], context_lines=3)
+        assert hunks == []
+
+    def test_group_into_hunks_single(self):
+        """Single difference produces exactly one hunk."""
+        diff = _make_line_diff("missing_line", ref_text="solo",
+                               reference_index=5)
+        hunks = _group_into_hunks([diff], context_lines=3)
+
+        assert len(hunks) == 1
+        assert hunks[0]["start_index"] == 5
+        assert hunks[0]["end_index"] == 5
+        assert len(hunks[0]["differences"]) == 1
+
+    def test_group_into_hunks_with_source_texts(self):
+        """Hunks include context_before and context_after when source_texts given."""
+        texts = [f"line_{i}" for i in range(20)]
+        diff = _make_line_diff("missing_line", ref_text="line_10",
+                               reference_index=10)
+        hunks = _group_into_hunks([diff], context_lines=2, source_texts=texts)
+
+        assert len(hunks) == 1
+        # context_before: texts[8:10] = ["line_8", "line_9"]
+        assert hunks[0]["context_before"] == ["line_8", "line_9"]
+        # context_after: texts[11:13] = ["line_11", "line_12"]
+        assert hunks[0]["context_after"] == ["line_11", "line_12"]
+
+    def test_group_into_hunks_multiple_merged(self):
+        """Adjacent diffs within merge_threshold form a single hunk."""
+        diffs = [
+            _make_line_diff("missing_line", ref_text="a", reference_index=10),
+            _make_line_diff("missing_line", ref_text="b", reference_index=11),
+            _make_line_diff("missing_line", ref_text="c", reference_index=12),
+        ]
+        hunks = _group_into_hunks(diffs, context_lines=3)
+        assert len(hunks) == 1
+
+    def test_group_into_hunks_multiple_separated(self):
+        """Diffs far apart form separate hunks."""
+        diffs = [
+            _make_line_diff("missing_line", ref_text="a", reference_index=0),
+            _make_line_diff("missing_line", ref_text="b", reference_index=100),
+        ]
+        hunks = _group_into_hunks(diffs, context_lines=3)
+        assert len(hunks) == 2
+        assert hunks[0]["start_index"] == 0
+        assert hunks[1]["start_index"] == 100
+
+
+# ===========================================================================
+# Additional edge cases for internal helpers
+# ===========================================================================
+
+
+class TestEscapeHelper:
+
+    def test_escape_ampersand(self):
+        assert _escape("a & b") == "a &amp; b"
+
+    def test_escape_angle_brackets(self):
+        assert _escape("<div>") == "&lt;div&gt;"
+
+    def test_escape_quotes(self):
+        assert _escape('"hello"') == "&quot;hello&quot;"
+
+    def test_escape_non_string(self):
+        """_escape should handle non-string input via str() conversion."""
+        assert _escape(42) == "42"
+
+
+class TestTruncateHelper:
+
+    def test_truncate_short_text_unchanged(self):
+        text = "short"
+        assert _truncate(text) == text
+
+    def test_truncate_exact_boundary(self):
+        text = "x" * MAX_TEXT_DISPLAY_LENGTH
+        assert _truncate(text) == text
+        assert "..." not in _truncate(text)
+
+    def test_truncate_one_over_boundary(self):
+        text = "x" * (MAX_TEXT_DISPLAY_LENGTH + 1)
+        truncated = _truncate(text)
+        assert truncated.endswith("...")
+        assert len(truncated) == MAX_TEXT_DISPLAY_LENGTH
+
+    def test_truncate_custom_max_length(self):
+        text = "abcdefghij"  # 10 chars
+        truncated = _truncate(text, max_length=7)
+        assert truncated == "abcd..."
+        assert len(truncated) == 7
+
+
+# ===========================================================================
+# Overview table
+# ===========================================================================
+
+
+class TestOverviewTable:
+
+    def test_table_present_in_html(self):
+        """HTML report contains an overview table with columns."""
+        diff = _make_line_diff("missing_line", ref_text="gone",
+                               reference_index=5)
+        result = _make_result_with_page_diffs([diff])
+        html = build_structure_report(result)
+
+        assert "<table" in html
+        assert "Type" in html
+        assert "Reference" in html
+        assert "Candidate" in html
+        assert "Location" in html
+
+    def test_table_rows_match_diff_count(self):
+        """Table contains one row per difference plus a header row."""
+        result = StructureComparisonResult()
+        result.add_difference(_make_line_diff(
+            "missing_line", ref_text="a", reference_index=0))
+        result.add_difference(_make_line_diff(
+            "extra_line", cand_text="b", candidate_index=1))
+        html = build_structure_report(result)
+        # Two data rows + one header row = 3 total <tr> elements
+        assert html.count("<tr") == 3
+
+    def test_table_shows_location(self):
+        """Table location column contains page and line info."""
+        diff = _make_line_diff("text_mismatch", ref_text="old", cand_text="new",
+                               reference_index=7, candidate_index=7, page=2)
+        result = _make_result_with_page_diffs([diff], page=2)
+        html = build_structure_report(result)
+        assert "Page 2" in html
+        assert "line 7" in html
+
+    def test_collect_all_diffs_word_level_uses_word_label(self):
+        """Word-level diffs use 'word N' location label."""
+        result = StructureComparisonResult()
+        result.add_word_difference(DocumentWordDifference(
+            diff_type="word_mismatch",
+            message="changed",
+            ref_words=["old"],
+            cand_words=["new"],
+            ref_start_index=42,
+            ref_end_index=43,
+            cand_start_index=42,
+            cand_end_index=43,
+        ))
+        items = _collect_all_diffs(result)
+        assert len(items) == 1
+        assert items[0][1] == "word 42"
+
+
+# ===========================================================================
+# Zero-count badges hidden
+# ===========================================================================
+
+
+class TestZeroBadgesSuppressed:
+
+    def test_zero_missing_badge_hidden(self):
+        """When there are no missing diffs, '0 missing' badge is hidden."""
+        diff = _make_line_diff("text_mismatch", ref_text="old", cand_text="new",
+                               reference_index=0, candidate_index=0)
+        result = _make_result_with_page_diffs([diff])
+        html = build_structure_report(result)
+        assert "0 missing" not in html
+
+    def test_zero_extra_badge_hidden(self):
+        """When there are no extra diffs, '0 extra' badge is hidden."""
+        diff = _make_line_diff("missing_line", ref_text="gone",
+                               reference_index=0)
+        result = _make_result_with_page_diffs([diff])
+        html = build_structure_report(result)
+        assert "0 extra" not in html
+
+    def test_nonzero_badges_shown(self):
+        """Non-zero count badges are shown."""
+        diff = _make_line_diff("missing_line", ref_text="gone",
+                               reference_index=0)
+        result = _make_result_with_page_diffs([diff])
+        html = build_structure_report(result)
+        assert "1 missing" in html
+
+
+# ===========================================================================
+# Word index label
+# ===========================================================================
+
+
+class TestWordIndexLabel:
+
+    def test_word_level_hunk_uses_word_label(self):
+        """Word-level hunks use 'word N' instead of 'line N'."""
+        result = StructureComparisonResult()
+        result.add_word_difference(DocumentWordDifference(
+            diff_type="word_mismatch",
+            message="changed",
+            ref_words=["old"],
+            cand_words=["new"],
+            ref_start_index=15,
+            ref_end_index=16,
+            cand_start_index=15,
+            cand_end_index=16,
+        ))
+        html = build_structure_report(result)
+        assert "word 15" in html
+        assert "line 15" not in html
+
+    def test_page_level_hunk_uses_line_label(self):
+        """Page-level hunks use 'line N' label."""
+        diff = _make_line_diff("missing_line", ref_text="gone",
+                               reference_index=5)
+        result = _make_result_with_page_diffs([diff])
+        html = build_structure_report(result)
+        assert "line 5" in html
+
+
+# ===========================================================================
+# Context separator
+# ===========================================================================
+
+
+class TestContextSeparator:
+
+    def test_context_uses_pipe_separator(self):
+        """Context items are separated by ' | ' not just spaces."""
+        ref_texts = [f"word_{i}" for i in range(20)]
+        diff = _make_line_diff("missing_line", ref_text="word_10",
+                               reference_index=10)
+        result = _make_result_with_page_diffs([diff])
+        html = build_structure_report(result, reference_texts=ref_texts,
+                                      context_lines=3)
+        assert " | " in html
+
+
+# ===========================================================================
+# Comprehensive integration-like tests
+# ===========================================================================
+
+
+class TestReportIntegration:
+
+    def test_report_with_all_difference_types(self):
+        """Build a report combining page, document, and word differences."""
+        result = StructureComparisonResult()
+
+        # Page-level
+        result.add_difference(_make_line_diff(
+            "missing_line", ref_text="page_missing", reference_index=0))
+        result.add_difference(_make_line_diff(
+            "extra_line", cand_text="page_extra", candidate_index=1))
+        result.add_difference(_make_line_diff(
+            "text_mismatch", ref_text="old", cand_text="new",
+            reference_index=2, candidate_index=2))
+
+        # Document-level
+        result.add_document_difference(DocumentTextDifference(
+            diff_type="missing_text",
+            message="doc text missing",
+            ref_text="doc_line",
+            ref_index=0,
+        ))
+
+        # Word-level
+        result.add_word_difference(DocumentWordDifference(
+            diff_type="word_mismatch",
+            message="word changed",
+            ref_words=["alpha"],
+            cand_words=["beta"],
+            ref_start_index=0,
+            ref_end_index=1,
+            cand_start_index=0,
+            cand_end_index=1,
+        ))
+
+        meta = ReportMetadata(
+            reference_name="ref.pdf",
+            candidate_name="cand.pdf",
+            comparison_mode="full",
+            page_count_ref=3,
+            page_count_cand=3,
+        )
+
+        html = build_structure_report(result, metadata=meta)
+        plain = build_structure_report_plain_text(result, metadata=meta)
+
+        # HTML assertions
+        assert "PDF Structure Comparison Report" in html
+        assert "Page 1" in html
+        assert "Document (text-only)" in html
+        assert "Document (word-level)" in html
+        assert "ref.pdf" in html
+        assert "5" in html  # total differences; NOTE(review): weak - any "5" in the markup passes
+
+        # Plain text assertions
+        assert "PDF Structure Comparison Report" in plain
+        assert "ref.pdf" in plain
+        assert "5 difference(s)" in plain
+
+    def test_multi_page_diffs(self):
+        """Differences on multiple pages appear under separate page headers."""
+        result = StructureComparisonResult()
+        result.add_difference(_make_line_diff(
+            "missing_line", page=1, ref_text="p1", reference_index=0))
+        result.add_difference(_make_line_diff(
+            "extra_line", page=3, cand_text="p3", candidate_index=0))
+
+        html = build_structure_report(result)
+
+        assert "Page 1" in html
+        assert "Page 3" in html
+
+    def test_summary_line_entries_in_report(self):
+        """result.summary entries appear in both the HTML and plain-text reports."""
+        result = StructureComparisonResult()
+        result.add_difference(_make_line_diff(
+            "missing_line", ref_text="x", reference_index=0))
+        result.extend_summary("Page count mismatch: ref=2, cand=3")
+
+        html = build_structure_report(result)
+        assert "Page count mismatch" in html
+
+        plain = build_structure_report_plain_text(result)
+        assert "Page count mismatch" in plain
diff --git a/utest/test_unordered_comparison.py b/utest/test_unordered_comparison.py
new file mode 100644
index 0000000..068747c
--- /dev/null
+++ b/utest/test_unordered_comparison.py
@@ -0,0 +1,353 @@
+"""Unit tests for unordered (bag-of-words) comparison mode in compare_document_words().
+
+Tests cover:
+  - Basic unordered comparison: identical, reordered, missing, extra words
+  - Interaction with normalization flags (ligatures, word boundaries, case)
+  - Reporting: diff_type values, ref_words/cand_words content, message labels
+  - Side-by-side ordered vs unordered behaviour differences
+"""
+
+import pytest
+
+from DocTest.PdfStructureComparator import (
+    DocumentWordDifference,
+    StructureComparisonResult,
+    compare_document_words,
+)
+from DocTest.PdfStructureModels import (
+    DocumentStructure,
+    PageStructure,
+    StructureExtractionConfig,
+    TextBlock,
+    TextLine,
+    TextSpan,
+)
+
+
+# ---------------------------------------------------------------------------
+# Helper
+# ---------------------------------------------------------------------------
+
+
+def _make_structure(pages_data):
+    """Build a DocumentStructure from simplified data.
+
+    pages_data: list of lists of strings. Each inner list is a page,
+    each string is a line of text.
+    """
+    pages = []
+    for page_num, lines in enumerate(pages_data, 1):
+        text_lines = []
+        for idx, text in enumerate(lines):
+            text_lines.append(TextLine(
+                index=idx,
+                text=text,
+                bbox=(0.0, float(idx * 12), 100.0, float(idx * 12 + 12)),
+                fonts=set(),
+                spans=[TextSpan(text=text, font="Arial", size=12.0)],
+            ))
+        block = TextBlock(
+            index=0,
+            bbox=(0.0, 0.0, 100.0, float(len(lines) * 12)),
+            lines=text_lines,
+        )
+        pages.append(PageStructure(
+            page_number=page_num,
+            width=612.0,
+            height=792.0,
+            blocks=[block],
+        ))
+    return DocumentStructure(pages=pages, config=StructureExtractionConfig())
+
+
+# ===========================================================================
+# TestUnorderedComparisonBasic
+# ===========================================================================
+
+
+class TestUnorderedComparisonBasic:
+    """Core unordered comparison behaviour."""
+
+    def test_identical_documents_pass(self):
+        """Identical content in the same order passes with zero differences."""
+        ref = _make_structure([["the quick brown fox"]])
+        cand = _make_structure([["the quick brown fox"]])
+        result = compare_document_words(ref, cand, compare_order="unordered")
+        assert result.passed
+        assert result.word_differences == []
+
+    def test_reordered_lines_pass(self):
+        """Lines in different order, same words -- passes unordered, fails ordered."""
+        ref = _make_structure([["alpha beta", "gamma delta"]])
+        cand = _make_structure([["gamma delta", "alpha beta"]])
+
+        unordered = compare_document_words(ref, cand, compare_order="unordered")
+        assert unordered.passed, "Unordered mode should pass when words are reordered"
+        assert unordered.word_differences == []
+
+        ordered = compare_document_words(ref, cand, compare_order="ordered")
+        assert not ordered.passed, "Ordered mode should fail when lines are swapped"
+        assert len(ordered.word_differences) >= 1
+
+    def test_reordered_across_pages_pass(self):
+        """Words shifted to different pages still pass in unordered mode."""
+        ref = _make_structure([["hello world"], ["foo bar"]])
+        cand = _make_structure([["foo bar"], ["hello world"]])
+
+        result = compare_document_words(ref, cand, compare_order="unordered")
+        assert result.passed
+        assert result.word_differences == []
+
+    def test_missing_word_detected(self):
+        """Reference has a word the candidate lacks -> diff_type='missing_words'."""
+        ref = _make_structure([["alpha beta gamma"]])
+        cand = _make_structure([["alpha gamma"]])
+
+        result = compare_document_words(ref, cand, compare_order="unordered")
+        assert not result.passed
+        missing = [d for d in result.word_differences if d.diff_type == "missing_words"]
+        assert len(missing) == 1
+        assert "beta" in missing[0].ref_words
+
+    def test_extra_word_detected(self):
+        """Candidate has a word the reference lacks -> diff_type='extra_words'."""
+        ref = _make_structure([["alpha gamma"]])
+        cand = _make_structure([["alpha beta gamma"]])
+
+        result = compare_document_words(ref, cand, compare_order="unordered")
+        assert not result.passed
+        extra = [d for d in result.word_differences if d.diff_type == "extra_words"]
+        assert len(extra) == 1
+        assert "beta" in extra[0].cand_words
+
+    def test_both_missing_and_extra(self):
+        """Ref has 'foo' not in cand; cand has 'bar' not in ref -> 2 diffs."""
+        ref = _make_structure([["alpha foo gamma"]])
+        cand = _make_structure([["alpha bar gamma"]])
+
+        result = compare_document_words(ref, cand, compare_order="unordered")
+        assert not result.passed
+
+        missing = [d for d in result.word_differences if d.diff_type == "missing_words"]
+        extra = [d for d in result.word_differences if d.diff_type == "extra_words"]
+        assert len(missing) == 1
+        assert len(extra) == 1
+        assert "foo" in missing[0].ref_words
+        assert "bar" in extra[0].cand_words
+
+    def test_duplicate_word_count_matters(self):
+        """Ref has 'hello hello', cand has 'hello' -> missing_words=['hello']."""
+        ref = _make_structure([["hello hello"]])
+        cand = _make_structure([["hello"]])
+
+        result = compare_document_words(ref, cand, compare_order="unordered")
+        assert not result.passed
+        missing = [d for d in result.word_differences if d.diff_type == "missing_words"]
+        assert len(missing) == 1
+        assert missing[0].ref_words == ["hello"]
+
+    def test_empty_documents_pass(self):
+        """Both documents empty -> passed=True, no differences."""
+        ref = _make_structure([])
+        cand = _make_structure([])
+
+        result = compare_document_words(ref, cand, compare_order="unordered")
+        assert result.passed
+        assert result.word_differences == []
+
+    def test_default_is_ordered(self):
+        """Without compare_order param, uses ordered comparison (same as before).
+
+        Swapped lines should fail under ordered mode (the default).
+        """
+        ref = _make_structure([["alpha beta", "gamma delta"]])
+        cand = _make_structure([["gamma delta", "alpha beta"]])
+
+        result = compare_document_words(ref, cand)
+        assert not result.passed, "Default (ordered) mode should fail on reordered text"
+
+
+# ===========================================================================
+# TestUnorderedWithNormalization
+# ===========================================================================
+
+
+class TestUnorderedWithNormalization:
+    """Unordered mode combined with normalization flags."""
+
+    def test_unordered_with_ligature_normalization(self):
+        """Ligature in ref, ASCII in cand -> passes with normalize_ligatures + unordered."""
+        # \ufb01 = fi ligature
+        ref = _make_structure([["the certi\ufb01cates are valid"]])
+        cand = _make_structure([["the certificates are valid"]])
+
+        result = compare_document_words(
+            ref, cand,
+            compare_order="unordered",
+            normalize_ligatures=True,
+        )
+        assert result.passed
+        assert result.word_differences == []
+
+    def test_unordered_with_word_boundary_normalization(self):
+        """Words split across lines merged before unordered comparison."""
+        # ref splits "path/file" across lines; cand has it on one line
+        ref = _make_structure([["path/", "file here"]])
+        cand = _make_structure([["path/file here"]])
+
+        result = compare_document_words(
+            ref, cand,
+            compare_order="unordered",
+            normalize_word_boundaries=True,
+        )
+        assert result.passed
+        assert result.word_differences == []
+
+    def test_unordered_with_case_insensitive(self):
+        """Case mismatch passes with case_sensitive=False + unordered."""
+        ref = _make_structure([["Hello World"]])
+        cand = _make_structure([["hello world"]])
+
+        result = compare_document_words(
+            ref, cand,
+            compare_order="unordered",
+            case_sensitive=False,
+        )
+        assert result.passed
+        assert result.word_differences == []
+
+    def test_unordered_combined_normalizations(self):
+        """All normalization options together with unordered mode.
+
+        Ref has ligature + split word + different case; cand has ASCII on one line.
+        Reordered across pages to exercise unordered logic.
+        """
+        # Page 1: ligature word ("file-name") split across two lines at the hyphen
+        # Page 2: same words as the candidate's first page, but in different case
+        ref = _make_structure([
+            ["the \ufb01le-", "name is ready"],
+            ["HELLO world"],
+        ])
+        # Candidate: same content, different order, normalized forms
+        cand = _make_structure([
+            ["hello world"],
+            ["the file-name is ready"],
+        ])
+
+        result = compare_document_words(
+            ref, cand,
+            compare_order="unordered",
+            case_sensitive=False,
+            normalize_ligatures=True,
+            normalize_word_boundaries=True,
+        )
+        assert result.passed
+        assert result.word_differences == []
+
+
+# ===========================================================================
+# TestUnorderedReporting
+# ===========================================================================
+
+
+class TestUnorderedReporting:
+    """Verify the content and shape of difference reports in unordered mode."""
+
+    def test_missing_words_report_content(self):
+        """ref_words contains the actual excess words from the reference."""
+        ref = _make_structure([["apple banana cherry"]])
+        cand = _make_structure([["apple cherry"]])
+
+        result = compare_document_words(ref, cand, compare_order="unordered")
+        missing = [d for d in result.word_differences if d.diff_type == "missing_words"]
+        assert len(missing) == 1
+        assert missing[0].ref_words is not None
+        assert "banana" in missing[0].ref_words
+
+    def test_extra_words_report_content(self):
+        """cand_words contains the actual excess words from the candidate."""
+        ref = _make_structure([["apple cherry"]])
+        cand = _make_structure([["apple banana cherry"]])
+
+        result = compare_document_words(ref, cand, compare_order="unordered")
+        extra = [d for d in result.word_differences if d.diff_type == "extra_words"]
+        assert len(extra) == 1
+        assert extra[0].cand_words is not None
+        assert "banana" in extra[0].cand_words
+
+    def test_message_contains_unordered_label(self):
+        """The message string should contain 'unordered' to clarify the mode."""
+        ref = _make_structure([["hello world"]])
+        cand = _make_structure([["hello"]])
+
+        result = compare_document_words(ref, cand, compare_order="unordered")
+        assert len(result.word_differences) >= 1
+        for diff in result.word_differences:
+            assert "unordered" in diff.message.lower(), (
+                f"Expected 'unordered' in message, got: {diff.message!r}"
+            )
+
+    def test_result_type_is_structure_comparison_result(self):
+        """Return type is always StructureComparisonResult."""
+        ref = _make_structure([["hello"]])
+        cand = _make_structure([["hello"]])
+
+        result = compare_document_words(ref, cand, compare_order="unordered")
+        assert isinstance(result, StructureComparisonResult)
+
+
+# ===========================================================================
+# TestUnorderedVsOrdered
+# ===========================================================================
+
+
+class TestUnorderedVsOrdered:
+    """Side-by-side comparison of ordered and unordered modes."""
+
+    def test_reorder_fails_ordered_passes_unordered(self):
+        """Direct comparison: ordered fails, unordered passes for reordered content."""
+        ref = _make_structure([["one two three four"]])
+        cand = _make_structure([["four three two one"]])
+
+        ordered_result = compare_document_words(ref, cand, compare_order="ordered")
+        unordered_result = compare_document_words(ref, cand, compare_order="unordered")
+
+        assert not ordered_result.passed, "Ordered should fail on reversed word order"
+        assert len(ordered_result.word_differences) >= 1
+
+        assert unordered_result.passed, "Unordered should pass when words are the same"
+        assert unordered_result.word_differences == []
+
+    def test_genuine_diff_fails_both_modes(self):
+        """Genuinely different content fails in both ordered and unordered modes."""
+        ref = _make_structure([["the quick brown fox"]])
+        cand = _make_structure([["the slow red cat"]])
+
+        ordered_result = compare_document_words(ref, cand, compare_order="ordered")
+        unordered_result = compare_document_words(ref, cand, compare_order="unordered")
+
+        assert not ordered_result.passed
+        assert not unordered_result.passed
+        assert len(ordered_result.word_differences) >= 1
+        assert len(unordered_result.word_differences) >= 1
+
+    def test_word_count_preservation(self):
+        """Same words but different frequency fails in both modes."""
+        ref = _make_structure([["hello hello world"]])
+        cand = _make_structure([["hello world world"]])
+
+        ordered_result = compare_document_words(ref, cand, compare_order="ordered")
+        unordered_result = compare_document_words(ref, cand, compare_order="unordered")
+
+        assert not ordered_result.passed, "Ordered should detect the word change"
+        assert not unordered_result.passed, "Unordered should detect frequency mismatch"
+
+        # In unordered mode, we expect both missing and extra diffs
+        missing = [d for d in unordered_result.word_differences
+                   if d.diff_type == "missing_words"]
+        extra = [d for d in unordered_result.word_differences
+                 if d.diff_type == "extra_words"]
+        assert len(missing) == 1
+        assert len(extra) == 1
+        assert "hello" in missing[0].ref_words
+        assert "world" in extra[0].cand_words
diff --git a/utest/test_word_normalization.py b/utest/test_word_normalization.py
new file mode 100644
index 0000000..6def5e9
--- /dev/null
+++ b/utest/test_word_normalization.py
@@ -0,0 +1,468 @@
+"""Unit tests for word-level normalization features.
+
+Tests cover:
+  - merge_split_words() in TextNormalization.py
+  - flatten_document_words() with normalize_ligatures_in_words and
+    normalize_word_boundaries keyword parameters
+  - compare_document_words() with normalize_ligatures and
+    normalize_word_boundaries keyword parameters
+"""
+
+import pytest
+
+from DocTest.PdfStructureModels import (
+    DocumentStructure,
+    PageStructure,
+    StructureExtractionConfig,
+    TextBlock,
+    TextLine,
+    TextSpan,
+    WordToken,
+    flatten_document_words,
+)
+from DocTest.PdfStructureComparator import (
+    StructureComparisonResult,
+    compare_document_words,
+)
+from DocTest.TextNormalization import (
+    _WORD_BOUNDARY_CONNECTORS,
+    merge_split_words,
+    normalize_ligatures,
+)
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+
+def _make_structure(pages_data):
+    """Assemble a DocumentStructure from plain strings.
+
+    ``pages_data`` is a list of pages; each page is a list of strings and
+    every string becomes one TextLine inside that page's single TextBlock.
+    """
+    built_pages = []
+    for number, page_lines in enumerate(pages_data, start=1):
+        # Lines are stacked top to bottom, each one 12 units tall.
+        rendered = [
+            TextLine(
+                index=pos,
+                text=content,
+                bbox=(0.0, float(pos * 12), 100.0, float(pos * 12 + 12)),
+                fonts=set(),
+                spans=[TextSpan(text=content, font="Arial", size=12.0)],
+            )
+            for pos, content in enumerate(page_lines)
+        ]
+        only_block = TextBlock(
+            index=0,
+            bbox=(0.0, 0.0, 100.0, float(len(page_lines) * 12)),
+            lines=rendered,
+        )
+        page = PageStructure(
+            page_number=number, width=612.0, height=792.0, blocks=[only_block],
+        )
+        built_pages.append(page)
+    return DocumentStructure(pages=built_pages, config=StructureExtractionConfig())
+
+
+def _make_tokens_from_lines(*line_groups):
+    """Expand line descriptions into parallel word and token lists.
+
+    Each positional argument is a ``(line_index, page, words_string)``
+    tuple.  The string is split on whitespace and every piece becomes a
+    WordToken tagged with that page and line index; ``word_index`` keeps
+    counting across groups from zero.  Returns ``(words, tokens)``.
+    """
+    words = []
+    tokens = []
+    for line_index, page, text in line_groups:
+        for piece in text.split():
+            # Both lists grow in lockstep, so len(words) is the number of
+            # words emitted so far, i.e. the running word index.
+            token = WordToken(
+                text=piece,
+                source_page=page,
+                source_line_index=line_index,
+                word_index=len(words),
+            )
+            words.append(piece)
+            tokens.append(token)
+    return words, tokens
+
+
+# ===========================================================================
+# TestMergeSplitWords
+# ===========================================================================
+
+
+class TestMergeSplitWords:
+    """Exercises merge_split_words() from DocTest.TextNormalization."""
+
+    def test_empty_input(self):
+        """Merging two empty lists yields two empty lists."""
+        merged_words, merged_tokens = merge_split_words([], [])
+        assert merged_words == []
+        assert merged_tokens == []
+
+    def test_single_word(self):
+        """A lone word has nothing to merge with and comes back as it went in."""
+        only_token = WordToken(text="hello", source_page=1, source_line_index=0, word_index=0)
+        merged_words, merged_tokens = merge_split_words(["hello"], [only_token])
+        # One word in, the same single word out.
+        assert merged_words == ["hello"]
+        assert len(merged_tokens) == 1
+        assert merged_tokens[0].text == "hello"
+
+    def test_no_merge_same_line(self):
+        """A trailing connector is ignored when both words sit on the same line."""
+        src_words, src_tokens = _make_tokens_from_lines(
+            (0, 1, "path/ file"),
+        )
+        merged_words, merged_tokens = merge_split_words(src_words, src_tokens)
+        assert merged_words == ["path/", "file"]
+        assert len(merged_tokens) == 2
+
+    def test_merge_slash_connector(self):
+        """A word ending in / is joined with the first word of the next line."""
+        src_words, src_tokens = _make_tokens_from_lines(
+            (0, 1, "path/to/"),
+            (1, 1, "file"),
+        )
+        merged_words, merged_tokens = merge_split_words(src_words, src_tokens)
+        assert merged_words == ["path/to/file"]
+        assert len(merged_tokens) == 1
+
+    def test_merge_hyphen_connector(self):
+        """A word ending in - is joined with the first word of the next line."""
+        src_words, src_tokens = _make_tokens_from_lines(
+            (0, 1, "anti-"),
+            (1, 1, "virus"),
+        )
+        merged_words, merged_tokens = merge_split_words(src_words, src_tokens)
+        assert merged_words == ["anti-virus"]
+        assert len(merged_tokens) == 1
+
+    def test_merge_backslash_connector(self):
+        r"""A word ending in \ is joined with the first word of the next line."""
+        src_words, src_tokens = _make_tokens_from_lines(
+            (0, 1, "C:\\Users\\"),
+            (1, 1, "name"),
+        )
+        merged_words, merged_tokens = merge_split_words(src_words, src_tokens)
+        assert merged_words == ["C:\\Users\\name"]
+        assert len(merged_tokens) == 1
+
+    def test_no_merge_without_connector(self):
+        """A line break alone is not enough: without a connector nothing is joined."""
+        src_words, src_tokens = _make_tokens_from_lines(
+            (0, 1, "hello"),
+            (1, 1, "world"),
+        )
+        merged_words, merged_tokens = merge_split_words(src_words, src_tokens)
+        assert merged_words == ["hello", "world"]
+        assert len(merged_tokens) == 2
+
+    def test_multiple_consecutive_merges(self):
+        """Merges chain: a/ + b/ + c on three separate lines collapse into a/b/c."""
+        src_words, src_tokens = _make_tokens_from_lines(
+            (0, 1, "a/"),
+            (1, 1, "b/"),
+            (2, 1, "c"),
+        )
+        merged_words, merged_tokens = merge_split_words(src_words, src_tokens)
+        assert merged_words == ["a/b/c"]
+        assert len(merged_tokens) == 1
+
+    def test_custom_connectors(self):
+        """Passing connectors={"_"} makes a trailing underscore trigger the merge."""
+        src_words, src_tokens = _make_tokens_from_lines(
+            (0, 1, "foo_"),
+            (1, 1, "bar"),
+        )
+        merged_words, merged_tokens = merge_split_words(src_words, src_tokens, connectors={"_"})
+        assert merged_words == ["foo_bar"]
+        assert len(merged_tokens) == 1
+
+    def test_custom_connectors_ignores_default(self):
+        """With connectors={"_"} a trailing slash no longer triggers a merge."""
+        src_words, src_tokens = _make_tokens_from_lines(
+            (0, 1, "path/"),
+            (1, 1, "file"),
+        )
+        merged_words, merged_tokens = merge_split_words(src_words, src_tokens, connectors={"_"})
+        assert merged_words == ["path/", "file"]
+        assert len(merged_tokens) == 2
+
+    def test_token_provenance_preserved(self):
+        """The merged token takes page, line index and word index from the first token."""
+        head = WordToken(text="JS2/", source_page=2, source_line_index=5, word_index=10)
+        tail = WordToken(text="H8", source_page=2, source_line_index=6, word_index=11)
+        merged_words, merged_tokens = merge_split_words(["JS2/", "H8"], [head, tail])
+        assert merged_words == ["JS2/H8"]
+        assert len(merged_tokens) == 1
+        # Position metadata comes from ``head``; only the text is the joined string.
+        survivor = merged_tokens[0]
+        assert (survivor.source_page, survivor.source_line_index, survivor.word_index) == (2, 5, 10)
+        assert survivor.text == "JS2/H8"
+
+    def test_mixed_merge_and_no_merge(self):
+        """Within one input some neighbours are joined while others stay apart."""
+        # Input line 0 holds "start path/" and input line 1 holds "file end".
+        #   "start"  -> no trailing connector, so it stays separate from "path/"
+        #   "path/"  -> trailing connector and the next word is on another line,
+        #               so it is joined with "file"
+        #   "end"    -> follows "path/file", which has no trailing connector
+        src_words, src_tokens = _make_tokens_from_lines(
+            (0, 1, "start path/"),
+            (1, 1, "file end"),
+        )
+        merged_words, merged_tokens = merge_split_words(src_words, src_tokens)
+        assert merged_words == ["start", "path/file", "end"]
+        assert len(merged_tokens) == 3
+
+    def test_merge_realistic_part_number(self):
+        """Part-number style input: JS2_D48/F16/ on line 5 continues with H8 on line 6."""
+        src_words, src_tokens = _make_tokens_from_lines(
+            (5, 1, "JS2_D48/F16/"),
+            (6, 1, "H8"),
+        )
+        merged_words, merged_tokens = merge_split_words(src_words, src_tokens)
+        assert merged_words == ["JS2_D48/F16/H8"]
+        assert len(merged_tokens) == 1
+
+
+# ===========================================================================
+# TestFlattenDocumentWordsNormalization
+# ===========================================================================
+
+
+class TestFlattenDocumentWordsNormalization:
+    """Tests for flatten_document_words() and its two opt-in normalization flags."""
+
+    def test_default_no_normalization(self):
+        """Without flags the words come back split on whitespace, in line order."""
+        doc = _make_structure([["hello world", "foo bar"]])
+        words, tokens = flatten_document_words(doc)
+        assert words == ["hello", "world", "foo", "bar"]
+        assert len(tokens) == 4
+
+    def test_ligature_normalization(self):
+        """Ligature code points are expanded to ASCII when normalize_ligatures_in_words=True."""
+        # \ufb01 = fi ligature, \ufb02 = fl ligature
+        doc = _make_structure([["\ufb01le on the \ufb02oor"]])
+        words, _ = flatten_document_words(
+            doc, normalize_ligatures_in_words=True,
+        )
+        # The ligature spellings must disappear from the output entirely;
+        # accepting either spelling here would let a no-op normalizer pass.
+        assert "\ufb01le" not in words
+        assert "\ufb02oor" not in words
+        # ... and the ASCII expansions must take their place.
+        assert "file" in words
+        assert "floor" in words
+
+    def test_ligature_normalization_disabled_by_default(self):
+        """Ligatures are preserved when normalize_ligatures_in_words is not set."""
+        doc = _make_structure([["\ufb01le"]])
+        words, _ = flatten_document_words(doc)
+        # Without normalization, the ligature character should be preserved
+        assert words == ["\ufb01le"]
+
+    def test_word_boundary_normalization(self):
+        """Words split across lines get merged when normalize_word_boundaries=True."""
+        doc = _make_structure([["path/to/", "file here"]])
+        words, _ = flatten_document_words(
+            doc, normalize_word_boundaries=True,
+        )
+        assert "path/to/file" in words
+        assert "here" in words
+
+    def test_word_boundary_normalization_disabled_by_default(self):
+        """Word boundaries are not merged by default."""
+        doc = _make_structure([["path/to/", "file here"]])
+        words, _ = flatten_document_words(doc)
+        assert "path/to/" in words
+        assert "file" in words
+        assert "path/to/file" not in words
+
+    def test_both_normalizations(self):
+        """Ligatures normalized AND boundaries merged when both enabled."""
+        # Line 1 ends with a ligature word plus connector; line 2 continues it.
+        doc = _make_structure([
+            ["the \ufb01le-", "name is test"],
+        ])
+        words, _ = flatten_document_words(
+            doc,
+            normalize_ligatures_in_words=True,
+            normalize_word_boundaries=True,
+        )
+        # Ligature should be normalized: \ufb01le- -> file-
+        # Then merge across line boundary: file- + name -> file-name
+        assert "file-name" in words
+        assert "is" in words
+        assert "test" in words
+
+    def test_ligature_then_merge_order(self):
+        """A ligature word ending in a connector is both expanded and merged.
+
+        The expected word is what results from resolving the ligature first
+        and joining across the line break afterwards.
+        """
+        # \ufb01 (fi) + "x/" on line 0, "bar" on line 1
+        doc = _make_structure([
+            ["\ufb01x/", "bar"],
+        ])
+        words, _ = flatten_document_words(
+            doc,
+            normalize_ligatures_in_words=True,
+            normalize_word_boundaries=True,
+        )
+        # First ligature: \ufb01x/ -> fix/
+        # Then merge: fix/ + bar -> fix/bar
+        assert "fix/bar" in words
+
+    def test_tokens_length_matches_words(self):
+        """After merging, words and tokens stay parallel and each token.text equals its word."""
+        doc = _make_structure([
+            ["path/", "file test"],
+        ])
+        words, tokens = flatten_document_words(
+            doc, normalize_word_boundaries=True,
+        )
+        assert len(words) == len(tokens)
+        for word, token in zip(words, tokens):
+            assert word == token.text
+
+
+# ===========================================================================
+# TestCompareDocumentWordsNormalization
+# ===========================================================================
+
+
+class TestCompareDocumentWordsNormalization:
+    """Exercises the normalization keyword arguments of compare_document_words()."""
+
+    def test_ligature_diff_without_normalization(self):
+        """With default arguments a ligature spelling differs from its ASCII spelling."""
+        reference = _make_structure([["the \ufb01le is here"]])
+        candidate = _make_structure([["the file is here"]])
+        outcome = compare_document_words(reference, candidate)
+        assert not outcome.passed
+        assert len(outcome.word_differences) >= 1
+
+    def test_ligature_diff_with_normalization(self):
+        """normalize_ligatures=True makes ligature and ASCII spellings compare equal."""
+        reference = _make_structure([["the \ufb01le is here"]])
+        candidate = _make_structure([["the file is here"]])
+        outcome = compare_document_words(
+            reference, candidate, normalize_ligatures=True,
+        )
+        assert outcome.passed
+        assert outcome.word_differences == []
+
+    def test_word_boundary_diff_without_normalization(self):
+        """With default arguments a word broken across lines counts as a difference."""
+        # The reference breaks "path/to/file" after the second slash ...
+        reference = _make_structure([["path/to/", "file here"]])
+        # ... while the candidate keeps it intact on a single line.
+        candidate = _make_structure([["path/to/file here"]])
+        outcome = compare_document_words(reference, candidate)
+        assert not outcome.passed
+        assert len(outcome.word_differences) >= 1
+
+    def test_word_boundary_diff_with_normalization(self):
+        """normalize_word_boundaries=True makes the broken and intact word compare equal."""
+        reference = _make_structure([["path/to/", "file here"]])
+        candidate = _make_structure([["path/to/file here"]])
+        outcome = compare_document_words(
+            reference, candidate, normalize_word_boundaries=True,
+        )
+        assert outcome.passed
+        assert outcome.word_differences == []
+
+    def test_combined_normalization(self):
+        """With both flags on, a ligature word broken across lines matches its plain form."""
+        # Reference side: ligature spelling, broken after the hyphen.
+        reference = _make_structure([
+            ["the \ufb01le-", "name is ready"],
+        ])
+        # Candidate side: ASCII spelling, all on one line.
+        candidate = _make_structure([
+            ["the file-name is ready"],
+        ])
+        outcome = compare_document_words(
+            reference, candidate,
+            normalize_word_boundaries=True,
+            normalize_ligatures=True,
+        )
+        assert outcome.passed
+        assert outcome.word_differences == []
+
+    def test_combined_normalization_still_detects_real_diffs(self):
+        """Turning both flags on does not hide a word that was really added."""
+        reference = _make_structure([["the \ufb01le is ready"]])
+        candidate = _make_structure([["the file is NOT ready"]])
+        outcome = compare_document_words(
+            reference, candidate,
+            normalize_word_boundaries=True,
+            normalize_ligatures=True,
+        )
+        assert not outcome.passed
+        assert len(outcome.word_differences) >= 1
+
+    def test_normalization_with_case_insensitive(self):
+        """Ligature normalization and case_sensitive=False can be combined."""
+        reference = _make_structure([["The \ufb01le is HERE"]])
+        candidate = _make_structure([["the file is here"]])
+        outcome = compare_document_words(
+            reference, candidate,
+            normalize_ligatures=True,
+            case_sensitive=False,
+        )
+        assert outcome.passed
+        assert outcome.word_differences == []
+
+    def test_result_type(self):
+        """The return value is a StructureComparisonResult when both flags are set."""
+        reference = _make_structure([["hello"]])
+        candidate = _make_structure([["hello"]])
+        outcome = compare_document_words(
+            reference, candidate,
+            normalize_word_boundaries=True,
+            normalize_ligatures=True,
+        )
+        assert isinstance(outcome, StructureComparisonResult)
+
+    def test_word_boundary_across_pages(self):
+        """A word broken at a page break is joined just like one broken at a line break."""
+        # Page 1 of the reference ends in "section/" and page 2 opens with "header".
+        reference = _make_structure([
+            ["section/"],
+            ["header rest"],
+        ])
+        candidate = _make_structure([
+            ["section/header rest"],
+        ])
+        outcome = compare_document_words(
+            reference, candidate, normalize_word_boundaries=True,
+        )
+        assert outcome.passed
+        assert outcome.word_differences == []
+
+    def test_multiple_ligatures_in_document(self):
+        """The fi, fl and ffi ligatures spread over two pages are all normalized."""
+        reference = _make_structure([
+            ["\ufb01rst \ufb02oor"],
+            ["\ufb03ce work"],
+        ])
+        candidate = _make_structure([
+            ["first floor"],
+            ["ffice work"],
+        ])
+        outcome = compare_document_words(
+            reference, candidate, normalize_ligatures=True,
+        )
+        assert outcome.passed
+        assert outcome.word_differences == []