diff --git a/DocTest/DocumentRepresentation.py b/DocTest/DocumentRepresentation.py
index b4ab734..9ed97e3 100644
--- a/DocTest/DocumentRepresentation.py
+++ b/DocTest/DocumentRepresentation.py
@@ -17,6 +17,7 @@
PageStructure,
StructureExtractionConfig,
build_page_structure,
+ build_page_structure_from_words,
)
from DocTest.config import DEFAULT_DPI, OCR_ENGINE_DEFAULT, DEFAULT_CONFIDENCE, MINIMUM_OCR_RESOLUTION, ADD_PIXELS_TO_IGNORE_AREA, TESSERACT_CONFIG
import tempfile
@@ -197,13 +198,27 @@ def get_pdf_structure(self, config: Optional[StructureExtractionConfig] = None)
cached = self._structure_cache.get(config)
if cached:
return cached
- structure = build_page_structure(
- page_number=self.page_number,
- pdf_dict=self.pdf_text_dict,
- config=config,
- dpi=self.dpi,
- image_shape=self.image.shape,
- )
+ if config.spatial_word_sorting and self.pdf_text_words:
+ # Derive page dimensions from the dict if available.
+ pw = float(self.pdf_text_dict.get("width", 0)) if self.pdf_text_dict else 0.0
+ ph = float(self.pdf_text_dict.get("height", 0)) if self.pdf_text_dict else 0.0
+ structure = build_page_structure_from_words(
+ page_number=self.page_number,
+ pdf_text_words=self.pdf_text_words,
+ config=config,
+ page_width=pw,
+ page_height=ph,
+ dpi=self.dpi,
+ image_shape=self.image.shape,
+ )
+ else:
+ structure = build_page_structure(
+ page_number=self.page_number,
+ pdf_dict=self.pdf_text_dict,
+ config=config,
+ dpi=self.dpi,
+ image_shape=self.image.shape,
+ )
self._structure_cache[config] = structure
return structure
diff --git a/DocTest/HeaderFooterDetector.py b/DocTest/HeaderFooterDetector.py
new file mode 100644
index 0000000..b31534f
--- /dev/null
+++ b/DocTest/HeaderFooterDetector.py
@@ -0,0 +1,230 @@
+"""Repetition-based header/footer detection for PDF structure comparison.
+
+Scans configurable vertical regions at the top/bottom of each page, identifies
+text lines that repeat across multiple pages (with digit normalization for page
+numbers), and removes them from the DocumentStructure before comparison.
+
+This module is a pure-function domain service with no side effects, no Robot
+Framework dependency, and no I/O.
+"""
+
+from __future__ import annotations
+
+import re
+from collections import defaultdict
+from dataclasses import dataclass
+from typing import Dict, FrozenSet, List, Set
+
+from DocTest.PdfStructureModels import (
+ DocumentStructure,
+ PageStructure,
+ TextBlock,
+ TextLine,
+)
+
+__all__ = [
+ "HeaderFooterConfig",
+ "DetectionResult",
+ "detect_repeating_headers_footers",
+ "strip_detected_headers_footers",
+ "filter_headers_footers",
+]
+
+_DIGIT_RUN_RE = re.compile(r"\d+")
+
+
+@dataclass(frozen=True)
+class HeaderFooterConfig:
+    """Settings that drive repetition-based header/footer detection.
+
+    Both scan heights are compared directly against line bounding-box
+    coordinates, so they use the same unit as ``TextLine.bbox``. A height
+    of ``0`` (the default) leaves the corresponding region switched off.
+    """
+
+    # Lines whose bbox[1] is below this value count as header candidates.
+    header_scan_height: float = 0.0
+    # Lines whose bbox[3] exceeds (page height - this value) count as footer candidates.
+    footer_scan_height: float = 0.0
+    # Minimum number of distinct pages a normalized line must appear on.
+    repeat_threshold: int = 2
+
+    @property
+    def enabled(self) -> bool:
+        """Tell whether a header or footer scan region has a positive height."""
+        scan_heights = (self.header_scan_height, self.footer_scan_height)
+        return any(height > 0 for height in scan_heights)
+
+
+@dataclass(frozen=True)
+class DetectionResult:
+    """Frozen pair of normalized line keys classified as headers and footers."""
+
+    # Digit-normalized texts that repeated inside the header scan region.
+    header_keys: FrozenSet[str]
+    # Digit-normalized texts that repeated inside the footer scan region.
+    footer_keys: FrozenSet[str]
+
+    @property
+    def has_detections(self) -> bool:
+        """Tell whether at least one header or footer key was recorded."""
+        if self.header_keys:
+            return True
+        return bool(self.footer_keys)
+
+
+def _normalize_for_grouping(text: str) -> str:
+    """Collapse every run of digits in ``text`` to a single ``#`` placeholder.
+
+    Lines that differ only in their numbers (typically page counters) end up
+    with the same grouping key.
+
+    Examples:
+        "Page 1 of 5" -> "Page # of #"
+        "ACME Corp"   -> "ACME Corp"  (no digits, unchanged)
+        "- 3 -"       -> "- # -"
+    """
+    # Cutting the text at each digit run and gluing the pieces back together
+    # with '#' yields exactly one placeholder per run.
+    non_digit_pieces = _DIGIT_RUN_RE.split(text)
+    return "#".join(non_digit_pieces)
+
+
+def detect_repeating_headers_footers(
+    structure: DocumentStructure,
+    config: HeaderFooterConfig,
+) -> DetectionResult:
+    """Find text lines that recur across pages inside the header/footer regions.
+
+    A non-empty line is a header candidate when ``bbox[1]`` is smaller than
+    ``config.header_scan_height`` and a footer candidate when ``bbox[3]`` is
+    larger than ``page.height - config.footer_scan_height``. Candidates are
+    grouped by their digit-normalized text; a group is reported once it was
+    seen on at least ``config.repeat_threshold`` distinct page numbers.
+
+    Pages whose ``height`` is not positive are skipped for footer detection:
+    the footer boundary cannot be located on them, and scanning anyway would
+    classify every line of the page as footer text.
+
+    Args:
+        structure: The document to scan.
+        config: Detection parameters (scan heights and threshold).
+
+    Returns:
+        A DetectionResult holding the normalized keys of the detected header
+        and footer lines. Both sets are empty when ``config.enabled`` is
+        False.
+    """
+    if not config.enabled:
+        return DetectionResult(header_keys=frozenset(), footer_keys=frozenset())
+
+    scan_header = config.header_scan_height > 0
+    threshold = config.repeat_threshold
+
+    # Normalized text -> set of page numbers on which it was seen in the region.
+    header_candidates: Dict[str, Set[int]] = defaultdict(set)
+    footer_candidates: Dict[str, Set[int]] = defaultdict(set)
+
+    for page in structure.pages:
+        # With height <= 0 the boundary below would be negative and match
+        # every line, so the footer scan is only run for a known page height.
+        scan_footer = config.footer_scan_height > 0 and page.height > 0
+        footer_boundary = page.height - config.footer_scan_height
+
+        for block in page.blocks:
+            for line in block.lines:
+                text = line.text or ""
+                if not text:
+                    continue
+                key = _normalize_for_grouping(text)
+
+                if scan_header and line.bbox[1] < config.header_scan_height:
+                    header_candidates[key].add(page.page_number)
+
+                if scan_footer and line.bbox[3] > footer_boundary:
+                    footer_candidates[key].add(page.page_number)
+
+    return DetectionResult(
+        header_keys=frozenset(
+            key for key, pages in header_candidates.items() if len(pages) >= threshold
+        ),
+        footer_keys=frozenset(
+            key for key, pages in footer_candidates.items() if len(pages) >= threshold
+        ),
+    )
+
+
+def strip_detected_headers_footers(
+    structure: DocumentStructure,
+    detection: DetectionResult,
+    config: HeaderFooterConfig,
+) -> DocumentStructure:
+    """Remove detected header/footer lines from a DocumentStructure.
+
+    A line is dropped only when it (a) matches a detected normalized key AND
+    (b) lies inside the corresponding scan region. Lines with the same text
+    outside the regions are kept. Surviving lines are renumbered per page,
+    blocks that lose all their lines are dropped, and kept blocks retain
+    their original ``index`` and ``bbox``.
+
+    Pages whose ``height`` is not positive are never footer-stripped: the
+    footer boundary cannot be located on them, and stripping anyway would
+    remove matching lines from the whole page, body text included.
+
+    Args:
+        structure: The document to filter.
+        detection: The detection result from detect_repeating_headers_footers().
+        config: The same config used for detection (needed for region bounds).
+
+    Returns:
+        The input ``structure`` itself when ``detection`` is empty; otherwise
+        a new DocumentStructure with the header/footer lines removed.
+    """
+    if not detection.has_detections:
+        return structure
+
+    strip_header = config.header_scan_height > 0
+    filtered_pages: List[PageStructure] = []
+
+    for page in structure.pages:
+        # With height <= 0 the boundary below would be negative and match
+        # every line, so footer stripping requires a known page height.
+        strip_footer = config.footer_scan_height > 0 and page.height > 0
+        footer_boundary = page.height - config.footer_scan_height
+        new_blocks: List[TextBlock] = []
+        next_line_index = 0  # Surviving lines are renumbered from 0 on each page.
+
+        for block in page.blocks:
+            new_lines: List[TextLine] = []
+
+            for line in block.lines:
+                text = line.text or ""
+                key = _normalize_for_grouping(text)
+
+                in_header = strip_header and line.bbox[1] < config.header_scan_height
+                if in_header and key in detection.header_keys:
+                    continue
+
+                in_footer = strip_footer and line.bbox[3] > footer_boundary
+                if in_footer and key in detection.footer_keys:
+                    continue
+
+                # Copy the containers so the result shares no mutable state
+                # with the input line.
+                new_lines.append(
+                    TextLine(
+                        index=next_line_index,
+                        text=text,
+                        bbox=line.bbox,
+                        fonts=set(line.fonts),
+                        spans=list(line.spans),
+                    )
+                )
+                next_line_index += 1
+
+            if new_lines:
+                new_blocks.append(
+                    TextBlock(
+                        index=block.index,
+                        bbox=block.bbox,
+                        lines=new_lines,
+                    )
+                )
+
+        filtered_pages.append(
+            PageStructure(
+                page_number=page.page_number,
+                width=page.width,
+                height=page.height,
+                blocks=new_blocks,
+            )
+        )
+
+    return DocumentStructure(pages=filtered_pages, config=structure.config)
+
+
+def filter_headers_footers(
+    structure: DocumentStructure,
+    config: HeaderFooterConfig,
+) -> DocumentStructure:
+    """Detect repeating headers/footers and strip them in a single step.
+
+    Shorthand for calling detect_repeating_headers_footers() followed by
+    strip_detected_headers_footers() with the same ``config``.
+
+    Args:
+        structure: The document to process.
+        config: Detection parameters; also supplies the region bounds used
+            while stripping.
+
+    Returns:
+        The input ``structure`` unchanged when ``config.enabled`` is False;
+        otherwise whatever strip_detected_headers_footers() returns for the
+        detected keys.
+    """
+    if config.enabled:
+        detected = detect_repeating_headers_footers(structure, config)
+        return strip_detected_headers_footers(structure, detected, config)
+    return structure
diff --git a/DocTest/PdfStructureComparator.py b/DocTest/PdfStructureComparator.py
index 54db9cd..1cc3a05 100644
--- a/DocTest/PdfStructureComparator.py
+++ b/DocTest/PdfStructureComparator.py
@@ -4,16 +4,18 @@
from dataclasses import dataclass, field
from typing import Dict, Iterable, List, Optional, Sequence, Tuple
-from DocTest.PdfStructureModels import DocumentStructure, PageStructure, TextLine
+from DocTest.PdfStructureModels import DocumentStructure, PageStructure, TextLine, WordToken
__all__ = [
"StructureTolerance",
"LineDifference",
"DocumentTextDifference",
+ "DocumentWordDifference",
"StructureComparisonResult",
"compare_document_structures",
"compare_document_text_only",
+ "compare_document_words",
]
@@ -54,6 +56,20 @@ class DocumentTextDifference:
cand_index: Optional[int] = None
+@dataclass
+class DocumentWordDifference:
+ """Details about word-level content mismatch in page-agnostic comparison.
+
+ The ordered comparison fills the index fields with the bounds of the
+ SequenceMatcher opcode that produced the record (end index exclusive).
+ The unordered comparison sets the start index to 0 and the end index to
+ the number of reported words.
+ """
+
+ # Category of the difference; the three values in use are listed on the right.
+ diff_type: str # "missing_words", "extra_words", "word_mismatch"
+ # Human-readable description of the difference.
+ message: str
+ # Reference-side words as extracted (before any lower-casing); None when
+ # the record concerns the candidate only.
+ ref_words: Optional[List[str]] = None
+ # Candidate-side words as extracted (before any lower-casing); None when
+ # the record concerns the reference only.
+ cand_words: Optional[List[str]] = None
+ # Bounds of the affected words in the flattened reference word list.
+ ref_start_index: Optional[int] = None
+ ref_end_index: Optional[int] = None
+ # Bounds of the affected words in the flattened candidate word list.
+ cand_start_index: Optional[int] = None
+ cand_end_index: Optional[int] = None
+
+
@dataclass
class StructureComparisonResult:
"""Aggregate differences found during structure comparison."""
@@ -61,6 +77,7 @@ class StructureComparisonResult:
passed: bool = True
page_differences: Dict[int, List[LineDifference]] = field(default_factory=dict)
document_differences: List[DocumentTextDifference] = field(default_factory=list)
+ word_differences: List[DocumentWordDifference] = field(default_factory=list)
summary: List[str] = field(default_factory=list)
def add_difference(self, diff: LineDifference):
@@ -72,13 +89,18 @@ def add_document_difference(self, diff: DocumentTextDifference):
self.passed = False
self.document_differences.append(diff)
+ def add_word_difference(self, diff: DocumentWordDifference):
+     """Record a word-level difference and flag the comparison as failed."""
+     self.word_differences.append(diff)
+     self.passed = False
+
def extend_summary(self, message: str):
self.summary.append(message)
def difference_count(self) -> int:
- """Return total count of all differences (page-level and document-level)."""
+ """Return total count of all differences (page-level, document-level, and word-level)."""
page_diff_count = sum(len(diffs) for diffs in self.page_differences.values())
- return page_diff_count + len(self.document_differences)
+ return page_diff_count + len(self.document_differences) + len(self.word_differences)
def compare_document_structures(
@@ -229,6 +251,210 @@ def compare_document_text_only(
return result
+def _excess_originals(
+    normalized: List[str],
+    originals: List[str],
+    excess: Dict[str, int],
+) -> List[str]:
+    """Return the original spelling of the first ``excess[word]`` hits of each word."""
+    remaining = dict(excess)
+    picked: List[str] = []
+    for norm_word, orig_word in zip(normalized, originals):
+        if remaining.get(norm_word, 0) > 0:
+            picked.append(orig_word)
+            remaining[norm_word] -= 1
+    return picked
+
+
+def _preview_words(words: List[str], limit: int = 10) -> str:
+    """Join the first ``limit`` words and note how many were left out."""
+    preview = " ".join(words[:limit])
+    if len(words) > limit:
+        preview += f" ... (+{len(words) - limit} more)"
+    return preview
+
+
+def _compare_words_unordered(
+    ref_words: List[str],
+    ref_originals: List[str],
+    cand_words: List[str],
+    cand_originals: List[str],
+) -> StructureComparisonResult:
+    """Compare two word lists as multisets (bag-of-words), ignoring order.
+
+    Only word frequencies are checked, so identical content that appears in
+    a different order is treated as equal.
+
+    Args:
+        ref_words: Reference words in the form used for matching.
+        ref_originals: Reference words as they should appear in reports;
+            paired position by position with ``ref_words``.
+        cand_words: Candidate words in the form used for matching.
+        cand_originals: Candidate words as they should appear in reports;
+            paired position by position with ``cand_words``.
+
+    Returns:
+        A StructureComparisonResult with at most two word differences:
+        ``missing_words`` for words the reference has in excess and
+        ``extra_words`` for words the candidate has in excess. The index
+        fields are set to 0 and the number of reported words.
+    """
+    from collections import Counter
+
+    result = StructureComparisonResult()
+
+    ref_counts = Counter(ref_words)
+    cand_counts = Counter(cand_words)
+
+    # Counter subtraction keeps positive counts only, i.e. the surplus of
+    # one side over the other.
+    missing = _excess_originals(ref_words, ref_originals, ref_counts - cand_counts)
+    if missing:
+        result.add_word_difference(
+            DocumentWordDifference(
+                diff_type="missing_words",
+                message=f"Words in reference not found in candidate (unordered): '{_truncate_text(_preview_words(missing), 120)}'",
+                ref_words=missing,
+                ref_start_index=0,
+                ref_end_index=len(missing),
+            )
+        )
+
+    extra = _excess_originals(cand_words, cand_originals, cand_counts - ref_counts)
+    if extra:
+        result.add_word_difference(
+            DocumentWordDifference(
+                diff_type="extra_words",
+                message=f"Extra words in candidate not found in reference (unordered): '{_truncate_text(_preview_words(extra), 120)}'",
+                cand_words=extra,
+                cand_start_index=0,
+                cand_end_index=len(extra),
+            )
+        )
+
+    return result
+
+
+def compare_document_words(
+    reference: DocumentStructure,
+    candidate: DocumentStructure,
+    *,
+    case_sensitive: bool = True,
+    normalize_ligatures: bool = False,
+    normalize_word_boundaries: bool = False,
+    compare_order: str = "ordered",
+) -> StructureComparisonResult:
+    """Diff two documents word by word, disregarding line and page breaks.
+
+    Both documents are flattened into word lists. In ordered mode a
+    SequenceMatcher (``autojunk`` disabled) aligns the lists and every
+    non-equal opcode becomes one difference record covering the whole
+    opcode range. Reported words keep their extracted spelling even when
+    matching is case-insensitive.
+
+    Args:
+        reference: The reference document structure.
+        candidate: The candidate document structure to compare.
+        case_sensitive: When False, words are lower-cased before matching.
+        normalize_ligatures: Forwarded to ``flatten_document_words`` as
+            ``normalize_ligatures_in_words``.
+        normalize_word_boundaries: Forwarded to ``flatten_document_words``
+            to merge tokens split across line boundaries.
+        compare_order: ``"unordered"`` selects the bag-of-words comparison
+            that ignores word order; every other value runs the ordered
+            SequenceMatcher comparison.
+
+    Returns:
+        A StructureComparisonResult with document-level word differences.
+    """
+    from DocTest.PdfStructureModels import flatten_document_words
+
+    flatten_options = {
+        "normalize_word_boundaries": normalize_word_boundaries,
+        "normalize_ligatures_in_words": normalize_ligatures,
+    }
+    # The provenance tokens returned alongside the words are not needed here.
+    ref_originals, _ref_tokens = flatten_document_words(reference, **flatten_options)
+    cand_originals, _cand_tokens = flatten_document_words(candidate, **flatten_options)
+
+    # Matching keys: lower-cased copies when case is ignored; reports always
+    # quote the untouched originals.
+    if case_sensitive:
+        ref_keys, cand_keys = ref_originals, cand_originals
+    else:
+        ref_keys = [word.lower() for word in ref_originals]
+        cand_keys = [word.lower() for word in cand_originals]
+
+    if compare_order == "unordered":
+        return _compare_words_unordered(ref_keys, ref_originals, cand_keys, cand_originals)
+
+    result = StructureComparisonResult()
+    matcher = difflib.SequenceMatcher(a=ref_keys, b=cand_keys, autojunk=False)
+
+    for tag, i1, i2, j1, j2 in matcher.get_opcodes():
+        # An empty range on one side is reported as None rather than [].
+        ref_slice = ref_originals[i1:i2] if i1 < i2 else None
+        cand_slice = cand_originals[j1:j2] if j1 < j2 else None
+        ref_preview = _truncate_text(" ".join(ref_slice or []), 80)
+        cand_preview = _truncate_text(" ".join(cand_slice or []), 80)
+
+        if tag == "replace":
+            difference = DocumentWordDifference(
+                diff_type="word_mismatch",
+                message=(
+                    f"Word mismatch at positions {i1}-{i2 - 1}: "
+                    f"reference='{ref_preview}', "
+                    f"candidate='{cand_preview}'"
+                ),
+                ref_words=ref_slice,
+                cand_words=cand_slice,
+                ref_start_index=i1,
+                ref_end_index=i2,
+                cand_start_index=j1,
+                cand_end_index=j2,
+            )
+        elif tag == "delete":
+            difference = DocumentWordDifference(
+                diff_type="missing_words",
+                message=(
+                    f"Words missing in candidate at positions {i1}-{i2 - 1}: "
+                    f"'{ref_preview}'"
+                ),
+                ref_words=ref_slice,
+                ref_start_index=i1,
+                ref_end_index=i2,
+            )
+        elif tag == "insert":
+            difference = DocumentWordDifference(
+                diff_type="extra_words",
+                message=(
+                    f"Extra words in candidate at positions {j1}-{j2 - 1}: "
+                    f"'{cand_preview}'"
+                ),
+                cand_words=cand_slice,
+                cand_start_index=j1,
+                cand_end_index=j2,
+            )
+        else:
+            # "equal" ranges carry no difference.
+            continue
+
+        result.add_word_difference(difference)
+
+    return result
+
+
+def _truncate_text(text: str, max_length: int) -> str:
+    """Shorten ``text`` to at most ``max_length`` characters.
+
+    Text that already fits is returned unchanged. Longer text is cut and
+    ends in ``...``. When ``max_length`` is too small to hold the marker,
+    the text is cut hard at ``max_length`` characters (empty for zero or
+    negative limits) so the result never exceeds the limit.
+    """
+    if len(text) <= max_length:
+        return text
+    ellipsis = "..."
+    if max_length < len(ellipsis):
+        # A negative slice bound here would keep almost the whole text and
+        # then append the marker, overshooting the requested length.
+        return text[: max(max_length, 0)]
+    return text[: max_length - len(ellipsis)] + ellipsis
+
+
def _compare_page(
ref_page: PageStructure,
cand_page: PageStructure,
diff --git a/DocTest/PdfStructureModels.py b/DocTest/PdfStructureModels.py
index 8bfa6ef..153ae61 100644
--- a/DocTest/PdfStructureModels.py
+++ b/DocTest/PdfStructureModels.py
@@ -13,11 +13,14 @@
"PageStructure",
"DocumentStructure",
"StructureExtractionConfig",
+ "WordToken",
"strip_font_subset",
"collapse_whitespace",
"round_bbox",
"build_page_structure",
+ "build_page_structure_from_words",
"flatten_document_text",
+ "flatten_document_words",
]
@@ -80,6 +83,7 @@ class StructureExtractionConfig:
round_precision: Optional[int] = 3
normalize_ligatures: bool = False
character_replacements: Optional[Dict[str, str]] = None
+ spatial_word_sorting: bool = False
def __hash__(self) -> int: # Allow usage as dictionary key for caching.
# Convert character_replacements dict to a hashable tuple of sorted items
@@ -98,6 +102,7 @@ def __hash__(self) -> int: # Allow usage as dictionary key for caching.
self.round_precision,
self.normalize_ligatures,
replacements_hash,
+ self.spatial_word_sorting,
)
)
@@ -114,6 +119,15 @@ def page_count(self) -> int:
return len(self.pages)
+@dataclass(frozen=True)
+class WordToken:
+ """A single word token extracted from a document, with provenance metadata.
+
+ ``flatten_document_words`` creates one token per whitespace-separated word.
+ """
+ # The word string (ligature-normalized when that option is requested).
+ text: str
+ # ``page_number`` of the page the word was read from.
+ source_page: int
+ # Running line counter across the whole document; empty lines are counted too.
+ source_line_index: int
+ # Running word counter across the whole document, starting at 0.
+ word_index: int
+
+
def flatten_document_text(structure: DocumentStructure) -> List[str]:
"""Extract all text lines from a document in reading order, ignoring page boundaries.
@@ -137,6 +151,77 @@ def flatten_document_text(structure: DocumentStructure) -> List[str]:
return texts
+def flatten_document_words(
+    structure: DocumentStructure,
+    *,
+    normalize_word_boundaries: bool = False,
+    normalize_ligatures_in_words: bool = False,
+) -> Tuple[List[str], List[WordToken]]:
+    """Extract all words from a document in reading order, ignoring page/line boundaries.
+
+    Every text line is split on whitespace into individual word tokens, so a
+    comparison built on the result does not depend on where lines or pages
+    break.
+
+    Args:
+        structure: A DocumentStructure containing pages with text blocks and lines.
+        normalize_word_boundaries: When True, the word and token lists are
+            passed through ``merge_split_words`` to merge tokens that were
+            split across line boundaries by connector characters
+            (``/``, ``-``, ``\\``).
+        normalize_ligatures_in_words: When True, each word is passed through
+            ``normalize_ligatures`` before it is stored.
+
+    Returns:
+        A tuple of:
+        - words: Flat list of word strings for use with SequenceMatcher.
+        - tokens: Corresponding list of WordToken objects with provenance.
+    """
+    # Resolve the optional normalizer once so each word is normalized a
+    # single time while it is collected, instead of rebuilding both lists
+    # in a second pass.
+    if normalize_ligatures_in_words:
+        from DocTest.TextNormalization import normalize_ligatures
+    else:
+        normalize_ligatures = None
+
+    words: List[str] = []
+    tokens: List[WordToken] = []
+    global_line_index = 0
+    word_index = 0
+
+    for page in structure.pages:
+        for block in page.blocks:
+            for line in block.lines:
+                # Empty lines yield no words but still consume a line index.
+                if line.text:
+                    for raw_word in line.text.split():
+                        word = normalize_ligatures(raw_word) if normalize_ligatures else raw_word
+                        words.append(word)
+                        tokens.append(
+                            WordToken(
+                                text=word,
+                                source_page=page.page_number,
+                                source_line_index=global_line_index,
+                                word_index=word_index,
+                            )
+                        )
+                        word_index += 1
+                global_line_index += 1
+
+    # Merge words split across line boundaries
+    if normalize_word_boundaries:
+        from DocTest.TextNormalization import merge_split_words
+        words, tokens = merge_split_words(words, tokens)
+
+    return words, tokens
+
+
def strip_font_subset(font_name: Optional[str]) -> Optional[str]:
"""Drop random subset prefixes inserted by PDF generators."""
@@ -282,3 +367,151 @@ def build_page_structure(
height=height,
blocks=blocks,
)
+
+
+def build_page_structure_from_words(
+ page_number: int,
+ pdf_text_words: Optional[List],
+ config: Optional[StructureExtractionConfig] = None,
+ *,
+ page_width: float = 0.0,
+ page_height: float = 0.0,
+ dpi: Optional[int] = None,
+ image_shape: Optional[Tuple[int, int, int]] = None,
+) -> PageStructure:
+ """Build a ``PageStructure`` from PyMuPDF ``get_text("words")`` output.
+
+ This bypasses block-level extraction entirely, grouping individual word
+ bounding boxes into lines using adaptive Y-proximity, so the result does
+ not depend on how the PDF generator split the text into blocks.
+
+ Each word tuple from PyMuPDF has the form::
+
+ (x0, y0, x1, y1, "word", block_no, line_no, word_no)
+
+ Only the first five elements are read here. Words are grouped into lines
+ when their vertical midpoint lies within half the smaller of the word's
+ height and the line's minimum word height from the line's running mean
+ midpoint. Within each line, words are ordered left-to-right by ``x0``.
+ Lines are ordered top-to-bottom by mean midpoint.
+ Each line becomes its own ``TextBlock`` (single-line blocks).
+
+ Args:
+ page_number: Page index stored on the returned ``PageStructure``
+ (presumably zero-based - confirm against callers).
+ pdf_text_words: List of word tuples from ``page.get_text("words")``.
+ config: Normalization settings (whitespace, ligatures, etc.); a default
+ ``StructureExtractionConfig`` is used when omitted.
+ page_width: Page width in points.
+ page_height: Page height in points.
+ dpi: Optional DPI for computing page dimensions from ``image_shape``.
+ image_shape: ``(height, width, channels)`` array shape, used with *dpi*
+ to derive page dimensions when ``page_width``/``page_height`` are zero.
+ If either dimension is zero, both are recomputed from the image.
+
+ Returns:
+ A ``PageStructure`` with one block per reconstructed text line; the
+ block list is empty when ``pdf_text_words`` is empty or ``None``.
+ """
+ config = config or StructureExtractionConfig()
+
+ width = page_width
+ height = page_height
+ # Fall back to pixel size converted to points (72 per inch) when a
+ # dimension is missing.
+ if (width == 0.0 or height == 0.0) and image_shape and dpi:
+ px_height, px_width = image_shape[:2]
+ width = px_width * 72.0 / dpi
+ height = px_height * 72.0 / dpi
+
+ if not pdf_text_words:
+ return PageStructure(
+ page_number=page_number,
+ width=width,
+ height=height,
+ blocks=[],
+ )
+
+ # --- Group words into visual lines by Y-proximity ---
+ # Sort by vertical midpoint first, then horizontal position.
+ sorted_words = sorted(pdf_text_words, key=lambda w: ((w[1] + w[3]) / 2.0, w[0]))
+
+ # The three lists below are parallel: index i describes the same line.
+ lines: List[List] = [] # Each element: list of word tuples
+ line_y_mid: List[float] = [] # Representative Y midpoint per line
+ line_min_height: List[float] = [] # Cached minimum word height per line
+
+ for word in sorted_words:
+ w_y0, w_y1 = float(word[1]), float(word[3])
+ w_mid = (w_y0 + w_y1) / 2.0
+ # Floor the height at 1.0 so degenerate boxes still get a usable tolerance.
+ w_height = max(w_y1 - w_y0, 1.0)
+
+ # Search backward from most recent line (words are Y-sorted, so the
+ # most recent line is the most likely match). Break early once we
+ # move past the tolerance range.
+ merged = False
+ # Upper bound of the per-line tolerance computed below (min(...) * 0.5).
+ max_possible_tolerance = w_height * 0.5
+ for idx in range(len(lines) - 1, -1, -1):
+ ly_mid = line_y_mid[idx]
+ delta = abs(w_mid - ly_mid)
+ if delta > max_possible_tolerance and w_mid > ly_mid:
+ break # Past tolerance; earlier lines are even further away.
+ tolerance = min(line_min_height[idx], w_height) * 0.5
+ if delta <= tolerance:
+ lines[idx].append(word)
+ n = len(lines[idx])
+ # Incremental update of the line's mean midpoint (n counts the new word).
+ line_y_mid[idx] = ly_mid + (w_mid - ly_mid) / n
+ if w_height < line_min_height[idx]:
+ line_min_height[idx] = w_height
+ merged = True
+ break
+ # No existing line was close enough: the word starts a new line.
+ if not merged:
+ lines.append([word])
+ line_y_mid.append(w_mid)
+ line_min_height.append(w_height)
+
+ # Sort lines top-to-bottom by midpoint, words left-to-right within each.
+ indexed_lines = sorted(enumerate(lines), key=lambda pair: line_y_mid[pair[0]])
+
+ blocks: List[TextBlock] = []
+ global_line_index = 0
+ block_index = 0
+
+ # The original list position was only needed as the sort key above.
+ for _orig_idx, line_words in indexed_lines:
+ line_words_sorted = sorted(line_words, key=lambda w: float(w[0]))
+
+ # Build text from words, applying normalization.
+ text_parts: List[str] = []
+ for w in line_words_sorted:
+ raw = str(w[4])
+ normalized = _sanitize_span_text(raw, config)
+ if normalized:
+ text_parts.append(normalized)
+
+ line_text = config.whitespace_replacement.join(text_parts) if text_parts else ""
+ if config.strip_line_edges:
+ line_text = line_text.strip()
+ # Skipped lines do not consume a line or block index.
+ if config.drop_empty_lines and not line_text:
+ continue
+
+ # Compute line bbox as union of all word bboxes.
+ x0 = min(float(w[0]) for w in line_words_sorted)
+ y0 = min(float(w[1]) for w in line_words_sorted)
+ x1 = max(float(w[2]) for w in line_words_sorted)
+ y1 = max(float(w[3]) for w in line_words_sorted)
+ bbox = round_bbox((x0, y0, x1, y1), config.round_precision)
+
+ # No font or size is read from the word tuples, so the line gets an
+ # empty font set and one placeholder span (font None, size 0.0).
+ text_line = TextLine(
+ index=global_line_index,
+ text=line_text,
+ bbox=bbox,
+ fonts=set(),
+ spans=[TextSpan(text=line_text, font=None, size=0.0)],
+ )
+ blocks.append(
+ TextBlock(
+ index=block_index,
+ bbox=bbox,
+ lines=[text_line],
+ )
+ )
+ global_line_index += 1
+ block_index += 1
+
+ return PageStructure(
+ page_number=page_number,
+ width=width,
+ height=height,
+ blocks=blocks,
+ )
diff --git a/DocTest/PdfTest.py b/DocTest/PdfTest.py
index d617fd7..8f07624 100644
--- a/DocTest/PdfTest.py
+++ b/DocTest/PdfTest.py
@@ -15,7 +15,9 @@
StructureTolerance,
compare_document_structures,
compare_document_text_only,
+ compare_document_words,
)
+from DocTest.HeaderFooterDetector import HeaderFooterConfig, filter_headers_footers
from DocTest.PdfStructureModels import (
DocumentStructure,
PageStructure,
@@ -226,6 +228,10 @@ def compare_pdf_documents(self, reference_document, candidate_document, **kwargs
mask_value = kwargs.pop('mask', None)
text_mask_patterns_arg = kwargs.pop('text_mask_patterns', None)
ignore_ligatures = _as_bool(kwargs.pop('ignore_ligatures', False))
+ normalize_word_boundaries = _as_bool(kwargs.pop('normalize_word_boundaries', False), False)
+ compare_order = kwargs.pop('compare_order', 'ordered')
+ if compare_order not in ('ordered', 'unordered'):
+ compare_order = 'ordered'
check_pdf_text = _as_bool(kwargs.pop('check_pdf_text', False))
# Parse character_replacements from kwargs or use instance default
@@ -267,9 +273,19 @@ def compare_pdf_documents(self, reference_document, candidate_document, **kwargs
# New parameters for controlling structure comparison behavior
ignore_page_boundaries = _as_bool(kwargs.pop('ignore_page_boundaries', False), False)
+ compare_word_level = _as_bool(kwargs.pop('compare_word_level', True), True)
check_geometry = _as_bool(kwargs.pop('check_geometry', True), True)
check_block_count = _as_bool(kwargs.pop('check_block_count', True), True)
+ header_scan_height = _as_float(kwargs.pop('header_scan_height', 0), 0)
+ footer_scan_height = _as_float(kwargs.pop('footer_scan_height', 0), 0)
+ header_repeat_threshold = int(_as_float(kwargs.pop('header_repeat_threshold', 2), 2))
+ header_footer_config = HeaderFooterConfig(
+ header_scan_height=header_scan_height,
+ footer_scan_height=footer_scan_height,
+ repeat_threshold=header_repeat_threshold,
+ )
+
# When ignoring page boundaries, disable geometry and block count checks
if ignore_page_boundaries:
check_geometry = False
@@ -456,29 +472,29 @@ def _record_diff(facet: str, description: str, diff_payload: Any):
candidate_representation=candidate_repr,
text_mask_patterns=compiled_text_patterns,
ignore_page_boundaries=ignore_page_boundaries,
+ compare_word_level=compare_word_level,
check_geometry=check_geometry,
check_block_count=check_block_count,
+ header_footer_config=header_footer_config,
+ normalize_word_boundaries=normalize_word_boundaries,
+ compare_order=compare_order,
)
if not structure_result.passed:
differences_detected = True
summary = getattr(structure_result, "summary", None)
page_diffs = getattr(structure_result, "page_differences", None)
doc_diffs = getattr(structure_result, "document_differences", None)
- details_parts: List[str] = []
- if summary:
- details_parts.extend(str(item) for item in summary)
- if page_diffs:
- for page, diffs in page_diffs.items():
- for diff in diffs:
- details_parts.append(f"Page {page}: {diff.message}")
- if doc_diffs:
- for diff in doc_diffs:
- details_parts.append(f"Document: {diff.message}")
+ try:
+ from DocTest.StructureReportBuilder import build_structure_report_plain_text
+ plain_report = build_structure_report_plain_text(structure_result)
+ detail_text = plain_report if plain_report else "Structure comparison differences detected."
+ except Exception:
+ detail_text = "Structure comparison differences detected."
llm_differences.append(
{
"facet": "structure",
"description": "PDF structural comparison failed.",
- "details": "\n".join(details_parts) if details_parts else "Structure comparison differences detected.",
+ "details": detail_text,
}
)
@@ -557,8 +573,11 @@ def compare_pdf_structure(self, reference_document, candidate_document, **kwargs
- ``text_mask_patterns``: regex or list of regex strings to skip lines during comparison.
- ``ignore_ligatures`` (bool, default ``False``): normalise common ligatures (``fi`` → ``fi``) prior to comparison.
- ``ignore_page_boundaries`` (bool, default ``False``): ignore page breaks and compare text content in reading order across the entire document. When enabled, geometry and block structure are not checked. Useful when font/size changes cause text to reflow across pages.
+ - ``normalize_word_boundaries`` (bool, default ``False``): merge words split across line boundaries by connector characters (``/``, ``-``, ``\\``). Recommended when using ``ignore_page_boundaries``.
+ - ``compare_order`` (str, default ``"ordered"``): comparison strategy for word-level comparison. ``"ordered"`` uses sequence-sensitive matching; ``"unordered"`` uses bag-of-words frequency comparison that ignores word order, useful when text reflows across pages.
- ``check_geometry`` (bool, default ``True``): when ``False``, skip line position/size comparison. Useful for comparing content when layout may differ. Automatically set to ``False`` when ``ignore_page_boundaries`` is ``True``.
- ``check_block_count`` (bool, default ``True``): when ``False``, skip block count validation per page. Automatically set to ``False`` when ``ignore_page_boundaries`` is ``True``.
+ - ``spatial_word_sorting`` (bool, default ``False``): when ``True``, build page structure from individual word bounding boxes instead of text blocks. This bypasses block fragmentation differences caused by different PDF generators and produces consistent word ordering. Recommended when ``ignore_page_boundaries`` is ``True``.
Examples:
| `Compare Pdf Structure` reference.pdf candidate.pdf
@@ -566,6 +585,7 @@ def compare_pdf_structure(self, reference_document, candidate_document, **kwargs
| `Compare Pdf Structure` reference.pdf candidate.pdf mask=${CURDIR}${/}mask.json text_mask_patterns=\\d{4}-\\d{4} ignore_ligatures=${True}
| `Compare Pdf Structure` reference.pdf candidate.pdf ignore_page_boundaries=${True}
| `Compare Pdf Structure` reference.pdf candidate.pdf check_geometry=${False} check_block_count=${False}
+ | `Compare Pdf Structure` reference.pdf candidate.pdf ignore_page_boundaries=${True} spatial_word_sorting=${True}
| `Run Keyword And Expect Error` The compared PDF structure is different. Compare Pdf Structure reference.pdf candidate_with_changed_text.pdf
"""
@@ -589,6 +609,10 @@ def compare_pdf_structure(self, reference_document, candidate_document, **kwargs
mask_value = kwargs.get('mask')
text_mask_patterns_arg = kwargs.get('text_mask_patterns')
ignore_ligatures = _as_bool(kwargs.get('ignore_ligatures', False), False)
+ normalize_word_boundaries = _as_bool(kwargs.get('normalize_word_boundaries', False), False)
+ compare_order = kwargs.get('compare_order', 'ordered')
+ if compare_order not in ('ordered', 'unordered'):
+ compare_order = 'ordered'
# Parse character_replacements from kwargs or use instance default
char_replacements_arg = kwargs.get('character_replacements')
@@ -598,8 +622,19 @@ def compare_pdf_structure(self, reference_document, candidate_document, **kwargs
# New parameters for controlling comparison behavior
ignore_page_boundaries = _as_bool(kwargs.get('ignore_page_boundaries', False), False)
+ compare_word_level = _as_bool(kwargs.get('compare_word_level', True), True)
check_geometry = _as_bool(kwargs.get('check_geometry', True), True)
check_block_count = _as_bool(kwargs.get('check_block_count', True), True)
+ spatial_word_sorting = _as_bool(kwargs.get('spatial_word_sorting', False), False)
+
+ header_scan_height = _as_float(kwargs.get('header_scan_height', 0), 0)
+ footer_scan_height = _as_float(kwargs.get('footer_scan_height', 0), 0)
+ header_repeat_threshold = int(_as_float(kwargs.get('header_repeat_threshold', 2), 2))
+ header_footer_config = HeaderFooterConfig(
+ header_scan_height=header_scan_height,
+ footer_scan_height=footer_scan_height,
+ repeat_threshold=header_repeat_threshold,
+ )
# When ignoring page boundaries, disable geometry and block count checks
if ignore_page_boundaries:
@@ -615,6 +650,7 @@ def compare_pdf_structure(self, reference_document, candidate_document, **kwargs
round_precision=round_precision,
normalize_ligatures=ignore_ligatures,
character_replacements=char_replacements,
+ spatial_word_sorting=spatial_word_sorting,
)
tolerance = StructureTolerance(
position=position_tolerance,
@@ -655,8 +691,12 @@ def compare_pdf_structure(self, reference_document, candidate_document, **kwargs
candidate_representation=candidate_repr,
text_mask_patterns=compiled_text_patterns,
ignore_page_boundaries=ignore_page_boundaries,
+ compare_word_level=compare_word_level,
check_geometry=check_geometry,
check_block_count=check_block_count,
+ header_footer_config=header_footer_config,
+ normalize_word_boundaries=normalize_word_boundaries,
+ compare_order=compare_order,
)
finally:
reference_repr.close()
@@ -969,8 +1009,12 @@ def _perform_structure_comparison(
candidate_representation: Optional[DocumentRepresentation] = None,
text_mask_patterns: Optional[List[Pattern[str]]] = None,
ignore_page_boundaries: bool = False,
+ compare_word_level: bool = True,
check_geometry: bool = True,
check_block_count: bool = True,
+ header_footer_config: Optional["HeaderFooterConfig"] = None,
+ normalize_word_boundaries: bool = False,
+ compare_order: str = "ordered",
):
release_reference = False
release_candidate = False
@@ -985,17 +1029,31 @@ def _perform_structure_comparison(
reference_structure = reference_representation.get_pdf_structure(config=extraction_config)
candidate_structure = candidate_representation.get_pdf_structure(config=extraction_config)
+ # Repetition-based header/footer detection
+ if header_footer_config and header_footer_config.enabled:
+ reference_structure = filter_headers_footers(reference_structure, header_footer_config)
+ candidate_structure = filter_headers_footers(candidate_structure, header_footer_config)
+
if text_mask_patterns:
reference_structure = self._prune_structure_lines(reference_structure, text_mask_patterns)
candidate_structure = self._prune_structure_lines(candidate_structure, text_mask_patterns)
if ignore_page_boundaries:
- # Use text-only comparison that ignores page boundaries
- result = compare_document_text_only(
- reference=reference_structure,
- candidate=candidate_structure,
- case_sensitive=case_sensitive,
- )
+ if compare_word_level:
+ result = compare_document_words(
+ reference=reference_structure,
+ candidate=candidate_structure,
+ case_sensitive=case_sensitive,
+ normalize_ligatures=extraction_config.normalize_ligatures,
+ normalize_word_boundaries=normalize_word_boundaries,
+ compare_order=compare_order,
+ )
+ else:
+ result = compare_document_text_only(
+ reference=reference_structure,
+ candidate=candidate_structure,
+ case_sensitive=case_sensitive,
+ )
else:
# Use standard page-by-page comparison
result = compare_document_structures(
@@ -1006,7 +1064,45 @@ def _perform_structure_comparison(
check_geometry=check_geometry,
check_block_count=check_block_count,
)
- self._log_structure_result(result, ignore_page_boundaries=ignore_page_boundaries)
+ # Capture text lists for context display in the report
+ ref_texts = None
+ cand_texts = None
+ try:
+ from DocTest.PdfStructureModels import flatten_document_text
+ ref_texts = flatten_document_text(reference_structure)
+ cand_texts = flatten_document_text(candidate_structure)
+ except Exception:
+ pass
+
+ exclusions = []
+ if text_mask_patterns:
+ exclusions.extend(f"text_mask: {p.pattern}" for p in text_mask_patterns)
+ if header_footer_config and header_footer_config.enabled:
+ if header_footer_config.header_scan_height > 0:
+ exclusions.append(f"header_filter: {header_footer_config.header_scan_height}pt")
+ if header_footer_config.footer_scan_height > 0:
+ exclusions.append(f"footer_filter: {header_footer_config.footer_scan_height}pt")
+ # Only report disabled checks when explicitly set by the user,
+ # not when auto-disabled by ignore_page_boundaries
+ if not ignore_page_boundaries:
+ if not check_geometry:
+ exclusions.append("check_geometry: False")
+ if not check_block_count:
+ exclusions.append("check_block_count: False")
+ if normalize_word_boundaries:
+ exclusions.append("normalize_word_boundaries: True")
+ if compare_order == "unordered":
+ exclusions.append("compare_order: unordered")
+
+ self._log_structure_result(
+ result,
+ ignore_page_boundaries=ignore_page_boundaries,
+ reference_name=Path(reference_document).name,
+ candidate_name=Path(candidate_document).name,
+ reference_texts=ref_texts,
+ candidate_texts=cand_texts,
+ exclusions_applied=exclusions,
+ )
return result
finally:
if release_reference:
@@ -1060,12 +1156,23 @@ def _prune_structure_lines(
)
return DocumentStructure(pages=filtered_pages, config=structure.config)
- def _log_structure_result(self, result, ignore_page_boundaries: bool = False):
- """Log comparison results with single summary WARN and detail INFO messages.
+ def _log_structure_result(
+ self,
+ result,
+ *,
+ ignore_page_boundaries: bool = False,
+ reference_name: str = "",
+ candidate_name: str = "",
+ reference_texts: Optional[List[str]] = None,
+ candidate_texts: Optional[List[str]] = None,
+ exclusions_applied: Optional[List[str]] = None,
+ ):
+ """Log comparison results with single summary WARN, HTML report INFO, and detail DEBUG.
Robot Framework displays WARN messages at the top of log.html. To avoid
- cluttering that section, we emit a single summary warning and log all
- individual differences as INFO (visible only within keyword output).
+ cluttering that section, we emit a single summary warning. All differences
+ are rendered as a single consolidated HTML report at INFO level. Individual
+ per-difference output is preserved at DEBUG level for troubleshooting.
"""
if result.passed:
logger.info("[PDF Structure] Documents match within configured tolerances.")
@@ -1075,15 +1182,34 @@ def _log_structure_result(self, result, ignore_page_boundaries: bool = False):
diff_count = result.difference_count()
mode = "text-only (ignoring page boundaries)" if ignore_page_boundaries else "structure"
- # Single summary warning (appears at top of log.html)
+ # Single summary warning (appears at top of log.html)
logger.warn(f"[PDF Structure] Comparison failed: {diff_count} difference(s) found in {mode} comparison.")
- # Log summary entries as INFO
+ # --- Consolidated HTML report at INFO level ---
+ try:
+ from DocTest.StructureReportBuilder import ReportMetadata, build_structure_report
+ metadata = ReportMetadata(
+ reference_name=reference_name or "(unknown)",
+ candidate_name=candidate_name or "(unknown)",
+ comparison_mode=mode,
+ exclusions_applied=exclusions_applied or [],
+ )
+ html_report = build_structure_report(
+ result,
+ metadata=metadata,
+ reference_texts=reference_texts,
+ candidate_texts=candidate_texts,
+ )
+ if html_report:
+ logger.info(html_report, html=True)
+ except Exception:
+ pass # Degrade gracefully if report builder fails
+
+ # --- Per-difference output at DEBUG level ---
if result.summary:
for entry in result.summary:
- logger.info(f"[PDF Structure] {entry}")
+ logger.debug(f"[PDF Structure] {entry}")
- # Log page differences as INFO
if result.page_differences:
for page in sorted(result.page_differences.keys()):
for diff in result.page_differences[page]:
@@ -1095,12 +1221,11 @@ def _log_structure_result(self, result, ignore_page_boundaries: bool = False):
details.append(f"candidate line={diff.candidate_index}")
if details:
message = f"{message} ({', '.join(details)})"
- logger.info(message)
+ logger.debug(message)
if diff.deltas:
pretty = ", ".join(f"{axis}={value:.3f}" for axis, value in diff.deltas.items())
logger.debug(f"[PDF Structure] Page {page} deltas: {pretty}")
- # Log document-level differences as INFO (for text-only mode)
if result.document_differences:
for diff in result.document_differences:
message = f"[PDF Text] {diff.message}"
@@ -1111,7 +1236,20 @@ def _log_structure_result(self, result, ignore_page_boundaries: bool = False):
details.append(f"candidate position={diff.cand_index}")
if details:
message = f"{message} ({', '.join(details)})"
- logger.info(message)
+ logger.debug(message)
+
+ # Log word-level differences at DEBUG
+ if hasattr(result, 'word_differences') and result.word_differences:
+ for diff in result.word_differences:
+ message = f"[PDF Words] {diff.message}"
+ details = []
+ if diff.ref_start_index is not None:
+ details.append(f"ref positions {diff.ref_start_index}-{diff.ref_end_index}")
+ if diff.cand_start_index is not None:
+ details.append(f"cand positions {diff.cand_start_index}-{diff.cand_end_index}")
+ if details:
+ message = f"{message} ({', '.join(details)})"
+ logger.debug(message)
def _ensure_local_document(self, document):
return download_file_from_url(document) if is_url(document) else document
diff --git a/DocTest/StructureReportBuilder.py b/DocTest/StructureReportBuilder.py
new file mode 100644
index 0000000..6e892d2
--- /dev/null
+++ b/DocTest/StructureReportBuilder.py
@@ -0,0 +1,558 @@
+"""Consolidated HTML report builder for PDF structure comparison results.
+
+Transforms a StructureComparisonResult into a single HTML fragment suitable
+for rendering inside Robot Framework's log.html via logger.info(msg, html=True).
+"""
+
+from __future__ import annotations
+
+import html as html_module
+from dataclasses import dataclass, field
+from typing import Any, Dict, List, Optional, Sequence, Tuple, Union
+
+from DocTest.PdfStructureComparator import (
+ DocumentTextDifference,
+ DocumentWordDifference,
+ LineDifference,
+ StructureComparisonResult,
+)
+
+__all__ = [
+ "build_structure_report",
+ "build_structure_report_plain_text",
+ "ReportMetadata",
+]
+
+# Default number of source entries shown before and after each hunk
+# (the default for the ``context_lines`` argument of the report builders).
+DEFAULT_CONTEXT_LINES = 3
+# Default cut-off length used by _truncate() for displayed text.
+MAX_TEXT_DISPLAY_LENGTH = 120
+# Threshold on the number of hunks rendered before the remainder is collapsed
+# into a "more hunk(s)" notice.
+MAX_HUNKS_BEFORE_COLLAPSE = 20
+
+
+@dataclass
+class ReportMetadata:
+    """Metadata displayed in the report header.
+
+    Every field has a default, so callers set only what they know.
+    """
+    # Shown on the "Reference:" line of the report header.
+    reference_name: str = ""
+    # Shown on the "Candidate:" line of the report header.
+    candidate_name: str = ""
+    # Shown on the "Mode:" line of the report header.
+    comparison_mode: str = ""
+    # Page counts; the HTML builder adds a "Pages:" entry when either is not None.
+    page_count_ref: Optional[int] = None
+    page_count_cand: Optional[int] = None
+    # Descriptions of applied filters/options, joined with ", " on the "Exclusions:" line.
+    exclusions_applied: List[str] = field(default_factory=list)
+
+
+@dataclass
+class ReportSummary:
+    """Aggregate statistics for the comparison.
+
+    Populated by ``_compute_summary``; each ``*_count`` field counts the
+    differences whose category (see ``_classify_diff_type``) matches its name.
+    """
+    # Sum of the five per-category counters below.
+    total_differences: int = 0
+    missing_count: int = 0
+    extra_count: int = 0
+    mismatch_count: int = 0
+    geometry_count: int = 0
+    other_count: int = 0
+    # Not filled in by _compute_summary; build_structure_report assigns it after rendering.
+    hunk_count: int = 0
+
+
+def _escape(text: str) -> str:
+    """Return ``str(text)`` with HTML special characters, including quotes, escaped."""
+    as_text = str(text)
+    return html_module.escape(as_text, quote=True)
+
+
+def _truncate(text: str, max_length: int = MAX_TEXT_DISPLAY_LENGTH) -> str:
+    """Shorten *text* so that it is at most *max_length* characters long.
+
+    Text that already fits is returned unchanged. Longer text is cut and
+    suffixed with ``"..."`` so the result is exactly *max_length* characters.
+
+    Args:
+        text: The string to shorten.
+        max_length: Maximum length of the returned string.
+
+    Returns:
+        The original or shortened string.
+    """
+    if len(text) <= max_length:
+        return text
+    if max_length < 3:
+        # No room for the "..." marker. Slicing with ``max_length - 3`` would
+        # use a negative index and return a string longer than max_length,
+        # so fall back to a hard cut.
+        return text[: max(max_length, 0)]
+    return text[: max_length - 3] + "..."
+
+
+def _classify_diff_type(diff_type: str) -> str:
+    """Return the report category for a raw ``diff_type`` value.
+
+    Known values map to ``"missing"``, ``"extra"``, ``"mismatch"`` or
+    ``"geometry"``; anything else is reported as ``"other"``.
+    """
+    category_members = (
+        ("missing", ("missing_line", "missing_text", "missing_page", "missing_words")),
+        ("extra", ("extra_line", "extra_text", "extra_page", "extra_words")),
+        ("mismatch", ("text_mismatch", "word_mismatch")),
+        ("geometry", ("geometry_mismatch",)),
+    )
+    for category, members in category_members:
+        if diff_type in members:
+            return category
+    return "other"
+
+
+def _get_diff_display(diff: Any) -> Tuple[str, str, Optional[str], Optional[str]]:
+    """Return ``(category, message, ref_text, cand_text)`` for a difference.
+
+    Line and document-text differences expose their texts directly. Word
+    differences have their word lists joined with single spaces, and an empty
+    or missing list yields ``None``. Any other object yields ``None`` for
+    both texts.
+    """
+    category = _classify_diff_type(diff.diff_type)
+    message = diff.message
+
+    if isinstance(diff, (LineDifference, DocumentTextDifference)):
+        return category, message, diff.ref_text, diff.cand_text
+
+    if isinstance(diff, DocumentWordDifference):
+        joined_ref = " ".join(diff.ref_words) if diff.ref_words else None
+        joined_cand = " ".join(diff.cand_words) if diff.cand_words else None
+        return category, message, joined_ref, joined_cand
+
+    return category, message, None, None
+
+
+# Presentation table keyed by the category names returned by
+# _classify_diff_type(). Renderers unpack each value as (bg, fg, symbol).
+_CATEGORY_STYLES = {
+    # (background, text_color, symbol) — chosen for WCAG AA contrast
+    "missing": ("#f8d7da", "#721c24", "-"),
+    "extra": ("#d4edda", "#155724", "+"),
+    "mismatch": ("#fff3cd", "#856404", "~"),
+    "geometry": ("#e2e3e5", "#383d41", "\u0394"),  # delta symbol
+    "other": ("#e2e3e5", "#383d41", "!"),
+}
+
+
+def _compute_summary(result: StructureComparisonResult) -> ReportSummary:
+    """Count the differences in *result* per category.
+
+    Page-level differences are tallied into all five categories. Document-
+    and word-level differences are tallied into missing/extra/mismatch only,
+    with every other category counted as "other". ``hunk_count`` is left at
+    its default.
+    """
+    summary = ReportSummary()
+
+    def _tally(diffs, counted_categories):
+        # Bump ``<category>_count`` for each diff, falling back to other_count.
+        for entry in diffs:
+            category = _classify_diff_type(entry.diff_type)
+            if category not in counted_categories:
+                category = "other"
+            attr = f"{category}_count"
+            setattr(summary, attr, getattr(summary, attr) + 1)
+
+    text_categories = ("missing", "extra", "mismatch")
+
+    for page_diffs in result.page_differences.values():
+        _tally(page_diffs, text_categories + ("geometry",))
+
+    _tally(result.document_differences, text_categories)
+
+    if hasattr(result, 'word_differences'):
+        _tally(result.word_differences, text_categories)
+
+    summary.total_differences = sum((
+        summary.missing_count,
+        summary.extra_count,
+        summary.mismatch_count,
+        summary.geometry_count,
+        summary.other_count,
+    ))
+    return summary
+
+
+def _render_diff_html(diff: Any) -> str:
+    """Render a single difference as an HTML div with color coding.
+
+    The category decides what is shown: mismatches show both the reference
+    and candidate text, missing entries show the reference text, extra
+    entries show the candidate text, and geometry entries show the text plus
+    any deltas. Everything else falls back to the difference message. All
+    displayed text is truncated and HTML-escaped.
+
+    Returns:
+        The concatenated HTML fragment for this difference.
+    """
+    category, message, ref_text, cand_text = _get_diff_display(diff)
+    # Unknown categories fall back to the neutral style with a "?" symbol.
+    bg, fg, symbol = _CATEGORY_STYLES.get(category, ("#e2e3e5", "#383d41", "?"))
+
+    parts = []
+    parts.append(f'
')
+
+    if category == "mismatch" and ref_text and cand_text:
+        parts.append(f'{_escape(symbol)} ref: "{_escape(_truncate(ref_text))}"')
+        parts.append(f'
cand: "{_escape(_truncate(cand_text))}"')
+    elif category == "missing" and ref_text:
+        parts.append(f'{_escape(symbol)} "{_escape(_truncate(ref_text))}"')
+    elif category == "extra" and cand_text:
+        parts.append(f'{_escape(symbol)} "{_escape(_truncate(cand_text))}"')
+    elif category == "geometry":
+        deltas_str = ""
+        if hasattr(diff, 'deltas') and diff.deltas:
+            deltas_str = " (" + ", ".join(f"{k}={v:.3f}" for k, v in diff.deltas.items()) + ")"
+        # Prefer the reference text, then the candidate text, else show nothing.
+        text_display = ref_text or cand_text or ""
+        parts.append(f'{_escape(symbol)} "{_escape(_truncate(text_display))}"{_escape(deltas_str)}')
+    else:
+        # Also reached when a mismatch/missing/extra diff lacks the text it needs.
+        parts.append(f'{_escape(symbol)} {_escape(_truncate(message))}')
+
+    parts.append('
')
+    return "".join(parts)
+
+
+def _render_diff_plain(diff: Any) -> str:
+    """Return the plain-text rendering of one difference.
+
+    Mismatches produce two lines (reference and candidate text). Missing and
+    extra entries produce the quoted reference or candidate text. Every other
+    case, including geometry, shows the truncated difference message.
+    """
+    category, message, ref_text, cand_text = _get_diff_display(diff)
+    symbol = _CATEGORY_STYLES.get(category, ("", "", "?"))[2]
+
+    if category == "mismatch" and ref_text and cand_text:
+        shown_ref = _truncate(ref_text)
+        shown_cand = _truncate(cand_text)
+        return f' {symbol} ref: "{shown_ref}"\n cand: "{shown_cand}"'
+
+    if category == "missing" and ref_text:
+        shown = _truncate(ref_text)
+        return f' {symbol} "{shown}"'
+
+    if category == "extra" and cand_text:
+        shown = _truncate(cand_text)
+        return f' {symbol} "{shown}"'
+
+    shown_message = _truncate(message)
+    return f' {symbol} {shown_message}'
+
+
+def _collect_all_diffs(result: StructureComparisonResult) -> List[Tuple[Any, str]]:
+    """Return every difference in *result* paired with a location label.
+
+    Order: page differences by ascending page number, then document-level
+    differences, then word-level differences if the result has that attribute.
+    """
+
+    def _label(unit: str, primary, fallback) -> str:
+        # Prefer the reference-side index; "document" when neither side has one.
+        position = primary if primary is not None else fallback
+        return "document" if position is None else f"{unit} {position}"
+
+    rows: List[Tuple[Any, str]] = []
+
+    for page_num in sorted(result.page_differences):
+        for entry in result.page_differences[page_num]:
+            where = f"Page {page_num}"
+            if isinstance(entry, LineDifference):
+                line_no = entry.reference_index
+                if line_no is None:
+                    line_no = entry.candidate_index
+                if line_no is not None:
+                    where = f"{where}, line {line_no}"
+            rows.append((entry, where))
+
+    for entry in result.document_differences:
+        rows.append((entry, _label("line", entry.ref_index, entry.cand_index)))
+
+    if hasattr(result, 'word_differences'):
+        for entry in result.word_differences:
+            rows.append((entry, _label("word", entry.ref_start_index, entry.cand_start_index)))
+
+    return rows
+
+
+def _get_diff_index(diff: Any) -> int:
+    """Return the position used to order and group *diff*.
+
+    The reference-side index wins over the candidate-side one. When neither
+    is set, or the object is of an unknown type, ``999999`` is returned.
+    """
+    if isinstance(diff, LineDifference):
+        position = diff.reference_index
+        if position is None:
+            position = diff.candidate_index
+    elif isinstance(diff, DocumentTextDifference):
+        position = diff.ref_index
+        if position is None:
+            position = diff.cand_index
+    elif isinstance(diff, DocumentWordDifference):
+        position = diff.ref_start_index
+        if position is None:
+            position = diff.cand_start_index
+    else:
+        position = None
+
+    return 999999 if position is None else position
+
+
+def _group_into_hunks(
+    differences: Sequence[Any],
+    context_lines: int,
+    source_texts: Optional[List[str]] = None,
+) -> List[dict]:
+    """Group nearby differences into hunks and attach surrounding context.
+
+    Differences are ordered by ``_get_diff_index``. A difference joins the
+    current hunk when its index is at most ``2 * context_lines + 1`` past the
+    hunk's last index; otherwise a new hunk starts.
+
+    Args:
+        differences: Difference objects to group.
+        context_lines: Number of ``source_texts`` entries to include on
+            either side of a hunk.
+        source_texts: Optional texts indexed by the same positions as the
+            differences; when falsy, context lists are empty.
+
+    Returns:
+        List of dicts with keys ``start_index``, ``end_index``,
+        ``differences``, ``context_before`` and ``context_after``.
+    """
+    if not differences:
+        return []
+
+    def _close_hunk(first: int, last: int, members: list) -> dict:
+        # Build the hunk record, slicing context out of source_texts if given.
+        before: list = []
+        after: list = []
+        if source_texts:
+            lower = max(0, first - context_lines)
+            before = source_texts[lower:first]
+            upper = min(len(source_texts), last + context_lines + 1)
+            after = source_texts[last + 1:upper]
+        return {
+            "start_index": first,
+            "end_index": last,
+            "differences": members,
+            "context_before": before,
+            "context_after": after,
+        }
+
+    ordered = iter(sorted(differences, key=_get_diff_index))
+    max_gap = 2 * context_lines + 1
+
+    head = next(ordered)
+    first = last = _get_diff_index(head)
+    members = [head]
+    hunks = []
+
+    for diff in ordered:
+        position = _get_diff_index(diff)
+        if position - last > max_gap:
+            # Too far from the running hunk: close it and start a new one.
+            hunks.append(_close_hunk(first, last, members))
+            members = [diff]
+            first = last = position
+            continue
+        members.append(diff)
+        last = max(last, position)
+
+    hunks.append(_close_hunk(first, last, members))
+    return hunks
+
+
+def build_structure_report(
+    result: StructureComparisonResult,
+    *,
+    metadata: Optional[ReportMetadata] = None,
+    context_lines: int = DEFAULT_CONTEXT_LINES,
+    reference_texts: Optional[List[str]] = None,
+    candidate_texts: Optional[List[str]] = None,
+) -> str:
+    """Build a consolidated HTML report from a structure comparison result.
+
+    The report consists of a title, an optional metadata header, a summary of
+    difference counts per category, an overview table with one row per
+    difference, hunk-level detail sections (page, document text-only and
+    word-level) and the result's summary entries.
+
+    Args:
+        result: The comparison result to render.
+        metadata: Optional header information (names, mode, page counts,
+            exclusions).
+        context_lines: Number of context entries passed to
+            ``_group_into_hunks`` for every section.
+        reference_texts: Optional texts passed to ``_group_into_hunks`` as
+            the context source for every section.
+        candidate_texts: Accepted but not referenced in this function.
+
+    Returns an HTML string suitable for logger.info(msg, html=True).
+    Returns empty string if result.passed is True.
+    """
+    if result.passed:
+        return ""
+
+    summary = _compute_summary(result)
+    parts = []
+
+    # Outer container — explicit bg+color so report is self-contained in both light/dark mode
+    parts.append('')
+
+    # Title
+    parts.append('
PDF Structure Comparison Report
')
+
+    # Metadata
+    if metadata:
+        parts.append('
')
+        parts.append(f'
Reference: {_escape(metadata.reference_name)}
')
+        parts.append(f'
Candidate: {_escape(metadata.candidate_name)}
')
+        mode_str = _escape(metadata.comparison_mode)
+        page_str = ""
+        # NOTE(review): ``or "?"`` below also replaces a page count of 0 with "?".
+        if metadata.page_count_ref is not None or metadata.page_count_cand is not None:
+            page_str = f' |
Pages: {metadata.page_count_ref or "?"} ref / {metadata.page_count_cand or "?"} cand'
+        parts.append(f'
Mode: {mode_str}{page_str}
')
+        if metadata.exclusions_applied:
+            exc_str = ", ".join(_escape(e) for e in metadata.exclusions_applied)
+            parts.append(f'
Exclusions: {exc_str}
')
+        parts.append('
')
+
+    # Summary
+    parts.append('
')
+    parts.append(f'
{summary.total_differences} difference(s)
')
+    parts.append('
')
+    # Only categories with a non-zero count are listed.
+    if summary.missing_count:
+        parts.append(f'{summary.missing_count} missing')
+    if summary.extra_count:
+        parts.append(f'{summary.extra_count} extra')
+    if summary.mismatch_count:
+        parts.append(f'{summary.mismatch_count} mismatch')
+    if summary.geometry_count:
+        parts.append(f'{summary.geometry_count} geometry')
+    if summary.other_count:
+        parts.append(f'{summary.other_count} other')
+    parts.append('
')
+
+    # Differences overview table
+    all_diffs_for_table = _collect_all_diffs(result)
+    if all_diffs_for_table:
+        parts.append('
')
+        parts.append('
')
+        parts.append(''
+                     '| # | '
+                     'Type | '
+                     'Reference | '
+                     'Candidate | '
+                     'Location |
')
+        # Rows are numbered from 1; cell text is cut to 60 characters.
+        for row_idx, (diff, location) in enumerate(all_diffs_for_table, 1):
+            category, _, ref_text, cand_text = _get_diff_display(diff)
+            bg, fg, symbol = _CATEGORY_STYLES.get(category, ("#e2e3e5", "#383d41", "?"))
+            ref_cell = _escape(_truncate(ref_text, 60)) if ref_text else "—"
+            cand_cell = _escape(_truncate(cand_text, 60)) if cand_text else "—"
+            parts.append(
+                f''
+                f'| {row_idx} | '
+                f'{_escape(symbol)} {_escape(category)} | '
+                f'{ref_cell} | '
+                f'{cand_cell} | '
+                f'{_escape(location)} |
')
+        parts.append('
')
+
+    # Content sections (hunk detail)
+    parts.append('
')
+
+    # Running hunk total across all sections below.
+    total_hunks = 0
+
+    # Page-level differences
+    # NOTE(review): the three sections below collapse differently. Here the
+    # cumulative total is checked and, once it exceeds the limit, only the
+    # first hunk of a page is rendered, while the notice still says
+    # "showing first 20" — confirm this is intended.
+    if result.page_differences:
+        for page_num in sorted(result.page_differences.keys()):
+            diffs = result.page_differences[page_num]
+            hunks = _group_into_hunks(diffs, context_lines, reference_texts)
+            total_hunks += len(hunks)
+            parts.append(f'
'
+                         f'Page {page_num} — {len(hunks)} hunk(s), {len(diffs)} difference(s)
')
+            for i, hunk in enumerate(hunks):
+                if total_hunks > MAX_HUNKS_BEFORE_COLLAPSE and i > 0:
+                    parts.append(f'
... and more hunks (showing first {MAX_HUNKS_BEFORE_COLLAPSE})
')
+                    break
+                _render_hunk_to_parts(parts, hunk, i + 1, index_label="line")
+
+    # Document-level differences
+    # NOTE(review): when the cumulative total exceeds the limit, this loop
+    # breaks at i == 0, so no document-level hunk is rendered at all.
+    if result.document_differences:
+        hunks = _group_into_hunks(result.document_differences, context_lines, reference_texts)
+        total_hunks += len(hunks)
+        parts.append(f'
'
+                     f'Document (text-only) — {len(hunks)} hunk(s), {len(result.document_differences)} difference(s)
')
+        for i, hunk in enumerate(hunks):
+            if total_hunks > MAX_HUNKS_BEFORE_COLLAPSE:
+                remaining = len(hunks) - i
+                parts.append(f'
... {remaining} more hunk(s) not shown
')
+                break
+            _render_hunk_to_parts(parts, hunk, i + 1, index_label="line")
+
+    # Word-level differences
+    # This section limits by the number of hunks it has rendered itself.
+    if hasattr(result, 'word_differences') and result.word_differences:
+        hunks = _group_into_hunks(result.word_differences, context_lines, reference_texts)
+        total_hunks += len(hunks)
+        parts.append(f'
'
+                     f'Document (word-level) — {len(hunks)} hunk(s), {len(result.word_differences)} difference(s)
')
+        rendered = 0
+        for i, hunk in enumerate(hunks):
+            if rendered >= MAX_HUNKS_BEFORE_COLLAPSE:
+                remaining = len(hunks) - rendered
+                parts.append(f'
... {remaining} more hunk(s) not shown
')
+                break
+            _render_hunk_to_parts(parts, hunk, i + 1, index_label="word")
+            rendered += 1
+
+    # Summary line
+    if result.summary:
+        parts.append('
')
+        for entry in result.summary:
+            parts.append(f'
{_escape(str(entry))}
')
+        parts.append('
')
+
+    parts.append('
')  # close content
+    parts.append('
')  # close outer container
+
+    # NOTE(review): ``summary`` is local and not returned, so this assignment
+    # has no visible effect for callers.
+    summary.hunk_count = total_hunks
+    return "\n".join(parts)
+
+
+def _render_hunk_to_parts(parts: List[str], hunk: dict, hunk_number: int, index_label: str = "line") -> None:
+    """Render a hunk into the HTML parts list.
+
+    Appends, in order: the hunk heading, the optional context before, one
+    rendered entry per difference, and the optional context after.
+
+    Args:
+        parts: Output list of HTML fragments; extended in place.
+        hunk: Dict with the keys ``start_index``, ``end_index``,
+            ``differences``, ``context_before`` and ``context_after``.
+        hunk_number: Number shown in the hunk heading.
+        index_label: Unit name used in the heading; an "s" is appended when
+            the hunk spans a range.
+    """
+    start = hunk["start_index"]
+    end = hunk["end_index"]
+    if start == end:
+        label = f"{index_label} {start}"
+    else:
+        label = f"{index_label}s {start}–{end}"
+
+    parts.append(f'')
+    parts.append(f'
Hunk {hunk_number} ({label})
')
+
+    # Context before: entries are cut to 40 characters and joined with " | "
+    if hunk["context_before"]:
+        ctx = " | ".join(_truncate(t, 40) for t in hunk["context_before"])
+        parts.append(f'
... {_escape(ctx)} ...
')
+
+    # Differences
+    for diff in hunk["differences"]:
+        parts.append(_render_diff_html(diff))
+
+    # Context after: same formatting as the context before
+    if hunk["context_after"]:
+        ctx = " | ".join(_truncate(t, 40) for t in hunk["context_after"])
+        parts.append(f'
... {_escape(ctx)} ...
')
+
+    parts.append('
')
+
+
+def build_structure_report_plain_text(
+    result: StructureComparisonResult,
+    *,
+    metadata: Optional[ReportMetadata] = None,
+    context_lines: int = DEFAULT_CONTEXT_LINES,
+    reference_texts: Optional[List[str]] = None,
+    candidate_texts: Optional[List[str]] = None,
+) -> str:
+    """Build the plain-text counterpart of the consolidated report.
+
+    The text contains a banner, the optional metadata header, a one-line
+    count summary, one section per page plus the document text-only and
+    word-level sections, and finally the result's summary entries as notes.
+    Each section shows at most ``MAX_HUNKS_BEFORE_COLLAPSE`` hunks.
+
+    Returns empty string if result.passed is True.
+    """
+    if result.passed:
+        return ""
+
+    summary = _compute_summary(result)
+    heavy_rule = "=" * 60
+    light_rule = "-" * 60
+    lines = [heavy_rule, "PDF Structure Comparison Report", heavy_rule]
+
+    def _emit_section(title, diffs, index_label):
+        # Heading first, then up to MAX_HUNKS_BEFORE_COLLAPSE rendered hunks.
+        hunks = _group_into_hunks(diffs, context_lines, reference_texts)
+        lines.append(f"\n{title} -- {len(hunks)} hunk(s), {len(diffs)} difference(s)")
+        for position, hunk in enumerate(hunks):
+            if position >= MAX_HUNKS_BEFORE_COLLAPSE:
+                lines.append(f" ... {len(hunks) - position} more hunk(s) not shown")
+                break
+            _render_hunk_plain(lines, hunk, position + 1, index_label=index_label)
+
+    if metadata:
+        lines.extend((
+            f"Reference: {metadata.reference_name}",
+            f"Candidate: {metadata.candidate_name}",
+            f"Mode: {metadata.comparison_mode}",
+        ))
+        if metadata.exclusions_applied:
+            lines.append(f"Exclusions: {', '.join(metadata.exclusions_applied)}")
+
+    lines.append(light_rule)
+    lines.append(
+        f"{summary.total_differences} difference(s): "
+        f"{summary.missing_count} missing, {summary.extra_count} extra, "
+        f"{summary.mismatch_count} mismatch, {summary.geometry_count} geometry, "
+        f"{summary.other_count} other"
+    )
+    lines.append(light_rule)
+
+    # Page-level sections, in ascending page order
+    if result.page_differences:
+        for page_num in sorted(result.page_differences):
+            _emit_section(f"Page {page_num}", result.page_differences[page_num], "line")
+
+    # Document-level section
+    if result.document_differences:
+        _emit_section("Document (text-only)", result.document_differences, "line")
+
+    # Word-level section
+    if hasattr(result, 'word_differences') and result.word_differences:
+        _emit_section("Document (word-level)", result.word_differences, "word")
+
+    if result.summary:
+        lines.append("")
+        lines.extend(f"Note: {entry}" for entry in result.summary)
+
+    lines.append(heavy_rule)
+    return "\n".join(lines)
+
+
+def _render_hunk_plain(lines: List[str], hunk: dict, hunk_number: int, index_label: str = "line") -> None:
+    """Append the plain-text rendering of *hunk* to *lines*.
+
+    Output order: the heading, the optional context before, one entry per
+    difference, then the optional context after.
+    """
+    first = hunk["start_index"]
+    last = hunk["end_index"]
+    label = f"{index_label} {first}" if first == last else f"{index_label}s {first}-{last}"
+    lines.append(f" Hunk {hunk_number} ({label})")
+
+    def _append_context(texts):
+        # Context entries are cut to 40 characters and joined with " | ".
+        joined = " | ".join(_truncate(entry, 40) for entry in texts)
+        lines.append(f" ... {joined} ...")
+
+    leading = hunk["context_before"]
+    if leading:
+        _append_context(leading)
+
+    lines.extend(_render_diff_plain(diff) for diff in hunk["differences"])
+
+    trailing = hunk["context_after"]
+    if trailing:
+        _append_context(trailing)
diff --git a/DocTest/TextNormalization.py b/DocTest/TextNormalization.py
index 0153841..f8c9edf 100644
--- a/DocTest/TextNormalization.py
+++ b/DocTest/TextNormalization.py
@@ -1,6 +1,6 @@
from __future__ import annotations
-from typing import Dict, Optional
+from typing import Dict, List, Optional, Set, Tuple
_LIGATURE_MAP: Dict[str, str] = {
@@ -22,6 +22,71 @@ def normalize_ligatures(text: str) -> str:
return "".join(_LIGATURE_MAP.get(char, char) for char in text)
+_WORD_BOUNDARY_CONNECTORS: Set[str] = frozenset("/\\-")
+
+
+def merge_split_words(
+    words: List[str],
+    tokens: "List[WordToken]",
+    connectors: Set[str] | None = None,
+) -> "Tuple[List[str], List[WordToken]]":
+    """Merge word tokens that were split across PDF line boundaries.
+
+    When text reflows across lines in a PDF, words containing connector
+    characters (like ``/``, ``-``, ``\\``) can be split into separate tokens.
+    For example, ``JS2_D48/F16/H8`` may become ``["JS2_D48/F16/", "H8"]``
+    when the line break falls after the ``/``.
+
+    This function detects such splits by looking for a token that starts on
+    a different line than the token directly before it, where the word built
+    so far ends with a connector character, and merges the two. A merged
+    word can keep absorbing fragments across further line breaks, but a
+    following token on the *same* line as the last absorbed fragment is
+    never merged.
+
+    Args:
+        words: Flat list of word strings.
+        tokens: Corresponding WordToken provenance objects.
+        connectors: Set of characters that indicate a word was split.
+            Defaults to ``_WORD_BOUNDARY_CONNECTORS`` (``/``, ``\\``, ``-``).
+
+    Returns:
+        Tuple of (merged_words, merged_tokens) with reduced length. A merged
+        token keeps the page, line and word index of its first fragment.
+    """
+    if len(words) <= 1:
+        return list(words), list(tokens)
+
+    if connectors is None:
+        connectors = _WORD_BOUNDARY_CONNECTORS
+
+    merged_words: List[str] = [words[0]]
+    merged_tokens: "List[WordToken]" = [tokens[0]]
+    # Line of the most recent source token folded into merged_words[-1].
+    # The merged token itself keeps the first fragment's line, so comparing
+    # against it would treat two tokens on the same physical line as a
+    # cross-line split once an earlier merge has happened.
+    last_fragment_line = tokens[0].source_line_index
+
+    for i in range(1, len(words)):
+        curr_token = tokens[i]
+        prev_word = merged_words[-1]
+
+        # Only merge if the fragments are on different lines AND the word so
+        # far ends with a connector. Skip standalone connectors (e.g. a bare
+        # "-" used as punctuation, not a split word).
+        if (last_fragment_line != curr_token.source_line_index
+                and len(prev_word) > 1
+                and prev_word[-1] in connectors):
+            prev_token = merged_tokens[-1]
+            merged_words[-1] = prev_word + words[i]
+            # Imported lazily, and only when a merge actually happens.
+            from DocTest.PdfStructureModels import WordToken
+            # Keep the first fragment's provenance, update only the text.
+            merged_tokens[-1] = WordToken(
+                text=merged_words[-1],
+                source_page=prev_token.source_page,
+                source_line_index=prev_token.source_line_index,
+                word_index=prev_token.word_index,
+            )
+        else:
+            merged_words.append(words[i])
+            merged_tokens.append(curr_token)
+
+        last_fragment_line = curr_token.source_line_index
+
+    return merged_words, merged_tokens
+
+
def apply_character_replacements(
text: str,
replacements: Optional[Dict[str, str]] = None,
diff --git a/utest/test_compare_document_words.py b/utest/test_compare_document_words.py
new file mode 100644
index 0000000..f579e1a
--- /dev/null
+++ b/utest/test_compare_document_words.py
@@ -0,0 +1,268 @@
+"""Unit tests for compare_document_words() -- ADR-001 Word-Level Token Comparison."""
+
+import pytest
+
+from DocTest.PdfStructureComparator import (
+ DocumentWordDifference,
+ StructureComparisonResult,
+ compare_document_words,
+)
+from DocTest.PdfStructureModels import (
+ DocumentStructure,
+ PageStructure,
+ StructureExtractionConfig,
+ TextBlock,
+ TextLine,
+)
+
+
+# ---------------------------------------------------------------------------
+# Helper
+# ---------------------------------------------------------------------------
+
+
+def _make_doc(*page_texts):
+ """Create a DocumentStructure from lists of line texts per page.
+
+ Usage: _make_doc(["line1", "line2"], ["line3"]) creates 2 pages.
+ Each positional argument is a list of line-text strings for one page.
+ All lines are placed in a single block per page.
+ """
+ config = StructureExtractionConfig()
+ pages = []
+ for page_num, lines in enumerate(page_texts):
+ text_lines = []
+ for i, text in enumerate(lines):
+ text_lines.append(
+ TextLine(index=i, text=text, bbox=(0.0, 0.0, 100.0, 10.0))
+ )
+ block = TextBlock(index=0, bbox=(0.0, 0.0, 100.0, 100.0), lines=text_lines)
+ page = PageStructure(
+ page_number=page_num, width=612.0, height=792.0, blocks=[block]
+ )
+ pages.append(page)
+ return DocumentStructure(pages=pages, config=config)
+
+
+# ---------------------------------------------------------------------------
+# Tests
+# ---------------------------------------------------------------------------
+
+
+def test_identical_content_same_lines():
+ """Same text, same lines -> passed=True, no word_differences."""
+ ref = _make_doc(["the quick brown fox"])
+ cand = _make_doc(["the quick brown fox"])
+ result = compare_document_words(ref, cand)
+ assert result.passed
+ assert result.word_differences == []
+
+
+def test_identical_content_different_lines():
+ """Same words split across different lines -> passed=True.
+
+ This is the KEY test for reflow tolerance: the words are identical,
+ only the line breaks differ.
+ """
+ ref = _make_doc(["the quick brown", "fox jumps"])
+ cand = _make_doc(["the quick", "brown fox jumps"])
+ result = compare_document_words(ref, cand)
+ assert result.passed
+ assert result.word_differences == []
+
+
+def test_identical_content_different_pages():
+ """Same words on different pages -> passed=True."""
+ ref = _make_doc(["hello world"], ["foo bar"])
+ cand = _make_doc(["hello world foo bar"])
+ result = compare_document_words(ref, cand)
+ assert result.passed
+ assert result.word_differences == []
+
+
+def test_single_word_replacement():
+ """'fox' vs 'cat' -> at least one word_mismatch difference."""
+ ref = _make_doc(["the quick fox"])
+ cand = _make_doc(["the quick cat"])
+ result = compare_document_words(ref, cand)
+ assert not result.passed
+ assert len(result.word_differences) >= 1
+ mismatch_diffs = [
+ d for d in result.word_differences if d.diff_type == "word_mismatch"
+ ]
+ assert len(mismatch_diffs) >= 1
+ diff = mismatch_diffs[0]
+ assert "fox" in diff.ref_words
+ assert "cat" in diff.cand_words
+
+
+def test_single_word_insertion():
+ """Candidate has extra word -> at least one extra_words difference."""
+ ref = _make_doc(["the fox"])
+ cand = _make_doc(["the quick fox"])
+ result = compare_document_words(ref, cand)
+ assert not result.passed
+ extra_diffs = [
+ d for d in result.word_differences if d.diff_type == "extra_words"
+ ]
+ assert len(extra_diffs) >= 1
+ diff = extra_diffs[0]
+ assert "quick" in diff.cand_words
+
+
+def test_single_word_deletion():
+ """Candidate missing a word -> at least one missing_words difference."""
+ ref = _make_doc(["the quick fox"])
+ cand = _make_doc(["the fox"])
+ result = compare_document_words(ref, cand)
+ assert not result.passed
+ missing_diffs = [
+ d for d in result.word_differences if d.diff_type == "missing_words"
+ ]
+ assert len(missing_diffs) >= 1
+ diff = missing_diffs[0]
+ assert "quick" in diff.ref_words
+
+
+def test_multi_word_replacement():
+ """Contiguous block of different words -> one grouped mismatch."""
+ ref = _make_doc(["the quick brown fox"])
+ cand = _make_doc(["the slow red fox"])
+ result = compare_document_words(ref, cand)
+ assert not result.passed
+ mismatch_diffs = [
+ d for d in result.word_differences if d.diff_type == "word_mismatch"
+ ]
+ assert len(mismatch_diffs) >= 1
+ # The replaced block should be grouped into a single diff
+ diff = mismatch_diffs[0]
+ assert diff.ref_words is not None
+ assert diff.cand_words is not None
+ assert "quick" in diff.ref_words
+ assert "brown" in diff.ref_words
+ assert "slow" in diff.cand_words
+ assert "red" in diff.cand_words
+
+
+def test_case_sensitive_default():
+ """'Hello' vs 'hello' -> mismatch when case_sensitive=True (default)."""
+ ref = _make_doc(["Hello World"])
+ cand = _make_doc(["hello World"])
+ result = compare_document_words(ref, cand)
+ assert not result.passed
+ assert len(result.word_differences) >= 1
+
+
+def test_case_insensitive():
+ """'Hello' vs 'hello' -> passed=True when case_sensitive=False."""
+ ref = _make_doc(["Hello WORLD"])
+ cand = _make_doc(["hello world"])
+ result = compare_document_words(ref, cand, case_sensitive=False)
+ assert result.passed
+ assert result.word_differences == []
+
+
+def test_both_empty_documents():
+ """Both empty -> passed=True."""
+ ref = _make_doc()
+ cand = _make_doc()
+ result = compare_document_words(ref, cand)
+ assert result.passed
+ assert result.word_differences == []
+
+
+def test_one_empty_one_not():
+ """One empty, one with text -> differences reported."""
+ ref = _make_doc(["hello world"])
+ cand = _make_doc()
+ result = compare_document_words(ref, cand)
+ assert not result.passed
+ assert len(result.word_differences) >= 1
+
+
+def test_difference_count_includes_word_diffs():
+ """result.difference_count() counts word_differences."""
+ ref = _make_doc(["the quick fox"])
+ cand = _make_doc(["the slow fox"])
+ result = compare_document_words(ref, cand)
+ assert result.difference_count() >= 1
+ assert result.difference_count() >= len(result.word_differences)
+
+
+def test_word_differences_have_correct_indices():
+ """Verify ref_start_index/ref_end_index/cand_start_index/cand_end_index."""
+ ref = _make_doc(["a b c d e"])
+ cand = _make_doc(["a b x d e"]) # 'c' replaced by 'x'
+ result = compare_document_words(ref, cand)
+ assert not result.passed
+ assert len(result.word_differences) >= 1
+
+ diff = result.word_differences[0]
+ # The replaced word 'c' is at index 2 in the reference
+ assert diff.ref_start_index is not None
+ assert diff.ref_end_index is not None
+ assert diff.cand_start_index is not None
+ assert diff.cand_end_index is not None
+ # 'c' is the 3rd word (index 2), so ref range should be [2, 3)
+ assert diff.ref_start_index == 2
+ assert diff.ref_end_index == 3
+ # 'x' is the 3rd word (index 2), so cand range should be [2, 3)
+ assert diff.cand_start_index == 2
+ assert diff.cand_end_index == 3
+
+
+def test_reflow_across_lines_and_pages():
+ """Complex reflow scenario: identical words, different line/page breaks.
+
+ Reference:
+ page 0: ["The quick brown fox", "jumps over the"]
+ page 1: ["lazy dog"]
+
+ Candidate:
+ page 0: ["The quick", "brown fox jumps"]
+ page 1: ["over the lazy dog"]
+
+ Should pass because the word sequence is identical.
+ """
+ ref = _make_doc(
+ ["The quick brown fox", "jumps over the"],
+ ["lazy dog"],
+ )
+ cand = _make_doc(
+ ["The quick", "brown fox jumps"],
+ ["over the lazy dog"],
+ )
+ result = compare_document_words(ref, cand)
+ assert result.passed
+ assert result.word_differences == []
+
+
+def test_result_is_structure_comparison_result():
+ """compare_document_words returns a StructureComparisonResult."""
+ ref = _make_doc(["hello"])
+ cand = _make_doc(["hello"])
+ result = compare_document_words(ref, cand)
+ assert isinstance(result, StructureComparisonResult)
+
+
+def test_word_difference_has_message():
+ """Each DocumentWordDifference has a non-empty message."""
+ ref = _make_doc(["hello world"])
+ cand = _make_doc(["hello earth"])
+ result = compare_document_words(ref, cand)
+ assert not result.passed
+ for diff in result.word_differences:
+ assert isinstance(diff.message, str)
+ assert len(diff.message) > 0
+
+
+def test_empty_ref_nonempty_cand():
+ """Empty reference, non-empty candidate -> extra words reported."""
+ ref = _make_doc()
+ cand = _make_doc(["hello world"])
+ result = compare_document_words(ref, cand)
+ assert not result.passed
+ extra_diffs = [
+ d for d in result.word_differences if d.diff_type == "extra_words"
+ ]
+ assert len(extra_diffs) >= 1
diff --git a/utest/test_flatten_document_words.py b/utest/test_flatten_document_words.py
new file mode 100644
index 0000000..f977d84
--- /dev/null
+++ b/utest/test_flatten_document_words.py
@@ -0,0 +1,164 @@
+"""Unit tests for flatten_document_words() -- ADR-001 Word-Level Token Comparison."""
+
+import pytest
+
+from DocTest.PdfStructureModels import (
+ DocumentStructure,
+ PageStructure,
+ StructureExtractionConfig,
+ TextBlock,
+ TextLine,
+ WordToken,
+ flatten_document_words,
+)
+
+
+# ---------------------------------------------------------------------------
+# Helper
+# ---------------------------------------------------------------------------
+
+
+def _make_doc(*page_texts):
+ """Create a DocumentStructure from lists of line texts per page.
+
+ Usage: _make_doc(["line1", "line2"], ["line3"]) creates 2 pages.
+ Each positional argument is a list of line-text strings for one page.
+ All lines are placed in a single block per page.
+ """
+ config = StructureExtractionConfig()
+ pages = []
+ for page_num, lines in enumerate(page_texts):
+ text_lines = []
+ for i, text in enumerate(lines):
+ text_lines.append(
+ TextLine(index=i, text=text, bbox=(0.0, 0.0, 100.0, 10.0))
+ )
+ block = TextBlock(index=0, bbox=(0.0, 0.0, 100.0, 100.0), lines=text_lines)
+ page = PageStructure(
+ page_number=page_num, width=612.0, height=792.0, blocks=[block]
+ )
+ pages.append(page)
+ return DocumentStructure(pages=pages, config=config)
+
+
+# ---------------------------------------------------------------------------
+# Tests
+# ---------------------------------------------------------------------------
+
+
+def test_empty_document():
+ """Empty DocumentStructure returns ([], [])."""
+ doc = _make_doc()
+ words, tokens = flatten_document_words(doc)
+ assert words == []
+ assert tokens == []
+
+
+def test_single_line_single_word():
+ """One line 'hello' produces ['hello'] and one WordToken."""
+ doc = _make_doc(["hello"])
+ words, tokens = flatten_document_words(doc)
+ assert words == ["hello"]
+ assert len(tokens) == 1
+ assert tokens[0].text == "hello"
+
+
+def test_single_line_multiple_words():
+ """'hello world' produces ['hello', 'world'] and two WordTokens."""
+ doc = _make_doc(["hello world"])
+ words, tokens = flatten_document_words(doc)
+ assert words == ["hello", "world"]
+ assert len(tokens) == 2
+ assert tokens[0].text == "hello"
+ assert tokens[1].text == "world"
+
+
+def test_multiple_lines():
+ """Two lines 'foo bar' and 'baz' produce ['foo', 'bar', 'baz'] in order."""
+ doc = _make_doc(["foo bar", "baz"])
+ words, tokens = flatten_document_words(doc)
+ assert words == ["foo", "bar", "baz"]
+ assert len(tokens) == 3
+
+
+def test_multiple_pages():
+ """Words from page 0 and page 1 are concatenated in order."""
+ doc = _make_doc(["alpha beta"], ["gamma"])
+ words, tokens = flatten_document_words(doc)
+ assert words == ["alpha", "beta", "gamma"]
+ assert len(tokens) == 3
+
+
+def test_empty_lines_skipped():
+ """Lines with empty text produce no tokens."""
+ doc = _make_doc(["hello", "", "world"])
+ words, tokens = flatten_document_words(doc)
+ assert words == ["hello", "world"]
+ assert len(tokens) == 2
+
+
+def test_whitespace_only_lines_skipped():
+ """Lines with only whitespace produce no tokens (split yields [])."""
+ doc = _make_doc(["hello", " ", "world"])
+ words, tokens = flatten_document_words(doc)
+ assert words == ["hello", "world"]
+ assert len(tokens) == 2
+
+
+def test_provenance_metadata_correct():
+ """source_page, source_line_index, and word_index are correct across pages."""
+ doc = _make_doc(["a b"], ["c"])
+ words, tokens = flatten_document_words(doc)
+
+ # First page, first line, word 0
+ assert tokens[0].text == "a"
+ assert tokens[0].source_page == 0
+ assert tokens[0].source_line_index == 0
+ assert tokens[0].word_index == 0
+
+ # First page, first line, word 1
+ assert tokens[1].text == "b"
+ assert tokens[1].source_page == 0
+ assert tokens[1].source_line_index == 0
+ assert tokens[1].word_index == 1
+
+ # Second page, its first line (source_line_index keeps counting across pages), word 2
+ assert tokens[2].text == "c"
+ assert tokens[2].source_page == 1
+ assert tokens[2].source_line_index == 1
+ assert tokens[2].word_index == 2
+
+
+def test_multiple_spaces_normalized():
+    """A run of spaces ('hello   world') still splits to ['hello', 'world'] (str.split normalizes)."""
+    doc = _make_doc(["hello   world"])
+    words, tokens = flatten_document_words(doc)
+    assert words == ["hello", "world"]
+    assert len(tokens) == 2
+
+
+def test_word_index_is_global():
+ """word_index is sequential across all pages, blocks, and lines."""
+ doc = _make_doc(["a b", "c"], ["d e f"])
+ words, tokens = flatten_document_words(doc)
+ assert words == ["a", "b", "c", "d", "e", "f"]
+
+ expected_indices = list(range(6))
+ actual_indices = [t.word_index for t in tokens]
+ assert actual_indices == expected_indices
+
+
+def test_word_token_is_frozen():
+ """WordToken instances are immutable (frozen dataclass)."""
+ token = WordToken(text="hello", source_page=0, source_line_index=0, word_index=0)
+ with pytest.raises(AttributeError):
+ token.text = "changed"
+
+
+def test_words_and_tokens_have_same_length():
+ """The word strings list and tokens list always have the same length."""
+ doc = _make_doc(["the quick brown fox", "jumps over"], ["the lazy dog"])
+ words, tokens = flatten_document_words(doc)
+ assert len(words) == len(tokens)
+ for word, token in zip(words, tokens):
+ assert word == token.text
diff --git a/utest/test_header_footer_detection.py b/utest/test_header_footer_detection.py
new file mode 100644
index 0000000..ec210b3
--- /dev/null
+++ b/utest/test_header_footer_detection.py
@@ -0,0 +1,539 @@
+"""Unit tests for HeaderFooterDetector module (ADR-002).
+
+Tests cover repetition-based detection of headers/footers, stripping of
+detected lines, digit normalization for page numbers, and the convenience
+filter_headers_footers function.
+"""
+
+import pytest
+
+from DocTest.HeaderFooterDetector import (
+ DetectionResult,
+ HeaderFooterConfig,
+ _normalize_for_grouping,
+ detect_repeating_headers_footers,
+ filter_headers_footers,
+ strip_detected_headers_footers,
+)
+from DocTest.PdfStructureModels import (
+ DocumentStructure,
+ PageStructure,
+ StructureExtractionConfig,
+ TextBlock,
+ TextLine,
+ flatten_document_text,
+)
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+
+def _make_page(page_number, lines_data, width=612, height=792):
+ """Create a PageStructure from line data.
+
+ Args:
+ page_number: The 1-based page number.
+ lines_data: list of (text, y_top, y_bottom) tuples.
+ Each line gets bbox = (0, y_top, width, y_bottom).
+ width: Page width in PDF points.
+ height: Page height in PDF points.
+
+ Returns:
+ A PageStructure suitable for testing.
+ """
+ text_lines = []
+ for i, (text, y_top, y_bottom) in enumerate(lines_data):
+ text_lines.append(
+ TextLine(
+ index=i,
+ text=text,
+ bbox=(0.0, float(y_top), float(width), float(y_bottom)),
+ )
+ )
+ block = TextBlock(index=0, bbox=(0, 0, width, height), lines=text_lines)
+ return PageStructure(
+ page_number=page_number, width=width, height=height, blocks=[block]
+ )
+
+
+def _make_doc(*pages):
+ """Create a DocumentStructure from PageStructure objects."""
+ config = StructureExtractionConfig()
+ return DocumentStructure(pages=list(pages), config=config)
+
+
+# ---------------------------------------------------------------------------
+# Normalization helper tests
+# ---------------------------------------------------------------------------
+
+
+class TestNormalizeForGrouping:
+ """Tests for the _normalize_for_grouping helper."""
+
+ def test_replaces_single_digit(self):
+ assert _normalize_for_grouping("Page 1") == "Page #"
+
+ def test_replaces_multiple_digit_runs(self):
+ assert _normalize_for_grouping("Page 1 of 5") == "Page # of #"
+
+ def test_no_digits_unchanged(self):
+ assert _normalize_for_grouping("ACME Corp") == "ACME Corp"
+
+ def test_multi_digit_run(self):
+ assert _normalize_for_grouping("2024-01-15") == "#-#-#"
+
+ def test_empty_string(self):
+ assert _normalize_for_grouping("") == ""
+
+ def test_standalone_page_number(self):
+ assert _normalize_for_grouping("42") == "#"
+
+
+# ---------------------------------------------------------------------------
+# Config tests
+# ---------------------------------------------------------------------------
+
+
+class TestHeaderFooterConfig:
+ """Tests for HeaderFooterConfig properties."""
+
+ def test_detection_disabled_when_scan_height_zero(self):
+ """Both scan heights 0 means detection is disabled."""
+ config = HeaderFooterConfig(header_scan_height=0, footer_scan_height=0)
+ assert config.enabled is False
+
+ def test_config_enabled_with_header_only(self):
+ """Detection is enabled when only header_scan_height > 0."""
+ config = HeaderFooterConfig(header_scan_height=50, footer_scan_height=0)
+ assert config.enabled is True
+
+ def test_config_enabled_with_footer_only(self):
+ """Detection is enabled when only footer_scan_height > 0."""
+ config = HeaderFooterConfig(header_scan_height=0, footer_scan_height=50)
+ assert config.enabled is True
+
+ def test_config_enabled_with_both(self):
+ """Detection is enabled when both scan heights > 0."""
+ config = HeaderFooterConfig(header_scan_height=50, footer_scan_height=50)
+ assert config.enabled is True
+
+
+# ---------------------------------------------------------------------------
+# Detection tests
+# ---------------------------------------------------------------------------
+
+
+class TestDetectRepeatingHeadersFooters:
+ """Tests for detect_repeating_headers_footers."""
+
+ def test_disabled_config_returns_empty_result(self):
+ """When config.enabled is False, detection returns empty result."""
+ doc = _make_doc(
+ _make_page(1, [("ACME Corp", 10, 25), ("Body text", 100, 115)]),
+ _make_page(2, [("ACME Corp", 10, 25), ("More text", 100, 115)]),
+ )
+ config = HeaderFooterConfig(header_scan_height=0, footer_scan_height=0)
+ result = detect_repeating_headers_footers(doc, config)
+ assert result.header_keys == frozenset()
+ assert result.footer_keys == frozenset()
+ assert result.has_detections is False
+
+ def test_detects_identical_header_on_all_pages(self):
+ """Identical text in header region on all pages is detected."""
+ doc = _make_doc(
+ _make_page(1, [("ACME Corp", 10, 25), ("Body page 1", 100, 115)]),
+ _make_page(2, [("ACME Corp", 10, 25), ("Body page 2", 100, 115)]),
+ _make_page(3, [("ACME Corp", 10, 25), ("Body page 3", 100, 115)]),
+ )
+ config = HeaderFooterConfig(header_scan_height=50, repeat_threshold=2)
+ result = detect_repeating_headers_footers(doc, config)
+ assert "ACME Corp" in result.header_keys
+ assert result.has_detections is True
+
+ def test_does_not_detect_non_repeating_text(self):
+ """Unique text in header region across pages is not detected."""
+ doc = _make_doc(
+ _make_page(1, [("Title A", 10, 25), ("Body 1", 100, 115)]),
+ _make_page(2, [("Title B", 10, 25), ("Body 2", 100, 115)]),
+ _make_page(3, [("Title C", 10, 25), ("Body 3", 100, 115)]),
+ )
+ config = HeaderFooterConfig(header_scan_height=50, repeat_threshold=2)
+ result = detect_repeating_headers_footers(doc, config)
+ assert result.header_keys == frozenset()
+ assert result.has_detections is False
+
+ def test_detects_header_with_page_numbers(self):
+ """Page-number variants normalize to the same key and are detected."""
+ doc = _make_doc(
+ _make_page(1, [("Page 1 of 5", 10, 25), ("Body 1", 100, 115)]),
+ _make_page(2, [("Page 2 of 5", 10, 25), ("Body 2", 100, 115)]),
+ _make_page(3, [("Page 3 of 5", 10, 25), ("Body 3", 100, 115)]),
+ )
+ config = HeaderFooterConfig(header_scan_height=50, repeat_threshold=2)
+ result = detect_repeating_headers_footers(doc, config)
+ assert "Page # of #" in result.header_keys
+
+ def test_respects_repeat_threshold_below(self):
+ """Text repeating on fewer pages than threshold is not detected."""
+ doc = _make_doc(
+ _make_page(1, [("Header", 10, 25), ("Body 1", 100, 115)]),
+ _make_page(2, [("Header", 10, 25), ("Body 2", 100, 115)]),
+ _make_page(3, [("Header", 10, 25), ("Body 3", 100, 115)]),
+ _make_page(4, [("Unique", 10, 25), ("Body 4", 100, 115)]),
+ _make_page(5, [("Unique2", 10, 25), ("Body 5", 100, 115)]),
+ )
+ config = HeaderFooterConfig(header_scan_height=50, repeat_threshold=4)
+ result = detect_repeating_headers_footers(doc, config)
+ # "Header" only on 3 pages, threshold is 4
+ assert "Header" not in result.header_keys
+
+ def test_respects_repeat_threshold_at_boundary(self):
+ """Text repeating on exactly threshold pages is detected."""
+ doc = _make_doc(
+ _make_page(1, [("Header", 10, 25), ("Body 1", 100, 115)]),
+ _make_page(2, [("Header", 10, 25), ("Body 2", 100, 115)]),
+ _make_page(3, [("Header", 10, 25), ("Body 3", 100, 115)]),
+ _make_page(4, [("Unique", 10, 25), ("Body 4", 100, 115)]),
+ _make_page(5, [("Unique2", 10, 25), ("Body 5", 100, 115)]),
+ )
+ config = HeaderFooterConfig(header_scan_height=50, repeat_threshold=3)
+ result = detect_repeating_headers_footers(doc, config)
+ assert "Header" in result.header_keys
+
+ def test_single_page_no_detection(self):
+ """Single page document never reaches threshold=2."""
+ doc = _make_doc(
+ _make_page(1, [("Header", 10, 25), ("Body", 100, 115)]),
+ )
+ config = HeaderFooterConfig(header_scan_height=50, repeat_threshold=2)
+ result = detect_repeating_headers_footers(doc, config)
+ assert result.header_keys == frozenset()
+ assert result.has_detections is False
+
+ def test_footer_detection(self):
+ """Text in footer region repeating across pages is detected."""
+ # Page height = 792, footer_scan_height = 50 -> boundary at 742
+ # Lines at y_bottom=770 are past 742 -> in footer region
+ doc = _make_doc(
+ _make_page(1, [("Body 1", 100, 115), ("Copyright 2024", 755, 770)]),
+ _make_page(2, [("Body 2", 100, 115), ("Copyright 2024", 755, 770)]),
+ _make_page(3, [("Body 3", 100, 115), ("Copyright 2024", 755, 770)]),
+ )
+ config = HeaderFooterConfig(footer_scan_height=50, repeat_threshold=2)
+ result = detect_repeating_headers_footers(doc, config)
+ assert "Copyright #" in result.footer_keys
+ assert result.has_detections is True
+
+ def test_header_and_footer_simultaneously(self):
+ """Both header and footer can be detected independently."""
+ doc = _make_doc(
+ _make_page(1, [("ACME Corp", 10, 25), ("Body 1", 400, 415), ("Page 1", 760, 775)]),
+ _make_page(2, [("ACME Corp", 10, 25), ("Body 2", 400, 415), ("Page 2", 760, 775)]),
+ _make_page(3, [("ACME Corp", 10, 25), ("Body 3", 400, 415), ("Page 3", 760, 775)]),
+ )
+ config = HeaderFooterConfig(
+ header_scan_height=50, footer_scan_height=50, repeat_threshold=2
+ )
+ result = detect_repeating_headers_footers(doc, config)
+ assert "ACME Corp" in result.header_keys
+ assert "Page #" in result.footer_keys
+
+ def test_standalone_page_number_detection(self):
+ """Standalone page numbers like '1', '2', '3' normalize to '#'."""
+ doc = _make_doc(
+ _make_page(1, [("1", 10, 25), ("Body 1", 100, 115)]),
+ _make_page(2, [("2", 10, 25), ("Body 2", 100, 115)]),
+ _make_page(3, [("3", 10, 25), ("Body 3", 100, 115)]),
+ )
+ config = HeaderFooterConfig(header_scan_height=50, repeat_threshold=2)
+ result = detect_repeating_headers_footers(doc, config)
+ assert "#" in result.header_keys
+
+ def test_threshold_greater_than_page_count(self):
+ """When threshold exceeds page count, nothing can be detected."""
+ doc = _make_doc(
+ _make_page(1, [("Header", 10, 25), ("Body 1", 100, 115)]),
+ _make_page(2, [("Header", 10, 25), ("Body 2", 100, 115)]),
+ _make_page(3, [("Header", 10, 25), ("Body 3", 100, 115)]),
+ )
+ config = HeaderFooterConfig(header_scan_height=50, repeat_threshold=5)
+ result = detect_repeating_headers_footers(doc, config)
+ assert result.header_keys == frozenset()
+ assert result.has_detections is False
+
+ def test_line_outside_scan_region_not_counted(self):
+ """Text starting below the header scan region (y_top=60, scan height 50) is not a header candidate."""
+ doc = _make_doc(
+ _make_page(1, [("ACME Corp", 60, 75), ("Body 1", 100, 115)]),
+ _make_page(2, [("ACME Corp", 60, 75), ("Body 2", 100, 115)]),
+ _make_page(3, [("ACME Corp", 60, 75), ("Body 3", 100, 115)]),
+ )
+ config = HeaderFooterConfig(header_scan_height=50, repeat_threshold=2)
+ result = detect_repeating_headers_footers(doc, config)
+ # y_top=60 >= header_scan_height=50, so not in header region
+ assert "ACME Corp" not in result.header_keys
+
+
+# ---------------------------------------------------------------------------
+# Stripping tests
+# ---------------------------------------------------------------------------
+
+
+class TestStripDetectedHeadersFooters:
+ """Tests for strip_detected_headers_footers."""
+
+ def test_strips_detected_headers_preserves_body(self):
+ """Detected header lines are removed; body lines remain."""
+ doc = _make_doc(
+ _make_page(1, [("ACME Corp", 10, 25), ("Body line 1", 100, 115), ("Body line 2", 200, 215)]),
+ _make_page(2, [("ACME Corp", 10, 25), ("Body line 3", 100, 115)]),
+ )
+ config = HeaderFooterConfig(header_scan_height=50, repeat_threshold=2)
+ detection = detect_repeating_headers_footers(doc, config)
+ result = strip_detected_headers_footers(doc, detection, config)
+
+ # All body lines preserved
+ all_texts = flatten_document_text(result)
+ assert "Body line 1" in all_texts
+ assert "Body line 2" in all_texts
+ assert "Body line 3" in all_texts
+ # Header removed
+ assert "ACME Corp" not in all_texts
+
+ def test_body_text_matching_header_not_stripped(self):
+ """Same text in body region is preserved even if it matches a header key."""
+ doc = _make_doc(
+ _make_page(1, [("ACME Corp", 10, 25), ("ACME Corp", 400, 415)]),
+ _make_page(2, [("ACME Corp", 10, 25), ("Other body", 400, 415)]),
+ _make_page(3, [("ACME Corp", 10, 25), ("More body", 400, 415)]),
+ )
+ config = HeaderFooterConfig(header_scan_height=50, repeat_threshold=2)
+ detection = detect_repeating_headers_footers(doc, config)
+ result = strip_detected_headers_footers(doc, detection, config)
+
+ # Page 1: header "ACME Corp" at y=10 removed, body "ACME Corp" at y=400 preserved
+ page1_texts = []
+ for block in result.pages[0].blocks:
+ for line in block.lines:
+ page1_texts.append(line.text)
+ assert "ACME Corp" in page1_texts # The body-region instance survives
+
+ def test_strips_page_number_variants(self):
+ """Different page-number variants sharing the same key are all stripped."""
+ doc = _make_doc(
+ _make_page(1, [("Page 1 of 5", 10, 25), ("Body A", 100, 115)]),
+ _make_page(2, [("Page 2 of 5", 10, 25), ("Body B", 100, 115)]),
+ )
+ config = HeaderFooterConfig(header_scan_height=50, repeat_threshold=2)
+ detection = detect_repeating_headers_footers(doc, config)
+ result = strip_detected_headers_footers(doc, detection, config)
+
+ all_texts = flatten_document_text(result)
+ assert "Page 1 of 5" not in all_texts
+ assert "Page 2 of 5" not in all_texts
+ assert "Body A" in all_texts
+ assert "Body B" in all_texts
+
+ def test_re_indexing_after_strip(self):
+ """After stripping, remaining lines have contiguous indices starting at 0."""
+ doc = _make_doc(
+ _make_page(1, [
+ ("Header", 10, 25),
+ ("Line A", 100, 115),
+ ("Line B", 200, 215),
+ ("Line C", 300, 315),
+ ]),
+ _make_page(2, [
+ ("Header", 10, 25),
+ ("Line D", 100, 115),
+ ]),
+ )
+ config = HeaderFooterConfig(header_scan_height=50, repeat_threshold=2)
+ detection = detect_repeating_headers_footers(doc, config)
+ result = strip_detected_headers_footers(doc, detection, config)
+
+ # Page 1 should have lines indexed 0, 1, 2
+ page1_indices = [
+ line.index for block in result.pages[0].blocks for line in block.lines
+ ]
+ assert page1_indices == [0, 1, 2]
+
+ # Page 2 should have line indexed 0
+ page2_indices = [
+ line.index for block in result.pages[1].blocks for line in block.lines
+ ]
+ assert page2_indices == [0]
+
+ def test_empty_blocks_removed_after_strip(self):
+ """A block whose only line is a header gets removed entirely."""
+ # Create a page with two blocks: one with only a header, one with body
+ header_line = TextLine(index=0, text="Header", bbox=(0.0, 10.0, 612.0, 25.0))
+ body_line = TextLine(index=1, text="Body text", bbox=(0.0, 100.0, 612.0, 115.0))
+ header_block = TextBlock(index=0, bbox=(0, 0, 612, 30), lines=[header_line])
+ body_block = TextBlock(index=1, bbox=(0, 90, 612, 120), lines=[body_line])
+ page1 = PageStructure(page_number=1, width=612, height=792, blocks=[header_block, body_block])
+
+ header_line2 = TextLine(index=0, text="Header", bbox=(0.0, 10.0, 612.0, 25.0))
+ body_line2 = TextLine(index=1, text="More text", bbox=(0.0, 100.0, 612.0, 115.0))
+ header_block2 = TextBlock(index=0, bbox=(0, 0, 612, 30), lines=[header_line2])
+ body_block2 = TextBlock(index=1, bbox=(0, 90, 612, 120), lines=[body_line2])
+ page2 = PageStructure(page_number=2, width=612, height=792, blocks=[header_block2, body_block2])
+
+ doc = _make_doc(page1, page2)
+ config = HeaderFooterConfig(header_scan_height=50, repeat_threshold=2)
+ detection = detect_repeating_headers_footers(doc, config)
+ result = strip_detected_headers_footers(doc, detection, config)
+
+ # Each page should have only 1 block (body_block), header_block removed
+ for page in result.pages:
+ assert len(page.blocks) == 1
+ assert page.blocks[0].lines[0].text != "Header"
+
+ def test_strips_footer_preserves_header_region(self):
+ """Footer stripping does not affect header-region text."""
+ doc = _make_doc(
+ _make_page(1, [("Title", 10, 25), ("Body", 400, 415), ("Footer", 760, 775)]),
+ _make_page(2, [("Title", 10, 25), ("Body 2", 400, 415), ("Footer", 760, 775)]),
+ _make_page(3, [("Title", 10, 25), ("Body 3", 400, 415), ("Footer", 760, 775)]),
+ )
+ # Only detect footer, not header
+ config = HeaderFooterConfig(header_scan_height=0, footer_scan_height=50, repeat_threshold=2)
+ detection = detect_repeating_headers_footers(doc, config)
+ result = strip_detected_headers_footers(doc, detection, config)
+
+ all_texts = flatten_document_text(result)
+ # Footer should be removed
+ assert "Footer" not in all_texts
+ # Header-region text preserved (not scanned as header since header_scan_height=0)
+ assert all_texts.count("Title") == 3
+
+ def test_no_detections_returns_original_structure(self):
+ """When detection has no results, strip returns the original structure."""
+ doc = _make_doc(
+ _make_page(1, [("Unique A", 10, 25), ("Body", 100, 115)]),
+ _make_page(2, [("Unique B", 10, 25), ("Body 2", 100, 115)]),
+ )
+ detection = DetectionResult(header_keys=frozenset(), footer_keys=frozenset())
+ config = HeaderFooterConfig(header_scan_height=50, repeat_threshold=2)
+ result = strip_detected_headers_footers(doc, detection, config)
+ assert result is doc # Identity check: same object returned
+
+
+# ---------------------------------------------------------------------------
+# Convenience function tests
+# ---------------------------------------------------------------------------
+
+
+class TestFilterHeadersFooters:
+ """Tests for the filter_headers_footers convenience function."""
+
+ def test_filter_headers_footers_end_to_end(self):
+ """filter_headers_footers produces same result as detect + strip."""
+ doc = _make_doc(
+ _make_page(1, [("ACME Corp", 10, 25), ("Body 1", 100, 115), ("Page 1", 760, 775)]),
+ _make_page(2, [("ACME Corp", 10, 25), ("Body 2", 100, 115), ("Page 2", 760, 775)]),
+ _make_page(3, [("ACME Corp", 10, 25), ("Body 3", 100, 115), ("Page 3", 760, 775)]),
+ )
+ config = HeaderFooterConfig(
+ header_scan_height=50, footer_scan_height=50, repeat_threshold=2
+ )
+
+ # Manual two-step
+ detection = detect_repeating_headers_footers(doc, config)
+ expected = strip_detected_headers_footers(doc, detection, config)
+
+ # Convenience one-step
+ actual = filter_headers_footers(doc, config)
+
+ # Compare text content
+ assert flatten_document_text(actual) == flatten_document_text(expected)
+
+ def test_filter_disabled_returns_same_object(self):
+ """When config.enabled is False, the exact same object is returned."""
+ doc = _make_doc(
+ _make_page(1, [("Header", 10, 25), ("Body", 100, 115)]),
+ _make_page(2, [("Header", 10, 25), ("Body 2", 100, 115)]),
+ )
+ config = HeaderFooterConfig(header_scan_height=0, footer_scan_height=0)
+ result = filter_headers_footers(doc, config)
+ assert result is doc
+
+
+# ---------------------------------------------------------------------------
+# Key scenario: page without header content preserved
+# ---------------------------------------------------------------------------
+
+
+class TestPageWithoutHeaderContentPreserved:
+ """The key scenario: a page that lacks the repeating header must not
+ have its body text incorrectly removed."""
+
+ def test_page_without_header_content_preserved(self):
+ """Pages 1 and 2 have 'HEADER' but page 3 starts with different body text at
+ the same y-position. That body text must NOT be removed."""
+ doc = _make_doc(
+ _make_page(1, [
+ ("HEADER", 10, 25),
+ ("Body page 1", 100, 115),
+ ]),
+ _make_page(2, [
+ ("HEADER", 10, 25),
+ ("Body page 2", 100, 115),
+ ]),
+ _make_page(3, [
+ # No header line -- body text starts at y=10, same as header
+ ("Important content", 10, 25),
+ ("Body page 3", 100, 115),
+ ]),
+ )
+ config = HeaderFooterConfig(header_scan_height=50, repeat_threshold=2)
+ detection = detect_repeating_headers_footers(doc, config)
+
+ # "HEADER" detected as header key
+ assert "HEADER" in detection.header_keys
+
+ result = strip_detected_headers_footers(doc, detection, config)
+ all_texts = flatten_document_text(result)
+
+ # "HEADER" removed from pages 1 and 2
+ assert "HEADER" not in all_texts
+ # "Important content" on page 3 preserved (different key)
+ assert "Important content" in all_texts
+ # All body text preserved
+ assert "Body page 1" in all_texts
+ assert "Body page 2" in all_texts
+ assert "Body page 3" in all_texts
+
+ def test_page_without_footer_content_preserved(self):
+ """Symmetric case: a page missing the footer has its body text at
+ the bottom preserved."""
+ doc = _make_doc(
+ _make_page(1, [
+ ("Body 1", 100, 115),
+ ("FOOTER", 760, 775),
+ ]),
+ _make_page(2, [
+ ("Body 2", 100, 115),
+ ("FOOTER", 760, 775),
+ ]),
+ _make_page(3, [
+ ("Body 3", 100, 115),
+ # Different text in footer region
+ ("Final remarks", 760, 775),
+ ]),
+ )
+ config = HeaderFooterConfig(footer_scan_height=50, repeat_threshold=2)
+ detection = detect_repeating_headers_footers(doc, config)
+
+ assert "FOOTER" in detection.footer_keys
+
+ result = strip_detected_headers_footers(doc, detection, config)
+ all_texts = flatten_document_text(result)
+
+ assert "FOOTER" not in all_texts
+ assert "Final remarks" in all_texts
+ assert "Body 1" in all_texts
+ assert "Body 2" in all_texts
+ assert "Body 3" in all_texts
diff --git a/utest/test_spatial_word_sorting.py b/utest/test_spatial_word_sorting.py
new file mode 100644
index 0000000..f716a23
--- /dev/null
+++ b/utest/test_spatial_word_sorting.py
@@ -0,0 +1,611 @@
+"""Unit tests for build_page_structure_from_words() and the spatial_word_sorting config flag."""
+
+import pytest
+
+from DocTest.PdfStructureModels import (
+ DocumentStructure,
+ PageStructure,
+ StructureExtractionConfig,
+ TextBlock,
+ TextLine,
+ build_page_structure,
+ build_page_structure_from_words,
+ flatten_document_words,
+)
+
+
+# ---------------------------------------------------------------------------
+# Helper
+# ---------------------------------------------------------------------------
+
+
+def _make_word_tuple(text, x0, y0, x1, y1, block_no=0, line_no=0, word_no=0):
+ """Return a tuple in PyMuPDF ``get_text('words')`` format.
+
+ Format: (x0, y0, x1, y1, "word", block_no, line_no, word_no)
+ """
+ return (x0, y0, x1, y1, text, block_no, line_no, word_no)
+
+
+# ---------------------------------------------------------------------------
+# 1. Empty / None inputs
+# ---------------------------------------------------------------------------
+
+
+def test_empty_words_list():
+ """Empty input returns PageStructure with no blocks."""
+ page = build_page_structure_from_words(0, [], page_width=612.0, page_height=792.0)
+ assert isinstance(page, PageStructure)
+ assert page.page_number == 0
+ assert page.blocks == []
+ assert page.width == 612.0
+ assert page.height == 792.0
+
+
+def test_none_words_list():
+ """None input returns PageStructure with no blocks."""
+ page = build_page_structure_from_words(0, None, page_width=612.0, page_height=792.0)
+ assert isinstance(page, PageStructure)
+ assert page.blocks == []
+
+
+# ---------------------------------------------------------------------------
+# 2. Single word
+# ---------------------------------------------------------------------------
+
+
+def test_single_word():
+ """One word produces one block with one line."""
+ words = [_make_word_tuple("hello", 10.0, 100.0, 50.0, 112.0)]
+ page = build_page_structure_from_words(0, words, page_width=612.0, page_height=792.0)
+
+ assert len(page.blocks) == 1
+ block = page.blocks[0]
+ assert block.line_count == 1
+ assert block.lines[0].text == "hello"
+ assert len(block.lines[0].spans) == 1
+ assert block.lines[0].spans[0].text == "hello"
+
+
+# ---------------------------------------------------------------------------
+# 3. Single line, multiple words
+# ---------------------------------------------------------------------------
+
+
+def test_single_line_multiple_words():
+ """Multiple words at the same Y position produce one line, sorted by x0."""
+ words = [
+ _make_word_tuple("world", 60.0, 100.0, 110.0, 112.0, word_no=1),
+ _make_word_tuple("hello", 10.0, 100.0, 50.0, 112.0, word_no=0),
+ _make_word_tuple("!", 120.0, 100.0, 130.0, 112.0, word_no=2),
+ ]
+ page = build_page_structure_from_words(0, words, page_width=612.0, page_height=792.0)
+
+ assert len(page.blocks) == 1
+ assert page.blocks[0].lines[0].text == "hello world !"
+
+
+# ---------------------------------------------------------------------------
+# 4. Multiple lines
+# ---------------------------------------------------------------------------
+
+
+def test_multiple_lines():
+ """Words at different Y positions produce separate lines sorted top-to-bottom."""
+ words = [
+ # Second line (y ~ 200)
+ _make_word_tuple("second", 10.0, 200.0, 80.0, 212.0),
+ # First line (y ~ 100)
+ _make_word_tuple("first", 10.0, 100.0, 60.0, 112.0),
+ # Third line (y ~ 300)
+ _make_word_tuple("third", 10.0, 300.0, 70.0, 312.0),
+ ]
+ page = build_page_structure_from_words(0, words, page_width=612.0, page_height=792.0)
+
+ assert len(page.blocks) == 3
+ texts = [b.lines[0].text for b in page.blocks]
+ assert texts == ["first", "second", "third"]
+
+
+# ---------------------------------------------------------------------------
+# 5. Multi-column layout
+# ---------------------------------------------------------------------------
+
+
+def test_multi_column_layout():
+ """Three columns at the same Y range produce words interleaved by Y row.
+
+ This is the key scenario: words from different columns that share the
+ same vertical position should be grouped into the same line, ordered
+ left-to-right.
+ """
+ # Row 1 (y=100..112): three columns
+ words = [
+ _make_word_tuple("C1R1", 10.0, 100.0, 60.0, 112.0),
+ _make_word_tuple("C2R1", 210.0, 100.0, 260.0, 112.0),
+ _make_word_tuple("C3R1", 410.0, 100.0, 460.0, 112.0),
+ # Row 2 (y=130..142): three columns
+ _make_word_tuple("C1R2", 10.0, 130.0, 60.0, 142.0),
+ _make_word_tuple("C2R2", 210.0, 130.0, 260.0, 142.0),
+ _make_word_tuple("C3R2", 410.0, 130.0, 460.0, 142.0),
+ ]
+ page = build_page_structure_from_words(0, words, page_width=612.0, page_height=792.0)
+
+ assert len(page.blocks) == 2
+ assert page.blocks[0].lines[0].text == "C1R1 C2R1 C3R1"
+ assert page.blocks[1].lines[0].text == "C1R2 C2R2 C3R2"
+
+
+# ---------------------------------------------------------------------------
+# 6. Mixed font sizes (adaptive tolerance)
+# ---------------------------------------------------------------------------
+
+
+def test_mixed_font_sizes():
+ """Words with different heights at similar Y are grouped using adaptive tolerance.
+
+ Tolerance is min(min_height, word_height) * 0.5. Words that are close
+ enough vertically should be merged into one line.
+ """
+ # Two words with different heights whose Y midpoints are close together.
+ # Word A: height 12, midpoint = 106
+ # Word B: height 20, midpoint = 110
+ # min_height = 12, tolerance = 12 * 0.5 = 6.0
+ # |106 - 110| = 4.0 < 6.0 => same line
+ words = [
+ _make_word_tuple("small", 10.0, 100.0, 60.0, 112.0), # height=12, mid=106
+ _make_word_tuple("big", 70.0, 100.0, 140.0, 120.0), # height=20, mid=110
+ ]
+ page = build_page_structure_from_words(0, words, page_width=612.0, page_height=792.0)
+
+ assert len(page.blocks) == 1
+ assert page.blocks[0].lines[0].text == "small big"
+
+
+def test_mixed_font_sizes_separate_lines():
+ """Words whose midpoints differ more than the adaptive tolerance form separate lines."""
+ # Word A: height 10, midpoint = 105
+ # Word B: height 10, midpoint = 120
+ # tolerance = 10 * 0.5 = 5.0
+ # |105 - 120| = 15.0 > 5.0 => different lines
+ words = [
+ _make_word_tuple("line1", 10.0, 100.0, 60.0, 110.0), # mid=105
+ _make_word_tuple("line2", 10.0, 115.0, 60.0, 125.0), # mid=120
+ ]
+ page = build_page_structure_from_words(0, words, page_width=612.0, page_height=792.0)
+
+ assert len(page.blocks) == 2
+
+
+# ---------------------------------------------------------------------------
+# 7. Text normalization
+# ---------------------------------------------------------------------------
+
+
+def test_text_normalization_applied():
+ """Whitespace collapsing, ligature normalization, and edge stripping all work."""
+ config = StructureExtractionConfig(
+ collapse_whitespace=True,
+ strip_line_edges=True,
+ normalize_ligatures=True,
+ )
+ # "\ufb01" is the fi ligature
+ words = [
+ _make_word_tuple(" hello ", 10.0, 100.0, 60.0, 112.0),
+ _make_word_tuple("\ufb01nd", 70.0, 100.0, 120.0, 112.0),
+ ]
+ page = build_page_structure_from_words(
+ 0, words, config=config, page_width=612.0, page_height=792.0
+ )
+
+ assert len(page.blocks) == 1
+ assert page.blocks[0].lines[0].text == "hello find"
+
+
+# ---------------------------------------------------------------------------
+# 8. Config hash includes spatial_word_sorting
+# ---------------------------------------------------------------------------
+
+
+def test_config_hash_includes_spatial():
+ """Two configs differing only in spatial_word_sorting hash differently."""
+ c1 = StructureExtractionConfig(spatial_word_sorting=False)
+ c2 = StructureExtractionConfig(spatial_word_sorting=True)
+ assert hash(c1) != hash(c2)
+
+
+def test_config_hash_same_when_equal():
+ """Configs with identical settings hash the same."""
+ c1 = StructureExtractionConfig(spatial_word_sorting=True)
+ c2 = StructureExtractionConfig(spatial_word_sorting=True)
+ assert hash(c1) == hash(c2)
+
+
+# ---------------------------------------------------------------------------
+# 9. Page dimensions from explicit args
+# ---------------------------------------------------------------------------
+
+
+def test_page_dimensions_from_args():
+ """page_width and page_height params are used directly."""
+ page = build_page_structure_from_words(
+ 0, [], page_width=500.0, page_height=700.0
+ )
+ assert page.width == 500.0
+ assert page.height == 700.0
+
+
+# ---------------------------------------------------------------------------
+# 10. Page dimensions from image_shape + dpi
+# ---------------------------------------------------------------------------
+
+
+def test_page_dimensions_from_image_shape():
+ """When page_width=0, falls back to image_shape + dpi calculation."""
+ # image_shape: (height_px, width_px, channels)
+ # width = 720 * 72 / 72 = 720.0
+ # height = 1080 * 72 / 72 = 1080.0
+ page = build_page_structure_from_words(
+ 0,
+ [],
+ page_width=0.0,
+ page_height=0.0,
+ dpi=72,
+ image_shape=(1080, 720, 3),
+ )
+ assert page.width == 720.0
+ assert page.height == 1080.0
+
+
+def test_page_dimensions_from_image_shape_with_higher_dpi():
+ """Verify the DPI scaling formula: page_pt = px * 72 / dpi."""
+ # 1440px wide at 144 DPI => 1440 * 72 / 144 = 720 points
+ page = build_page_structure_from_words(
+ 0,
+ [],
+ page_width=0.0,
+ page_height=0.0,
+ dpi=144,
+ image_shape=(2160, 1440, 3),
+ )
+ assert page.width == 720.0
+ assert page.height == 1080.0
+
+
+# ---------------------------------------------------------------------------
+# 11. Drop empty lines
+# ---------------------------------------------------------------------------
+
+
+def test_drop_empty_lines():
+ """Empty words after normalization are dropped when drop_empty_lines=True."""
+ config = StructureExtractionConfig(drop_empty_lines=True, strip_line_edges=True)
+ words = [
+ _make_word_tuple(" ", 10.0, 100.0, 60.0, 112.0), # becomes empty after strip
+ _make_word_tuple("real", 10.0, 200.0, 60.0, 212.0),
+ ]
+ page = build_page_structure_from_words(
+ 0, words, config=config, page_width=612.0, page_height=792.0
+ )
+
+ assert len(page.blocks) == 1
+ assert page.blocks[0].lines[0].text == "real"
+
+
+def test_keep_empty_lines_when_disabled():
+ """When drop_empty_lines=False, whitespace-only words still produce lines."""
+ config = StructureExtractionConfig(
+ drop_empty_lines=False,
+ collapse_whitespace=False,
+ strip_line_edges=False,
+ )
+ words = [
+ _make_word_tuple(" ", 10.0, 100.0, 60.0, 112.0),
+ _make_word_tuple("real", 10.0, 200.0, 60.0, 212.0),
+ ]
+ page = build_page_structure_from_words(
+ 0, words, config=config, page_width=612.0, page_height=792.0
+ )
+
+ assert len(page.blocks) == 2
+
+
+# ---------------------------------------------------------------------------
+# 12. Bbox is union of word bboxes
+# ---------------------------------------------------------------------------
+
+
+def test_bbox_is_union_of_word_bboxes():
+ """Line bbox is the union of all word bboxes in that line."""
+ words = [
+ _make_word_tuple("left", 10.0, 100.0, 50.0, 112.0),
+ _make_word_tuple("right", 200.0, 98.0, 260.0, 115.0),
+ ]
+ page = build_page_structure_from_words(
+ 0, words, config=StructureExtractionConfig(round_precision=None),
+ page_width=612.0, page_height=792.0,
+ )
+
+ assert len(page.blocks) == 1
+ bbox = page.blocks[0].lines[0].bbox
+ # x0 = min(10.0, 200.0) = 10.0
+ assert bbox[0] == 10.0
+ # y0 = min(100.0, 98.0) = 98.0
+ assert bbox[1] == 98.0
+ # x1 = max(50.0, 260.0) = 260.0
+ assert bbox[2] == 260.0
+ # y1 = max(112.0, 115.0) = 115.0
+ assert bbox[3] == 115.0
+
+
+# ---------------------------------------------------------------------------
+# 13. Round precision applied
+# ---------------------------------------------------------------------------
+
+
+def test_round_precision_applied():
+ """Bboxes are rounded per config.round_precision."""
+ words = [
+ _make_word_tuple("word", 10.12345, 100.6789, 50.99999, 112.11111),
+ ]
+ config = StructureExtractionConfig(round_precision=2)
+ page = build_page_structure_from_words(
+ 0, words, config=config, page_width=612.0, page_height=792.0,
+ )
+
+ bbox = page.blocks[0].lines[0].bbox
+ assert bbox == (10.12, 100.68, 51.0, 112.11)
+
+
+def test_round_precision_none_no_rounding():
+ """When round_precision is None, coordinates are not rounded."""
+ words = [
+ _make_word_tuple("word", 10.12345, 100.6789, 50.99999, 112.11111),
+ ]
+ config = StructureExtractionConfig(round_precision=None)
+ page = build_page_structure_from_words(
+ 0, words, config=config, page_width=612.0, page_height=792.0,
+ )
+
+ bbox = page.blocks[0].lines[0].bbox
+ assert bbox == (10.12345, 100.6789, 50.99999, 112.11111)
+
+
+# ---------------------------------------------------------------------------
+# 14. Words sorted left to right within a line
+# ---------------------------------------------------------------------------
+
+
+def test_words_sorted_left_to_right_within_line():
+ """Even if words are added out of order, they come out sorted by x0."""
+ words = [
+ _make_word_tuple("C", 200.0, 100.0, 220.0, 112.0),
+ _make_word_tuple("A", 10.0, 100.0, 30.0, 112.0),
+ _make_word_tuple("B", 100.0, 100.0, 120.0, 112.0),
+ ]
+ page = build_page_structure_from_words(0, words, page_width=612.0, page_height=792.0)
+
+ assert len(page.blocks) == 1
+ assert page.blocks[0].lines[0].text == "A B C"
+
+
+# ---------------------------------------------------------------------------
+# 15. Spatial vs block: same simple text
+# ---------------------------------------------------------------------------
+
+
+def test_spatial_vs_block_same_simple_text():
+ """For a simple single-column document, build_page_structure and
+ build_page_structure_from_words produce the same word sequence when flattened.
+ """
+ # Simulate a simple PDF dict for build_page_structure
+ pdf_dict = {
+ "width": 612.0,
+ "height": 792.0,
+ "blocks": [
+ {
+ "type": 0,
+ "bbox": (10.0, 100.0, 200.0, 145.0),
+ "lines": [
+ {
+ "bbox": (10.0, 100.0, 200.0, 112.0),
+ "spans": [
+ {"text": "hello world", "font": "Arial", "size": 12.0}
+ ],
+ },
+ {
+ "bbox": (10.0, 130.0, 200.0, 142.0),
+ "spans": [
+ {"text": "foo bar", "font": "Arial", "size": 12.0}
+ ],
+ },
+ ],
+ }
+ ],
+ }
+
+ # Simulate equivalent word tuples for build_page_structure_from_words
+ word_tuples = [
+ _make_word_tuple("hello", 10.0, 100.0, 50.0, 112.0, 0, 0, 0),
+ _make_word_tuple("world", 55.0, 100.0, 100.0, 112.0, 0, 0, 1),
+ _make_word_tuple("foo", 10.0, 130.0, 40.0, 142.0, 0, 1, 0),
+ _make_word_tuple("bar", 45.0, 130.0, 80.0, 142.0, 0, 1, 1),
+ ]
+
+ config = StructureExtractionConfig()
+ page_block = build_page_structure(0, pdf_dict, config=config)
+ page_spatial = build_page_structure_from_words(
+ 0, word_tuples, config=config, page_width=612.0, page_height=792.0,
+ )
+
+ # Extract words from both
+ def _extract_words(page):
+ words = []
+ for block in page.blocks:
+ for line in block.lines:
+ words.extend(line.text.split())
+ return words
+
+ block_words = _extract_words(page_block)
+ spatial_words = _extract_words(page_spatial)
+ assert block_words == spatial_words
+
+
+# ---------------------------------------------------------------------------
+# 16. Integration with flatten_document_words
+# ---------------------------------------------------------------------------
+
+
+def test_integration_with_flatten_document_words():
+ """Build a DocumentStructure from spatial pages and verify flatten_document_words works."""
+ words_page1 = [
+ _make_word_tuple("page", 10.0, 100.0, 50.0, 112.0),
+ _make_word_tuple("one", 55.0, 100.0, 90.0, 112.0),
+ ]
+ words_page2 = [
+ _make_word_tuple("page", 10.0, 100.0, 50.0, 112.0),
+ _make_word_tuple("two", 55.0, 100.0, 90.0, 112.0),
+ ]
+
+ config = StructureExtractionConfig()
+ page1 = build_page_structure_from_words(
+ 0, words_page1, config=config, page_width=612.0, page_height=792.0,
+ )
+ page2 = build_page_structure_from_words(
+ 1, words_page2, config=config, page_width=612.0, page_height=792.0,
+ )
+
+ doc = DocumentStructure(pages=[page1, page2], config=config)
+
+ flat_words, tokens = flatten_document_words(doc)
+ assert flat_words == ["page", "one", "page", "two"]
+ assert len(tokens) == 4
+ assert tokens[0].source_page == 0
+ assert tokens[2].source_page == 1
+ assert tokens[0].word_index == 0
+ assert tokens[3].word_index == 3
+
+
+# ---------------------------------------------------------------------------
+# 17. Character replacements applied
+# ---------------------------------------------------------------------------
+
+
+def test_character_replacements_applied():
+ """Character replacements are applied to word text during normalization."""
+ config = StructureExtractionConfig(
+ character_replacements={"\u00A0": " ", "\u2013": "-"},
+ )
+ # Non-breaking space within a word, en-dash in another
+ words = [
+ _make_word_tuple("hello\u00A0world", 10.0, 100.0, 100.0, 112.0),
+ _make_word_tuple("2020\u20132021", 110.0, 100.0, 200.0, 112.0),
+ ]
+ page = build_page_structure_from_words(
+ 0, words, config=config, page_width=612.0, page_height=792.0,
+ )
+
+ assert len(page.blocks) == 1
+ line_text = page.blocks[0].lines[0].text
+ # The NBSP is mapped to a regular space and the en-dash to a hyphen.
+ # How the space introduced inside "hello world" interacts with word joining
+ # is not pinned here, so only the replaced characters are asserted.
+ assert "\u00A0" not in line_text
+ assert "\u2013" not in line_text
+ assert "2020-2021" in line_text
+
+
+# ---------------------------------------------------------------------------
+# Additional edge cases
+# ---------------------------------------------------------------------------
+
+
+def test_block_index_and_line_index_increment():
+ """Block index and global line index are sequential."""
+ words = [
+ _make_word_tuple("line1", 10.0, 100.0, 60.0, 112.0),
+ _make_word_tuple("line2", 10.0, 200.0, 60.0, 212.0),
+ _make_word_tuple("line3", 10.0, 300.0, 60.0, 312.0),
+ ]
+ page = build_page_structure_from_words(0, words, page_width=612.0, page_height=792.0)
+
+ assert len(page.blocks) == 3
+ for i, block in enumerate(page.blocks):
+ assert block.index == i
+ assert block.lines[0].index == i
+
+
+def test_page_number_is_preserved():
+ """The page_number argument is stored in the result."""
+ page = build_page_structure_from_words(42, [], page_width=612.0, page_height=792.0)
+ assert page.page_number == 42
+
+
+def test_block_bbox_equals_line_bbox():
+ """Since each block has exactly one line, the block bbox should match the line bbox."""
+ words = [
+ _make_word_tuple("hello", 10.0, 100.0, 50.0, 112.0),
+ _make_word_tuple("world", 55.0, 100.0, 100.0, 112.0),
+ ]
+ page = build_page_structure_from_words(0, words, page_width=612.0, page_height=792.0)
+
+ assert len(page.blocks) == 1
+ assert page.blocks[0].bbox == page.blocks[0].lines[0].bbox
+
+
+def test_line_count_property():
+ """PageStructure.line_count aggregates across all blocks."""
+ words = [
+ _make_word_tuple("a", 10.0, 100.0, 30.0, 112.0),
+ _make_word_tuple("b", 10.0, 200.0, 30.0, 212.0),
+ _make_word_tuple("c", 10.0, 300.0, 30.0, 312.0),
+ ]
+ page = build_page_structure_from_words(0, words, page_width=612.0, page_height=792.0)
+
+ assert page.line_count == 3
+
+
+def test_spans_contain_full_line_text():
+ """Each line has exactly one span whose text matches the line text."""
+ words = [
+ _make_word_tuple("alpha", 10.0, 100.0, 60.0, 112.0),
+ _make_word_tuple("beta", 70.0, 100.0, 120.0, 112.0),
+ ]
+ page = build_page_structure_from_words(0, words, page_width=612.0, page_height=792.0)
+
+ line = page.blocks[0].lines[0]
+ assert len(line.spans) == 1
+ assert line.spans[0].text == line.text
+ assert line.spans[0].font is None
+ assert line.spans[0].size == 0.0
+
+
+def test_fonts_set_is_empty():
+ """Spatial word extraction does not have font info, so fonts set is empty."""
+ words = [_make_word_tuple("test", 10.0, 100.0, 50.0, 112.0)]
+ page = build_page_structure_from_words(0, words, page_width=612.0, page_height=792.0)
+
+ assert page.blocks[0].lines[0].fonts == set()
+
+
+def test_whitespace_replacement_used():
+ """The whitespace_replacement from config is used to join words."""
+ config = StructureExtractionConfig(whitespace_replacement="|")
+ words = [
+ _make_word_tuple("a", 10.0, 100.0, 30.0, 112.0),
+ _make_word_tuple("b", 40.0, 100.0, 60.0, 112.0),
+ ]
+ page = build_page_structure_from_words(
+ 0, words, config=config, page_width=612.0, page_height=792.0,
+ )
+
+ assert page.blocks[0].lines[0].text == "a|b"
+
+
+def test_default_config_used_when_none():
+ """When config is None, a default StructureExtractionConfig is used."""
+ words = [_make_word_tuple("hello", 10.0, 100.0, 50.0, 112.0)]
+ page = build_page_structure_from_words(0, words, config=None, page_width=612.0, page_height=792.0)
+
+ assert len(page.blocks) == 1
+ assert page.blocks[0].lines[0].text == "hello"
diff --git a/utest/test_structure_report.py b/utest/test_structure_report.py
new file mode 100644
index 0000000..0b8ef8e
--- /dev/null
+++ b/utest/test_structure_report.py
@@ -0,0 +1,875 @@
+"""Comprehensive unit tests for DocTest.StructureReportBuilder (ADR-003).
+
+Tests cover:
+ - Passing results returning empty strings
+ - Single difference types (missing, extra, mismatch, geometry)
+ - Hunk grouping (adjacent, separated, merge boundary)
+ - Context lines with/without reference_texts
+ - Summary statistics
+ - Document-level and word-level differences
+ - Text truncation
+ - HTML escaping (XSS safety)
+ - Large results with hunk collapse
+ - Metadata rendering
+ - Plain-text report structure
+ - Internal helpers (_classify_diff_type, _group_into_hunks, _escape, _truncate)
+"""
+
+import pytest
+
+from DocTest.PdfStructureComparator import (
+ DocumentTextDifference,
+ DocumentWordDifference,
+ LineDifference,
+ StructureComparisonResult,
+)
+from DocTest.StructureReportBuilder import (
+ MAX_HUNKS_BEFORE_COLLAPSE,
+ MAX_TEXT_DISPLAY_LENGTH,
+ ReportMetadata,
+ ReportSummary,
+ _classify_diff_type,
+ _collect_all_diffs,
+ _compute_summary,
+ _escape,
+ _group_into_hunks,
+ _truncate,
+ build_structure_report,
+ build_structure_report_plain_text,
+)
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+
+def _make_passing_result():
+ """Return a StructureComparisonResult with passed=True."""
+ return StructureComparisonResult()
+
+
+def _make_result_with_page_diffs(diffs, page=1):
+ """Return a StructureComparisonResult holding the given LineDifferences; ``page`` is unused."""
+ result = StructureComparisonResult()
+ for d in diffs:
+ result.add_difference(d)
+ return result
+
+
+def _make_line_diff(diff_type, *, page=1, ref_text=None, cand_text=None,
+ deltas=None, reference_index=None, candidate_index=None,
+ message=None):
+ """Convenience factory for LineDifference."""
+ if message is None:
+ message = f"Synthetic {diff_type}"
+ return LineDifference(
+ page=page,
+ diff_type=diff_type,
+ message=message,
+ ref_text=ref_text,
+ cand_text=cand_text,
+ deltas=deltas,
+ reference_index=reference_index,
+ candidate_index=candidate_index,
+ )
+
+
+# ===========================================================================
+# 1 & 2 - Passing result returns empty string
+# ===========================================================================
+
+
+class TestPassingResult:
+
+ def test_html_report_empty_for_passing_result(self):
+ result = _make_passing_result()
+ assert result.passed is True
+ html = build_structure_report(result)
+ assert html == ""
+
+ def test_plain_report_empty_for_passing_result(self):
+ result = _make_passing_result()
+ plain = build_structure_report_plain_text(result)
+ assert plain == ""
+
+
+# ===========================================================================
+# 3-6 - Single differences
+# ===========================================================================
+
+
+class TestSingleDifferences:
+
+ def test_html_report_single_missing_line(self):
+ diff = _make_line_diff(
+ "missing_line",
+ ref_text="vanished line",
+ reference_index=0,
+ )
+ result = _make_result_with_page_diffs([diff])
+ html = build_structure_report(result)
+
+ assert "#f8d7da" in html, "Missing line should use red background #f8d7da"
+ assert "-" in html, "Missing line should display '-' symbol"
+ assert "vanished line" in html
+
+ def test_html_report_single_extra_line(self):
+ diff = _make_line_diff(
+ "extra_line",
+ cand_text="new line appeared",
+ candidate_index=0,
+ )
+ result = _make_result_with_page_diffs([diff])
+ html = build_structure_report(result)
+
+ assert "#d4edda" in html, "Extra line should use green background #d4edda"
+ assert "+" in html, "Extra line should display '+' symbol"
+ assert "new line appeared" in html
+
+ def test_html_report_single_text_mismatch(self):
+ diff = _make_line_diff(
+ "text_mismatch",
+ ref_text="foo",
+ cand_text="bar",
+ reference_index=0,
+ candidate_index=0,
+ )
+ result = _make_result_with_page_diffs([diff])
+ html = build_structure_report(result)
+
+ assert "#fff3cd" in html, "Text mismatch should use yellow background #fff3cd"
+ assert "ref:" in html, "Text mismatch should show 'ref:' label"
+ assert "cand:" in html, "Text mismatch should show 'cand:' label"
+ assert "foo" in html
+ assert "bar" in html
+
+ def test_html_report_single_geometry_mismatch(self):
+ diff = _make_line_diff(
+ "geometry_mismatch",
+ ref_text="shifted text",
+ deltas={"left": 5.0},
+ reference_index=0,
+ )
+ result = _make_result_with_page_diffs([diff])
+ html = build_structure_report(result)
+
+ assert "#e2e3e5" in html, "Geometry mismatch should use grey background #e2e3e5"
+ # The delta symbol U+0394
+ assert "\u0394" in html or "Δ" in html or "Δ" in html, \
+ "Geometry mismatch should display delta symbol"
+
+
+# ===========================================================================
+# 7-9 - Grouping / Hunks
+# ===========================================================================
+
+
+class TestHunkGrouping:
+
+ def test_adjacent_diffs_grouped_into_one_hunk(self):
+ """5 consecutive LineDifferences at indices 10-14 produce 1 hunk."""
+ diffs = [
+ _make_line_diff("missing_line", ref_text=f"line {i}",
+ reference_index=i)
+ for i in range(10, 15)
+ ]
+ result = _make_result_with_page_diffs(diffs)
+ html = build_structure_report(result)
+
+ assert "Hunk 1" in html
+ assert "Hunk 2" not in html
+
+ def test_separated_diffs_produce_separate_hunks(self):
+ """Diffs at indices 5 and 50 produce two separate hunks."""
+ diff_a = _make_line_diff("missing_line", ref_text="early",
+ reference_index=5)
+ diff_b = _make_line_diff("extra_line", cand_text="late",
+ candidate_index=50)
+ result = _make_result_with_page_diffs([diff_a, diff_b])
+ html = build_structure_report(result)
+
+ assert "Hunk 1" in html
+ assert "Hunk 2" in html
+
+ def test_gap_at_merge_boundary(self):
+ """context_lines=3: merge_threshold = 2*3+1 = 7.
+
+ Diffs at index 10 and 17 (gap=7) -> merged into 1 hunk.
+ Diffs at index 10 and 18 (gap=8) -> 2 separate hunks.
+ """
+ # Gap = 7 => 1 hunk
+ d1 = _make_line_diff("missing_line", ref_text="a", reference_index=10)
+ d2 = _make_line_diff("missing_line", ref_text="b", reference_index=17)
+ result_merged = _make_result_with_page_diffs([d1, d2])
+ html_merged = build_structure_report(result_merged, context_lines=3)
+ assert "Hunk 1" in html_merged
+ assert "Hunk 2" not in html_merged
+
+ # Gap = 8 => 2 hunks
+ d3 = _make_line_diff("missing_line", ref_text="a", reference_index=10)
+ d4 = _make_line_diff("missing_line", ref_text="b", reference_index=18)
+ result_split = _make_result_with_page_diffs([d3, d4])
+ html_split = build_structure_report(result_split, context_lines=3)
+ assert "Hunk 1" in html_split
+ assert "Hunk 2" in html_split
+
+
+# ===========================================================================
+# 10-11 - Context
+# ===========================================================================
+
+
+class TestContext:
+
+ def test_context_shown_when_texts_provided(self):
+ """When reference_texts is provided, context words appear in HTML."""
+ ref_texts = [f"word_{i}" for i in range(20)]
+ diff = _make_line_diff("missing_line", ref_text="word_10",
+ reference_index=10)
+ result = _make_result_with_page_diffs([diff])
+ html = build_structure_report(result, reference_texts=ref_texts,
+ context_lines=3)
+
+ # Context before should include words near index 10
+ assert "word_7" in html or "word_8" in html or "word_9" in html, \
+ "Context before the diff should be visible"
+ # Context after
+ assert "word_11" in html or "word_12" in html or "word_13" in html, \
+ "Context after the diff should be visible"
+
+ def test_no_context_when_texts_not_provided(self):
+ """Without reference_texts, no context divs with '...' appear."""
+ diff = _make_line_diff("missing_line", ref_text="gone",
+ reference_index=10)
+ result = _make_result_with_page_diffs([diff])
+ html = build_structure_report(result, reference_texts=None)
+
+ # No context rows should be rendered. A literal "..." may still come
+ # from truncation, so check for the color:#999 styling instead
+ # (presumably used only by context divs -- confirm against the builder).
+ assert "color:#999" not in html
+
+
+# ===========================================================================
+# 12-13 - Summary statistics
+# ===========================================================================
+
+
+class TestSummaryStatistics:
+
+ def test_summary_counts_correct(self):
+ """Mix of diff types yields correct ReportSummary counts."""
+ result = StructureComparisonResult()
+ result.add_difference(_make_line_diff("missing_line", ref_text="a",
+ reference_index=0))
+ result.add_difference(_make_line_diff("missing_line", ref_text="b",
+ reference_index=1))
+ result.add_difference(_make_line_diff("extra_line", cand_text="c",
+ candidate_index=2))
+ result.add_difference(_make_line_diff("text_mismatch", ref_text="d",
+ cand_text="e",
+ reference_index=3))
+ result.add_difference(_make_line_diff("geometry_mismatch",
+ ref_text="f",
+ deltas={"left": 1.0},
+ reference_index=4))
+
+ summary = _compute_summary(result)
+
+ assert summary.missing_count == 2
+ assert summary.extra_count == 1
+ assert summary.mismatch_count == 1
+ assert summary.geometry_count == 1
+ assert summary.other_count == 0
+ assert summary.total_differences == 5
+
+ def test_summary_includes_word_diffs(self):
+ """Word differences are counted in summary statistics."""
+ result = StructureComparisonResult()
+ result.add_word_difference(DocumentWordDifference(
+ diff_type="missing_words",
+ message="words gone",
+ ref_words=["hello"],
+ ref_start_index=0,
+ ref_end_index=1,
+ ))
+ result.add_word_difference(DocumentWordDifference(
+ diff_type="extra_words",
+ message="words added",
+ cand_words=["world"],
+ cand_start_index=0,
+ cand_end_index=1,
+ ))
+ result.add_word_difference(DocumentWordDifference(
+ diff_type="word_mismatch",
+ message="words changed",
+ ref_words=["old"],
+ cand_words=["new"],
+ ref_start_index=5,
+ ref_end_index=6,
+ cand_start_index=5,
+ cand_end_index=6,
+ ))
+
+ summary = _compute_summary(result)
+
+ assert summary.missing_count == 1
+ assert summary.extra_count == 1
+ assert summary.mismatch_count == 1
+ assert summary.total_differences == 3
+
+
+# ===========================================================================
+# 14-15 - Document-level and word-level
+# ===========================================================================
+
+
+class TestDocumentAndWordLevel:
+
+ def test_document_level_diffs_in_report(self):
+ """DocumentTextDifference items produce 'Document (text-only)' section."""
+ result = StructureComparisonResult()
+ result.add_document_difference(DocumentTextDifference(
+ diff_type="missing_text",
+ message="Text missing: hello",
+ ref_text="hello",
+ ref_index=0,
+ ))
+
+ html = build_structure_report(result)
+ assert "Document (text-only)" in html
+
+ def test_word_level_diffs_in_report(self):
+ """DocumentWordDifference items produce 'Document (word-level)' section."""
+ result = StructureComparisonResult()
+ result.add_word_difference(DocumentWordDifference(
+ diff_type="word_mismatch",
+ message="Word changed",
+ ref_words=["alpha"],
+ cand_words=["beta"],
+ ref_start_index=0,
+ ref_end_index=1,
+ cand_start_index=0,
+ cand_end_index=1,
+ ))
+
+ html = build_structure_report(result)
+ assert "Document (word-level)" in html
+
+
+# ===========================================================================
+# 16 - Truncation
+# ===========================================================================
+
+
+class TestTruncation:
+
+ def test_long_text_truncated(self):
+ """Diff with 500-char ref_text is truncated in HTML output."""
+ long_text = "x" * 500
+ diff = _make_line_diff("missing_line", ref_text=long_text,
+ reference_index=0)
+ result = _make_result_with_page_diffs([diff])
+ html = build_structure_report(result)
+
+ # The full 500-char text should NOT appear in the report
+ assert long_text not in html
+ # Instead, a truncated version ending in "..." should appear
+ assert "..." in html
+ # The rendered text is expected to be the first MAX_TEXT_DISPLAY_LENGTH - 3
+ # chars of the original followed by "..."
+ truncated = long_text[:MAX_TEXT_DISPLAY_LENGTH - 3] + "..."
+ assert _escape(truncated) in html
+
+
+# ===========================================================================
+# 17 - HTML safety
+# ===========================================================================
+
+
+class TestHTMLSafety:
+
+ def test_html_special_chars_escaped(self):
+ """XSS payload in diff text is escaped, not rendered raw."""
+ xss = ""
+ diff = _make_line_diff("missing_line", ref_text=xss,
+ reference_index=0)
+ result = _make_result_with_page_diffs([diff])
+ html = build_structure_report(result)
+
+ assert "