diff --git a/DocTest/DocumentRepresentation.py b/DocTest/DocumentRepresentation.py index b4ab734..9ed97e3 100644 --- a/DocTest/DocumentRepresentation.py +++ b/DocTest/DocumentRepresentation.py @@ -17,6 +17,7 @@ PageStructure, StructureExtractionConfig, build_page_structure, + build_page_structure_from_words, ) from DocTest.config import DEFAULT_DPI, OCR_ENGINE_DEFAULT, DEFAULT_CONFIDENCE, MINIMUM_OCR_RESOLUTION, ADD_PIXELS_TO_IGNORE_AREA, TESSERACT_CONFIG import tempfile @@ -197,13 +198,27 @@ def get_pdf_structure(self, config: Optional[StructureExtractionConfig] = None) cached = self._structure_cache.get(config) if cached: return cached - structure = build_page_structure( - page_number=self.page_number, - pdf_dict=self.pdf_text_dict, - config=config, - dpi=self.dpi, - image_shape=self.image.shape, - ) + if config.spatial_word_sorting and self.pdf_text_words: + # Derive page dimensions from the dict if available. + pw = float(self.pdf_text_dict.get("width", 0)) if self.pdf_text_dict else 0.0 + ph = float(self.pdf_text_dict.get("height", 0)) if self.pdf_text_dict else 0.0 + structure = build_page_structure_from_words( + page_number=self.page_number, + pdf_text_words=self.pdf_text_words, + config=config, + page_width=pw, + page_height=ph, + dpi=self.dpi, + image_shape=self.image.shape, + ) + else: + structure = build_page_structure( + page_number=self.page_number, + pdf_dict=self.pdf_text_dict, + config=config, + dpi=self.dpi, + image_shape=self.image.shape, + ) self._structure_cache[config] = structure return structure diff --git a/DocTest/HeaderFooterDetector.py b/DocTest/HeaderFooterDetector.py new file mode 100644 index 0000000..b31534f --- /dev/null +++ b/DocTest/HeaderFooterDetector.py @@ -0,0 +1,230 @@ +"""Repetition-based header/footer detection for PDF structure comparison. 
+ +Scans configurable vertical regions at the top/bottom of each page, identifies +text lines that repeat across multiple pages (with digit normalization for page +numbers), and removes them from the DocumentStructure before comparison. + +This module is a pure-function domain service with no side effects, no Robot +Framework dependency, and no I/O. +""" + +from __future__ import annotations + +import re +from collections import defaultdict +from dataclasses import dataclass +from typing import Dict, FrozenSet, List, Set + +from DocTest.PdfStructureModels import ( + DocumentStructure, + PageStructure, + TextBlock, + TextLine, +) + +__all__ = [ + "HeaderFooterConfig", + "DetectionResult", + "detect_repeating_headers_footers", + "strip_detected_headers_footers", + "filter_headers_footers", +] + +_DIGIT_RUN_RE = re.compile(r"\d+") + + +@dataclass(frozen=True) +class HeaderFooterConfig: + """Configuration for repetition-based header/footer detection.""" + + header_scan_height: float = 0.0 + footer_scan_height: float = 0.0 + repeat_threshold: int = 2 + + @property + def enabled(self) -> bool: + """Return True if at least one scan region is configured.""" + return self.header_scan_height > 0 or self.footer_scan_height > 0 + + +@dataclass(frozen=True) +class DetectionResult: + """Immutable record of which normalized keys were detected as headers/footers.""" + + header_keys: FrozenSet[str] + footer_keys: FrozenSet[str] + + @property + def has_detections(self) -> bool: + return bool(self.header_keys or self.footer_keys) + + +def _normalize_for_grouping(text: str) -> str: + """Replace all digit runs with '#' so page-number variants group together. 
+ + Examples: + "Page 1 of 5" -> "Page # of #" + "ACME Corp" -> "ACME Corp" (no digits, unchanged) + "- 3 -" -> "- # -" + """ + return _DIGIT_RUN_RE.sub("#", text) + + +def detect_repeating_headers_footers( + structure: DocumentStructure, + config: HeaderFooterConfig, +) -> DetectionResult: + """Scan a DocumentStructure and identify text that repeats across pages + in the header/footer regions. + + Args: + structure: The document to scan. + config: Detection parameters (scan heights and threshold). + + Returns: + A DetectionResult containing the normalized keys of detected + header and footer lines. + """ + if not config.enabled: + return DetectionResult(header_keys=frozenset(), footer_keys=frozenset()) + + header_candidates: Dict[str, Set[int]] = defaultdict(set) + footer_candidates: Dict[str, Set[int]] = defaultdict(set) + + for page in structure.pages: + footer_boundary = page.height - config.footer_scan_height + + for block in page.blocks: + for line in block.lines: + text = line.text or "" + if not text: + continue + key = _normalize_for_grouping(text) + + # Check header region + if config.header_scan_height > 0 and line.bbox[1] < config.header_scan_height: + header_candidates[key].add(page.page_number) + + # Check footer region + if config.footer_scan_height > 0 and line.bbox[3] > footer_boundary: + footer_candidates[key].add(page.page_number) + + detected_header_keys = frozenset( + key for key, pages in header_candidates.items() if len(pages) >= config.repeat_threshold + ) + detected_footer_keys = frozenset( + key for key, pages in footer_candidates.items() if len(pages) >= config.repeat_threshold + ) + + return DetectionResult( + header_keys=detected_header_keys, + footer_keys=detected_footer_keys, + ) + + +def strip_detected_headers_footers( + structure: DocumentStructure, + detection: DetectionResult, + config: HeaderFooterConfig, +) -> DocumentStructure: + """Remove detected header/footer lines from a DocumentStructure. 
+ + Only lines that (a) match a detected normalized key AND (b) fall within + the configured scan region are removed. Body lines with identical text + are preserved. + + Args: + structure: The document to filter. + detection: The detection result from detect_repeating_headers_footers(). + config: The same config used for detection (needed for region bounds). + + Returns: + A new DocumentStructure with header/footer lines removed. + """ + if not detection.has_detections: + return structure + + filtered_pages: List[PageStructure] = [] + + for page in structure.pages: + footer_boundary = page.height - config.footer_scan_height + new_blocks: List[TextBlock] = [] + next_line_index = 0 + + for block in page.blocks: + new_lines: List[TextLine] = [] + + for line in block.lines: + text = line.text or "" + key = _normalize_for_grouping(text) + + # Remove if line is in header region AND matches a detected header key + if ( + config.header_scan_height > 0 + and line.bbox[1] < config.header_scan_height + and key in detection.header_keys + ): + continue + + # Remove if line is in footer region AND matches a detected footer key + if ( + config.footer_scan_height > 0 + and line.bbox[3] > footer_boundary + and key in detection.footer_keys + ): + continue + + new_lines.append( + TextLine( + index=next_line_index, + text=text, + bbox=line.bbox, + fonts=set(line.fonts), + spans=list(line.spans), + ) + ) + next_line_index += 1 + + if new_lines: + new_blocks.append( + TextBlock( + index=block.index, + bbox=block.bbox, + lines=new_lines, + ) + ) + + filtered_pages.append( + PageStructure( + page_number=page.page_number, + width=page.width, + height=page.height, + blocks=new_blocks, + ) + ) + + return DocumentStructure(pages=filtered_pages, config=structure.config) + + +def filter_headers_footers( + structure: DocumentStructure, + config: HeaderFooterConfig, +) -> DocumentStructure: + """Convenience function: detect and strip in one call. 
+ + Equivalent to: + detection = detect_repeating_headers_footers(structure, config) + return strip_detected_headers_footers(structure, detection, config) + + Args: + structure: The document to process. + config: Detection parameters. + + Returns: + A new DocumentStructure with detected repeating headers/footers removed. + If config.enabled is False, returns the input unchanged. + """ + if not config.enabled: + return structure + detection = detect_repeating_headers_footers(structure, config) + return strip_detected_headers_footers(structure, detection, config) diff --git a/DocTest/PdfStructureComparator.py b/DocTest/PdfStructureComparator.py index 54db9cd..1cc3a05 100644 --- a/DocTest/PdfStructureComparator.py +++ b/DocTest/PdfStructureComparator.py @@ -4,16 +4,18 @@ from dataclasses import dataclass, field from typing import Dict, Iterable, List, Optional, Sequence, Tuple -from DocTest.PdfStructureModels import DocumentStructure, PageStructure, TextLine +from DocTest.PdfStructureModels import DocumentStructure, PageStructure, TextLine, WordToken __all__ = [ "StructureTolerance", "LineDifference", "DocumentTextDifference", + "DocumentWordDifference", "StructureComparisonResult", "compare_document_structures", "compare_document_text_only", + "compare_document_words", ] @@ -54,6 +56,20 @@ class DocumentTextDifference: cand_index: Optional[int] = None +@dataclass +class DocumentWordDifference: + """Details about word-level content mismatch in page-agnostic comparison.""" + + diff_type: str # "missing_words", "extra_words", "word_mismatch" + message: str + ref_words: Optional[List[str]] = None + cand_words: Optional[List[str]] = None + ref_start_index: Optional[int] = None + ref_end_index: Optional[int] = None + cand_start_index: Optional[int] = None + cand_end_index: Optional[int] = None + + @dataclass class StructureComparisonResult: """Aggregate differences found during structure comparison.""" @@ -61,6 +77,7 @@ class StructureComparisonResult: passed: bool = True 
page_differences: Dict[int, List[LineDifference]] = field(default_factory=dict) document_differences: List[DocumentTextDifference] = field(default_factory=list) + word_differences: List[DocumentWordDifference] = field(default_factory=list) summary: List[str] = field(default_factory=list) def add_difference(self, diff: LineDifference): @@ -72,13 +89,18 @@ def add_document_difference(self, diff: DocumentTextDifference): self.passed = False self.document_differences.append(diff) + def add_word_difference(self, diff: DocumentWordDifference): + """Add a document-level word difference.""" + self.passed = False + self.word_differences.append(diff) + def extend_summary(self, message: str): self.summary.append(message) def difference_count(self) -> int: - """Return total count of all differences (page-level and document-level).""" + """Return total count of all differences (page-level, document-level, and word-level).""" page_diff_count = sum(len(diffs) for diffs in self.page_differences.values()) - return page_diff_count + len(self.document_differences) + return page_diff_count + len(self.document_differences) + len(self.word_differences) def compare_document_structures( @@ -229,6 +251,210 @@ def compare_document_text_only( return result +def _compare_words_unordered( + ref_words: List[str], + ref_originals: List[str], + cand_words: List[str], + cand_originals: List[str], +) -> StructureComparisonResult: + """Compare words using bag-of-words (Counter-based) comparison. + + This mode ignores word order entirely and only checks that both documents + contain the same words with the same frequencies. It is useful when text + reflows across pages cause identical content to appear in different order. + + Excess words in the reference are reported as ``missing_words``, excess + words in the candidate as ``extra_words``. 
+ """ + from collections import Counter + + result = StructureComparisonResult() + + ref_counts = Counter(ref_words) + cand_counts = Counter(cand_words) + + # Words that appear more in reference than candidate (missing from candidate) + ref_excess = ref_counts - cand_counts + # Words that appear more in candidate than reference (extra in candidate) + cand_excess = cand_counts - ref_counts + + # Build original-text lists for reporting by scanning the original arrays + # and picking up excess instances + if ref_excess: + remaining = dict(ref_excess) + excess_originals: List[str] = [] + for norm_word, orig_word in zip(ref_words, ref_originals): + if remaining.get(norm_word, 0) > 0: + excess_originals.append(orig_word) + remaining[norm_word] -= 1 + if excess_originals: + preview = " ".join(excess_originals[:10]) + if len(excess_originals) > 10: + preview += f" ... (+{len(excess_originals) - 10} more)" + result.add_word_difference( + DocumentWordDifference( + diff_type="missing_words", + message=f"Words in reference not found in candidate (unordered): '{_truncate_text(preview, 120)}'", + ref_words=excess_originals, + ref_start_index=0, + ref_end_index=len(excess_originals), + ) + ) + + if cand_excess: + remaining_cand = dict(cand_excess) + cand_excess_originals: List[str] = [] + for norm_word, orig_word in zip(cand_words, cand_originals): + if remaining_cand.get(norm_word, 0) > 0: + cand_excess_originals.append(orig_word) + remaining_cand[norm_word] -= 1 + if cand_excess_originals: + preview = " ".join(cand_excess_originals[:10]) + if len(cand_excess_originals) > 10: + preview += f" ... 
(+{len(cand_excess_originals) - 10} more)" + result.add_word_difference( + DocumentWordDifference( + diff_type="extra_words", + message=f"Extra words in candidate not found in reference (unordered): '{_truncate_text(preview, 120)}'", + cand_words=cand_excess_originals, + cand_start_index=0, + cand_end_index=len(cand_excess_originals), + ) + ) + + return result + + +def compare_document_words( + reference: DocumentStructure, + candidate: DocumentStructure, + *, + case_sensitive: bool = True, + normalize_ligatures: bool = False, + normalize_word_boundaries: bool = False, + compare_order: str = "ordered", +) -> StructureComparisonResult: + """Compare document text at the word level, ignoring line and page boundaries. + + Flattens all text into word tokens and uses SequenceMatcher to detect + insertions, deletions, and replacements at word granularity. Contiguous + diff opcodes of the same type are grouped into single difference records + for cleaner reporting. + + Args: + reference: The reference document structure. + candidate: The candidate document structure to compare. + case_sensitive: Whether word comparison is case-sensitive. + normalize_ligatures: When True, replace known typographic ligatures + with their ASCII equivalents in each word before comparison. + normalize_word_boundaries: When True, merge tokens that were split + across line boundaries by connector characters (``/``, ``-``, ``\\``). + compare_order: Comparison strategy. ``"ordered"`` (default) uses + SequenceMatcher for sequence-sensitive comparison. ``"unordered"`` + uses Counter-based bag-of-words comparison that ignores word order, + useful when text reflows across pages. + + Returns: + A StructureComparisonResult with document-level word differences. 
+ """ + from DocTest.PdfStructureModels import flatten_document_words + + result = StructureComparisonResult() + + ref_words, ref_tokens = flatten_document_words( + reference, + normalize_word_boundaries=normalize_word_boundaries, + normalize_ligatures_in_words=normalize_ligatures, + ) + cand_words, cand_tokens = flatten_document_words( + candidate, + normalize_word_boundaries=normalize_word_boundaries, + normalize_ligatures_in_words=normalize_ligatures, + ) + + # Preserve originals for reporting before potential case normalization + ref_originals = list(ref_words) + cand_originals = list(cand_words) + + if not case_sensitive: + ref_words = [w.lower() for w in ref_words] + cand_words = [w.lower() for w in cand_words] + + if compare_order == "unordered": + return _compare_words_unordered(ref_words, ref_originals, cand_words, cand_originals) + + matcher = difflib.SequenceMatcher(a=ref_words, b=cand_words, autojunk=False) + + for tag, i1, i2, j1, j2 in matcher.get_opcodes(): + if tag == "equal": + continue + + ref_slice = ref_originals[i1:i2] if i1 < i2 else None + cand_slice = cand_originals[j1:j2] if j1 < j2 else None + + if tag == "replace": + ref_preview = " ".join(ref_slice) if ref_slice else "" + cand_preview = " ".join(cand_slice) if cand_slice else "" + message = ( + f"Word mismatch at positions {i1}-{i2 - 1}: " + f"reference='{_truncate_text(ref_preview, 80)}', " + f"candidate='{_truncate_text(cand_preview, 80)}'" + ) + result.add_word_difference( + DocumentWordDifference( + diff_type="word_mismatch", + message=message, + ref_words=ref_slice, + cand_words=cand_slice, + ref_start_index=i1, + ref_end_index=i2, + cand_start_index=j1, + cand_end_index=j2, + ) + ) + + elif tag == "delete": + ref_preview = " ".join(ref_slice) if ref_slice else "" + message = ( + f"Words missing in candidate at positions {i1}-{i2 - 1}: " + f"'{_truncate_text(ref_preview, 80)}'" + ) + result.add_word_difference( + DocumentWordDifference( + diff_type="missing_words", + 
message=message, + ref_words=ref_slice, + ref_start_index=i1, + ref_end_index=i2, + ) + ) + + elif tag == "insert": + cand_preview = " ".join(cand_slice) if cand_slice else "" + message = ( + f"Extra words in candidate at positions {j1}-{j2 - 1}: " + f"'{_truncate_text(cand_preview, 80)}'" + ) + result.add_word_difference( + DocumentWordDifference( + diff_type="extra_words", + message=message, + cand_words=cand_slice, + cand_start_index=j1, + cand_end_index=j2, + ) + ) + + return result + + +def _truncate_text(text: str, max_length: int) -> str: + """Truncate text with ellipsis if it exceeds max_length.""" + if len(text) <= max_length: + return text + return text[: max_length - 3] + "..." + + def _compare_page( ref_page: PageStructure, cand_page: PageStructure, diff --git a/DocTest/PdfStructureModels.py b/DocTest/PdfStructureModels.py index 8bfa6ef..153ae61 100644 --- a/DocTest/PdfStructureModels.py +++ b/DocTest/PdfStructureModels.py @@ -13,11 +13,14 @@ "PageStructure", "DocumentStructure", "StructureExtractionConfig", + "WordToken", "strip_font_subset", "collapse_whitespace", "round_bbox", "build_page_structure", + "build_page_structure_from_words", "flatten_document_text", + "flatten_document_words", ] @@ -80,6 +83,7 @@ class StructureExtractionConfig: round_precision: Optional[int] = 3 normalize_ligatures: bool = False character_replacements: Optional[Dict[str, str]] = None + spatial_word_sorting: bool = False def __hash__(self) -> int: # Allow usage as dictionary key for caching. # Convert character_replacements dict to a hashable tuple of sorted items @@ -98,6 +102,7 @@ def __hash__(self) -> int: # Allow usage as dictionary key for caching. 
self.round_precision, self.normalize_ligatures, replacements_hash, + self.spatial_word_sorting, ) ) @@ -114,6 +119,15 @@ def page_count(self) -> int: return len(self.pages) +@dataclass(frozen=True) +class WordToken: + """A single word token extracted from a document, with provenance metadata.""" + text: str + source_page: int + source_line_index: int + word_index: int + + def flatten_document_text(structure: DocumentStructure) -> List[str]: """Extract all text lines from a document in reading order, ignoring page boundaries. @@ -137,6 +151,77 @@ def flatten_document_text(structure: DocumentStructure) -> List[str]: return texts +def flatten_document_words( + structure: DocumentStructure, + *, + normalize_word_boundaries: bool = False, + normalize_ligatures_in_words: bool = False, +) -> Tuple[List[str], List[WordToken]]: + """Extract all words from a document in reading order, ignoring page/line boundaries. + + Splits every text line on whitespace to produce individual word tokens. + This enables comparison at word granularity, making the comparison resilient + to text reflow caused by font or layout changes. + + Args: + structure: A DocumentStructure containing pages with text blocks and lines. + normalize_word_boundaries: When True, merge tokens that were split + across line boundaries by connector characters (``/``, ``-``, ``\\``). + normalize_ligatures_in_words: When True, replace known typographic + ligatures with their ASCII equivalents in each word. + + Returns: + A tuple of: + - words: Flat list of word strings for use with SequenceMatcher. + - tokens: Corresponding list of WordToken objects with provenance. 
+ """ + words: List[str] = [] + tokens: List[WordToken] = [] + global_line_index = 0 + word_index = 0 + + for page in structure.pages: + for block in page.blocks: + for line in block.lines: + if not line.text: + global_line_index += 1 + continue + line_words = line.text.split() + for w in line_words: + words.append(w) + tokens.append( + WordToken( + text=w, + source_page=page.page_number, + source_line_index=global_line_index, + word_index=word_index, + ) + ) + word_index += 1 + global_line_index += 1 + + # Apply ligature normalization to individual words if requested + if normalize_ligatures_in_words: + from DocTest.TextNormalization import normalize_ligatures + words = [normalize_ligatures(w) for w in words] + tokens = [ + WordToken( + text=normalize_ligatures(t.text), + source_page=t.source_page, + source_line_index=t.source_line_index, + word_index=t.word_index, + ) + for t in tokens + ] + + # Merge words split across line boundaries + if normalize_word_boundaries: + from DocTest.TextNormalization import merge_split_words + words, tokens = merge_split_words(words, tokens) + + return words, tokens + + def strip_font_subset(font_name: Optional[str]) -> Optional[str]: """Drop random subset prefixes inserted by PDF generators.""" @@ -282,3 +367,151 @@ def build_page_structure( height=height, blocks=blocks, ) + + +def build_page_structure_from_words( + page_number: int, + pdf_text_words: Optional[List], + config: Optional[StructureExtractionConfig] = None, + *, + page_width: float = 0.0, + page_height: float = 0.0, + dpi: Optional[int] = None, + image_shape: Optional[Tuple[int, int, int]] = None, +) -> PageStructure: + """Build a ``PageStructure`` from PyMuPDF ``get_text("words")`` output. + + This bypasses block-level extraction entirely, grouping individual word + bounding boxes into lines using adaptive Y-proximity. The result is + immune to block fragmentation caused by different PDF generators. 
+ + Each word tuple from PyMuPDF has the form:: + + (x0, y0, x1, y1, "word", block_no, line_no, word_no) + + Words are grouped into lines when their vertical midpoints are within + half the minimum word height of each other. Within each line, words + are ordered left-to-right by ``x0``. Lines are ordered top-to-bottom. + Each line becomes its own ``TextBlock`` (single-line blocks). + + Args: + page_number: Zero-based page index. + pdf_text_words: List of word tuples from ``page.get_text("words")``. + config: Normalization settings (whitespace, ligatures, etc.). + page_width: Page width in points. + page_height: Page height in points. + dpi: Optional DPI for computing page dimensions from ``image_shape``. + image_shape: ``(height, width, channels)`` array shape, used with *dpi* + to derive page dimensions when ``page_width``/``page_height`` are zero. + + Returns: + A ``PageStructure`` with one block per reconstructed text line. + """ + config = config or StructureExtractionConfig() + + width = page_width + height = page_height + if (width == 0.0 or height == 0.0) and image_shape and dpi: + px_height, px_width = image_shape[:2] + width = px_width * 72.0 / dpi + height = px_height * 72.0 / dpi + + if not pdf_text_words: + return PageStructure( + page_number=page_number, + width=width, + height=height, + blocks=[], + ) + + # --- Group words into visual lines by Y-proximity --- + # Sort by vertical midpoint first, then horizontal position. 
+ sorted_words = sorted(pdf_text_words, key=lambda w: ((w[1] + w[3]) / 2.0, w[0])) + + lines: List[List] = [] # Each element: list of word tuples + line_y_mid: List[float] = [] # Representative Y midpoint per line + line_min_height: List[float] = [] # Cached minimum word height per line + + for word in sorted_words: + w_y0, w_y1 = float(word[1]), float(word[3]) + w_mid = (w_y0 + w_y1) / 2.0 + w_height = max(w_y1 - w_y0, 1.0) + + # Search backward from most recent line (words are Y-sorted, so the + # most recent line is the most likely match). Break early once we + # move past the tolerance range. + merged = False + max_possible_tolerance = w_height * 0.5 + for idx in range(len(lines) - 1, -1, -1): + ly_mid = line_y_mid[idx] + delta = abs(w_mid - ly_mid) + if delta > max_possible_tolerance and w_mid > ly_mid: + break # Past tolerance; earlier lines are even further away. + tolerance = min(line_min_height[idx], w_height) * 0.5 + if delta <= tolerance: + lines[idx].append(word) + n = len(lines[idx]) + line_y_mid[idx] = ly_mid + (w_mid - ly_mid) / n + if w_height < line_min_height[idx]: + line_min_height[idx] = w_height + merged = True + break + if not merged: + lines.append([word]) + line_y_mid.append(w_mid) + line_min_height.append(w_height) + + # Sort lines top-to-bottom by midpoint, words left-to-right within each. + indexed_lines = sorted(enumerate(lines), key=lambda pair: line_y_mid[pair[0]]) + + blocks: List[TextBlock] = [] + global_line_index = 0 + block_index = 0 + + for _orig_idx, line_words in indexed_lines: + line_words_sorted = sorted(line_words, key=lambda w: float(w[0])) + + # Build text from words, applying normalization. 
+ text_parts: List[str] = [] + for w in line_words_sorted: + raw = str(w[4]) + normalized = _sanitize_span_text(raw, config) + if normalized: + text_parts.append(normalized) + + line_text = config.whitespace_replacement.join(text_parts) if text_parts else "" + if config.strip_line_edges: + line_text = line_text.strip() + if config.drop_empty_lines and not line_text: + continue + + # Compute line bbox as union of all word bboxes. + x0 = min(float(w[0]) for w in line_words_sorted) + y0 = min(float(w[1]) for w in line_words_sorted) + x1 = max(float(w[2]) for w in line_words_sorted) + y1 = max(float(w[3]) for w in line_words_sorted) + bbox = round_bbox((x0, y0, x1, y1), config.round_precision) + + text_line = TextLine( + index=global_line_index, + text=line_text, + bbox=bbox, + fonts=set(), + spans=[TextSpan(text=line_text, font=None, size=0.0)], + ) + blocks.append( + TextBlock( + index=block_index, + bbox=bbox, + lines=[text_line], + ) + ) + global_line_index += 1 + block_index += 1 + + return PageStructure( + page_number=page_number, + width=width, + height=height, + blocks=blocks, + ) diff --git a/DocTest/PdfTest.py b/DocTest/PdfTest.py index d617fd7..8f07624 100644 --- a/DocTest/PdfTest.py +++ b/DocTest/PdfTest.py @@ -15,7 +15,9 @@ StructureTolerance, compare_document_structures, compare_document_text_only, + compare_document_words, ) +from DocTest.HeaderFooterDetector import HeaderFooterConfig, filter_headers_footers from DocTest.PdfStructureModels import ( DocumentStructure, PageStructure, @@ -226,6 +228,10 @@ def compare_pdf_documents(self, reference_document, candidate_document, **kwargs mask_value = kwargs.pop('mask', None) text_mask_patterns_arg = kwargs.pop('text_mask_patterns', None) ignore_ligatures = _as_bool(kwargs.pop('ignore_ligatures', False)) + normalize_word_boundaries = _as_bool(kwargs.pop('normalize_word_boundaries', False), False) + compare_order = kwargs.pop('compare_order', 'ordered') + if compare_order not in ('ordered', 'unordered'): + 
compare_order = 'ordered' check_pdf_text = _as_bool(kwargs.pop('check_pdf_text', False)) # Parse character_replacements from kwargs or use instance default @@ -267,9 +273,19 @@ def compare_pdf_documents(self, reference_document, candidate_document, **kwargs # New parameters for controlling structure comparison behavior ignore_page_boundaries = _as_bool(kwargs.pop('ignore_page_boundaries', False), False) + compare_word_level = _as_bool(kwargs.pop('compare_word_level', True), True) check_geometry = _as_bool(kwargs.pop('check_geometry', True), True) check_block_count = _as_bool(kwargs.pop('check_block_count', True), True) + header_scan_height = _as_float(kwargs.pop('header_scan_height', 0), 0) + footer_scan_height = _as_float(kwargs.pop('footer_scan_height', 0), 0) + header_repeat_threshold = int(_as_float(kwargs.pop('header_repeat_threshold', 2), 2)) + header_footer_config = HeaderFooterConfig( + header_scan_height=header_scan_height, + footer_scan_height=footer_scan_height, + repeat_threshold=header_repeat_threshold, + ) + # When ignoring page boundaries, disable geometry and block count checks if ignore_page_boundaries: check_geometry = False @@ -456,29 +472,29 @@ def _record_diff(facet: str, description: str, diff_payload: Any): candidate_representation=candidate_repr, text_mask_patterns=compiled_text_patterns, ignore_page_boundaries=ignore_page_boundaries, + compare_word_level=compare_word_level, check_geometry=check_geometry, check_block_count=check_block_count, + header_footer_config=header_footer_config, + normalize_word_boundaries=normalize_word_boundaries, + compare_order=compare_order, ) if not structure_result.passed: differences_detected = True summary = getattr(structure_result, "summary", None) page_diffs = getattr(structure_result, "page_differences", None) doc_diffs = getattr(structure_result, "document_differences", None) - details_parts: List[str] = [] - if summary: - details_parts.extend(str(item) for item in summary) - if page_diffs: - for page, 
diffs in page_diffs.items(): - for diff in diffs: - details_parts.append(f"Page {page}: {diff.message}") - if doc_diffs: - for diff in doc_diffs: - details_parts.append(f"Document: {diff.message}") + try: + from DocTest.StructureReportBuilder import build_structure_report_plain_text + plain_report = build_structure_report_plain_text(structure_result) + detail_text = plain_report if plain_report else "Structure comparison differences detected." + except Exception: + detail_text = "Structure comparison differences detected." llm_differences.append( { "facet": "structure", "description": "PDF structural comparison failed.", - "details": "\n".join(details_parts) if details_parts else "Structure comparison differences detected.", + "details": detail_text, } ) @@ -557,8 +573,11 @@ def compare_pdf_structure(self, reference_document, candidate_document, **kwargs - ``text_mask_patterns``: regex or list of regex strings to skip lines during comparison. - ``ignore_ligatures`` (bool, default ``False``): normalise common ligatures (``ﬁ`` → ``fi``) prior to comparison. - ``ignore_page_boundaries`` (bool, default ``False``): ignore page breaks and compare text content in reading order across the entire document. When enabled, geometry and block structure are not checked. Useful when font/size changes cause text to reflow across pages. + - ``normalize_word_boundaries`` (bool, default ``False``): merge words split across line boundaries by connector characters (``/``, ``-``, ``\\``). Recommended when using ``ignore_page_boundaries``. + - ``compare_order`` (str, default ``"ordered"``): comparison strategy for word-level comparison. ``"ordered"`` uses sequence-sensitive matching; ``"unordered"`` uses bag-of-words frequency comparison that ignores word order, useful when text reflows across pages. - ``check_geometry`` (bool, default ``True``): when ``False``, skip line position/size comparison. Useful for comparing content when layout may differ. 
Automatically set to ``False`` when ``ignore_page_boundaries`` is ``True``. - ``check_block_count`` (bool, default ``True``): when ``False``, skip block count validation per page. Automatically set to ``False`` when ``ignore_page_boundaries`` is ``True``. + - ``spatial_word_sorting`` (bool, default ``False``): when ``True``, build page structure from individual word bounding boxes instead of text blocks. This bypasses block fragmentation differences caused by different PDF generators and produces consistent word ordering. Recommended when ``ignore_page_boundaries`` is ``True``. Examples: | `Compare Pdf Structure` reference.pdf candidate.pdf @@ -566,6 +585,7 @@ def compare_pdf_structure(self, reference_document, candidate_document, **kwargs | `Compare Pdf Structure` reference.pdf candidate.pdf mask=${CURDIR}${/}mask.json text_mask_patterns=\\d{4}-\\d{4} ignore_ligatures=${True} | `Compare Pdf Structure` reference.pdf candidate.pdf ignore_page_boundaries=${True} | `Compare Pdf Structure` reference.pdf candidate.pdf check_geometry=${False} check_block_count=${False} + | `Compare Pdf Structure` reference.pdf candidate.pdf ignore_page_boundaries=${True} spatial_word_sorting=${True} | `Run Keyword And Expect Error` The compared PDF structure is different. 
Compare Pdf Structure reference.pdf candidate_with_changed_text.pdf """ @@ -589,6 +609,10 @@ def compare_pdf_structure(self, reference_document, candidate_document, **kwargs mask_value = kwargs.get('mask') text_mask_patterns_arg = kwargs.get('text_mask_patterns') ignore_ligatures = _as_bool(kwargs.get('ignore_ligatures', False), False) + normalize_word_boundaries = _as_bool(kwargs.get('normalize_word_boundaries', False), False) + compare_order = kwargs.get('compare_order', 'ordered') + if compare_order not in ('ordered', 'unordered'): + compare_order = 'ordered' # Parse character_replacements from kwargs or use instance default char_replacements_arg = kwargs.get('character_replacements') @@ -598,8 +622,19 @@ def compare_pdf_structure(self, reference_document, candidate_document, **kwargs # New parameters for controlling comparison behavior ignore_page_boundaries = _as_bool(kwargs.get('ignore_page_boundaries', False), False) + compare_word_level = _as_bool(kwargs.get('compare_word_level', True), True) check_geometry = _as_bool(kwargs.get('check_geometry', True), True) check_block_count = _as_bool(kwargs.get('check_block_count', True), True) + spatial_word_sorting = _as_bool(kwargs.get('spatial_word_sorting', False), False) + + header_scan_height = _as_float(kwargs.get('header_scan_height', 0), 0) + footer_scan_height = _as_float(kwargs.get('footer_scan_height', 0), 0) + header_repeat_threshold = int(_as_float(kwargs.get('header_repeat_threshold', 2), 2)) + header_footer_config = HeaderFooterConfig( + header_scan_height=header_scan_height, + footer_scan_height=footer_scan_height, + repeat_threshold=header_repeat_threshold, + ) # When ignoring page boundaries, disable geometry and block count checks if ignore_page_boundaries: @@ -615,6 +650,7 @@ def compare_pdf_structure(self, reference_document, candidate_document, **kwargs round_precision=round_precision, normalize_ligatures=ignore_ligatures, character_replacements=char_replacements, + 
spatial_word_sorting=spatial_word_sorting, ) tolerance = StructureTolerance( position=position_tolerance, @@ -655,8 +691,12 @@ def compare_pdf_structure(self, reference_document, candidate_document, **kwargs candidate_representation=candidate_repr, text_mask_patterns=compiled_text_patterns, ignore_page_boundaries=ignore_page_boundaries, + compare_word_level=compare_word_level, check_geometry=check_geometry, check_block_count=check_block_count, + header_footer_config=header_footer_config, + normalize_word_boundaries=normalize_word_boundaries, + compare_order=compare_order, ) finally: reference_repr.close() @@ -969,8 +1009,12 @@ def _perform_structure_comparison( candidate_representation: Optional[DocumentRepresentation] = None, text_mask_patterns: Optional[List[Pattern[str]]] = None, ignore_page_boundaries: bool = False, + compare_word_level: bool = True, check_geometry: bool = True, check_block_count: bool = True, + header_footer_config: Optional["HeaderFooterConfig"] = None, + normalize_word_boundaries: bool = False, + compare_order: str = "ordered", ): release_reference = False release_candidate = False @@ -985,17 +1029,31 @@ def _perform_structure_comparison( reference_structure = reference_representation.get_pdf_structure(config=extraction_config) candidate_structure = candidate_representation.get_pdf_structure(config=extraction_config) + # Repetition-based header/footer detection + if header_footer_config and header_footer_config.enabled: + reference_structure = filter_headers_footers(reference_structure, header_footer_config) + candidate_structure = filter_headers_footers(candidate_structure, header_footer_config) + if text_mask_patterns: reference_structure = self._prune_structure_lines(reference_structure, text_mask_patterns) candidate_structure = self._prune_structure_lines(candidate_structure, text_mask_patterns) if ignore_page_boundaries: - # Use text-only comparison that ignores page boundaries - result = compare_document_text_only( - 
reference=reference_structure, - candidate=candidate_structure, - case_sensitive=case_sensitive, - ) + if compare_word_level: + result = compare_document_words( + reference=reference_structure, + candidate=candidate_structure, + case_sensitive=case_sensitive, + normalize_ligatures=extraction_config.normalize_ligatures, + normalize_word_boundaries=normalize_word_boundaries, + compare_order=compare_order, + ) + else: + result = compare_document_text_only( + reference=reference_structure, + candidate=candidate_structure, + case_sensitive=case_sensitive, + ) else: # Use standard page-by-page comparison result = compare_document_structures( @@ -1006,7 +1064,45 @@ def _perform_structure_comparison( check_geometry=check_geometry, check_block_count=check_block_count, ) - self._log_structure_result(result, ignore_page_boundaries=ignore_page_boundaries) + # Capture text lists for context display in the report + ref_texts = None + cand_texts = None + try: + from DocTest.PdfStructureModels import flatten_document_text + ref_texts = flatten_document_text(reference_structure) + cand_texts = flatten_document_text(candidate_structure) + except Exception: + pass + + exclusions = [] + if text_mask_patterns: + exclusions.extend(f"text_mask: {p.pattern}" for p in text_mask_patterns) + if header_footer_config and header_footer_config.enabled: + if header_footer_config.header_scan_height > 0: + exclusions.append(f"header_filter: {header_footer_config.header_scan_height}pt") + if header_footer_config.footer_scan_height > 0: + exclusions.append(f"footer_filter: {header_footer_config.footer_scan_height}pt") + # Only report disabled checks when explicitly set by the user, + # not when auto-disabled by ignore_page_boundaries + if not ignore_page_boundaries: + if not check_geometry: + exclusions.append("check_geometry: False") + if not check_block_count: + exclusions.append("check_block_count: False") + if normalize_word_boundaries: + exclusions.append("normalize_word_boundaries: True") + if 
compare_order == "unordered": + exclusions.append("compare_order: unordered") + + self._log_structure_result( + result, + ignore_page_boundaries=ignore_page_boundaries, + reference_name=Path(reference_document).name, + candidate_name=Path(candidate_document).name, + reference_texts=ref_texts, + candidate_texts=cand_texts, + exclusions_applied=exclusions, + ) return result finally: if release_reference: @@ -1060,12 +1156,23 @@ def _prune_structure_lines( ) return DocumentStructure(pages=filtered_pages, config=structure.config) - def _log_structure_result(self, result, ignore_page_boundaries: bool = False): - """Log comparison results with single summary WARN and detail INFO messages. + def _log_structure_result( + self, + result, + *, + ignore_page_boundaries: bool = False, + reference_name: str = "", + candidate_name: str = "", + reference_texts: Optional[List[str]] = None, + candidate_texts: Optional[List[str]] = None, + exclusions_applied: Optional[List[str]] = None, + ): + """Log comparison results with single summary WARN, HTML report INFO, and detail DEBUG. Robot Framework displays WARN messages at the top of log.html. To avoid - cluttering that section, we emit a single summary warning and log all - individual differences as INFO (visible only within keyword output). + cluttering that section, we emit a single summary warning. All differences + are rendered as a single consolidated HTML report at INFO level. Individual + per-difference output is preserved at DEBUG level for troubleshooting. 
""" if result.passed: logger.info("[PDF Structure] Documents match within configured tolerances.") @@ -1075,15 +1182,34 @@ def _log_structure_result(self, result, ignore_page_boundaries: bool = False): diff_count = result.difference_count() mode = "text-only (ignoring page boundaries)" if ignore_page_boundaries else "structure" - # Single summary warning (appears at top of log.html) + # Single summary warning (appears at top of log.html) -- UNCHANGED logger.warn(f"[PDF Structure] Comparison failed: {diff_count} difference(s) found in {mode} comparison.") - # Log summary entries as INFO + # --- Consolidated HTML report at INFO level --- + try: + from DocTest.StructureReportBuilder import ReportMetadata, build_structure_report + metadata = ReportMetadata( + reference_name=reference_name or "(unknown)", + candidate_name=candidate_name or "(unknown)", + comparison_mode=mode, + exclusions_applied=exclusions_applied or [], + ) + html_report = build_structure_report( + result, + metadata=metadata, + reference_texts=reference_texts, + candidate_texts=candidate_texts, + ) + if html_report: + logger.info(html_report, html=True) + except Exception: + pass # Degrade gracefully if report builder fails + + # --- Per-difference output at DEBUG level --- if result.summary: for entry in result.summary: - logger.info(f"[PDF Structure] {entry}") + logger.debug(f"[PDF Structure] {entry}") - # Log page differences as INFO if result.page_differences: for page in sorted(result.page_differences.keys()): for diff in result.page_differences[page]: @@ -1095,12 +1221,11 @@ def _log_structure_result(self, result, ignore_page_boundaries: bool = False): details.append(f"candidate line={diff.candidate_index}") if details: message = f"{message} ({', '.join(details)})" - logger.info(message) + logger.debug(message) if diff.deltas: pretty = ", ".join(f"{axis}={value:.3f}" for axis, value in diff.deltas.items()) logger.debug(f"[PDF Structure] Page {page} deltas: {pretty}") - # Log document-level 
differences as INFO (for text-only mode) if result.document_differences: for diff in result.document_differences: message = f"[PDF Text] {diff.message}" @@ -1111,7 +1236,20 @@ def _log_structure_result(self, result, ignore_page_boundaries: bool = False): details.append(f"candidate position={diff.cand_index}") if details: message = f"{message} ({', '.join(details)})" - logger.info(message) + logger.debug(message) + + # Log word-level differences at DEBUG + if hasattr(result, 'word_differences') and result.word_differences: + for diff in result.word_differences: + message = f"[PDF Words] {diff.message}" + details = [] + if diff.ref_start_index is not None: + details.append(f"ref positions {diff.ref_start_index}-{diff.ref_end_index}") + if diff.cand_start_index is not None: + details.append(f"cand positions {diff.cand_start_index}-{diff.cand_end_index}") + if details: + message = f"{message} ({', '.join(details)})" + logger.debug(message) def _ensure_local_document(self, document): return download_file_from_url(document) if is_url(document) else document diff --git a/DocTest/StructureReportBuilder.py b/DocTest/StructureReportBuilder.py new file mode 100644 index 0000000..6e892d2 --- /dev/null +++ b/DocTest/StructureReportBuilder.py @@ -0,0 +1,558 @@ +"""Consolidated HTML report builder for PDF structure comparison results. + +Transforms a StructureComparisonResult into a single HTML fragment suitable +for rendering inside Robot Framework's log.html via logger.info(msg, html=True). 
+""" + +from __future__ import annotations + +import html as html_module +from dataclasses import dataclass, field +from typing import Any, Dict, List, Optional, Sequence, Tuple, Union + +from DocTest.PdfStructureComparator import ( + DocumentTextDifference, + DocumentWordDifference, + LineDifference, + StructureComparisonResult, +) + +__all__ = [ + "build_structure_report", + "build_structure_report_plain_text", + "ReportMetadata", +] + +DEFAULT_CONTEXT_LINES = 3 +MAX_TEXT_DISPLAY_LENGTH = 120 +MAX_HUNKS_BEFORE_COLLAPSE = 20 + + +@dataclass +class ReportMetadata: + """Metadata displayed in the report header.""" + reference_name: str = "" + candidate_name: str = "" + comparison_mode: str = "" + page_count_ref: Optional[int] = None + page_count_cand: Optional[int] = None + exclusions_applied: List[str] = field(default_factory=list) + + +@dataclass +class ReportSummary: + """Aggregate statistics for the comparison.""" + total_differences: int = 0 + missing_count: int = 0 + extra_count: int = 0 + mismatch_count: int = 0 + geometry_count: int = 0 + other_count: int = 0 + hunk_count: int = 0 + + +def _escape(text: str) -> str: + return html_module.escape(str(text), quote=True) + + +def _truncate(text: str, max_length: int = MAX_TEXT_DISPLAY_LENGTH) -> str: + if len(text) <= max_length: + return text + return text[: max_length - 3] + "..." 
+ + +def _classify_diff_type(diff_type: str) -> str: + """Map diff_type string to category.""" + if diff_type in ("missing_line", "missing_text", "missing_page", "missing_words"): + return "missing" + elif diff_type in ("extra_line", "extra_text", "extra_page", "extra_words"): + return "extra" + elif diff_type in ("text_mismatch", "word_mismatch"): + return "mismatch" + elif diff_type == "geometry_mismatch": + return "geometry" + else: + return "other" + + +def _get_diff_display(diff: Any) -> Tuple[str, str, Optional[str], Optional[str]]: + """Extract category, message, ref_text, cand_text from any diff type.""" + category = _classify_diff_type(diff.diff_type) + message = diff.message + + ref_text = None + cand_text = None + + if isinstance(diff, LineDifference): + ref_text = diff.ref_text + cand_text = diff.cand_text + elif isinstance(diff, DocumentTextDifference): + ref_text = diff.ref_text + cand_text = diff.cand_text + elif isinstance(diff, DocumentWordDifference): + ref_text = " ".join(diff.ref_words) if diff.ref_words else None + cand_text = " ".join(diff.cand_words) if diff.cand_words else None + + return category, message, ref_text, cand_text + + +_CATEGORY_STYLES = { + # (background, text_color, symbol) — chosen for WCAG AA contrast + "missing": ("#f8d7da", "#721c24", "-"), + "extra": ("#d4edda", "#155724", "+"), + "mismatch": ("#fff3cd", "#856404", "~"), + "geometry": ("#e2e3e5", "#383d41", "\u0394"), # delta symbol + "other": ("#e2e3e5", "#383d41", "!"), +} + + +def _compute_summary(result: StructureComparisonResult) -> ReportSummary: + """Compute aggregate statistics from a comparison result.""" + summary = ReportSummary() + + for diffs in result.page_differences.values(): + for d in diffs: + cat = _classify_diff_type(d.diff_type) + if cat == "missing": summary.missing_count += 1 + elif cat == "extra": summary.extra_count += 1 + elif cat == "mismatch": summary.mismatch_count += 1 + elif cat == "geometry": summary.geometry_count += 1 + else: 
summary.other_count += 1 + + for d in result.document_differences: + cat = _classify_diff_type(d.diff_type) + if cat == "missing": summary.missing_count += 1 + elif cat == "extra": summary.extra_count += 1 + elif cat == "mismatch": summary.mismatch_count += 1 + else: summary.other_count += 1 + + if hasattr(result, 'word_differences'): + for d in result.word_differences: + cat = _classify_diff_type(d.diff_type) + if cat == "missing": summary.missing_count += 1 + elif cat == "extra": summary.extra_count += 1 + elif cat == "mismatch": summary.mismatch_count += 1 + else: summary.other_count += 1 + + summary.total_differences = ( + summary.missing_count + summary.extra_count + + summary.mismatch_count + summary.geometry_count + summary.other_count + ) + return summary + + +def _render_diff_html(diff: Any) -> str: + """Render a single difference as an HTML div with color coding.""" + category, message, ref_text, cand_text = _get_diff_display(diff) + bg, fg, symbol = _CATEGORY_STYLES.get(category, ("#e2e3e5", "#383d41", "?")) + + parts = [] + parts.append(f'
') + + if category == "mismatch" and ref_text and cand_text: + parts.append(f'{_escape(symbol)} ref: "{_escape(_truncate(ref_text))}"') + parts.append(f'
  cand: "{_escape(_truncate(cand_text))}"') + elif category == "missing" and ref_text: + parts.append(f'{_escape(symbol)} "{_escape(_truncate(ref_text))}"') + elif category == "extra" and cand_text: + parts.append(f'{_escape(symbol)} "{_escape(_truncate(cand_text))}"') + elif category == "geometry": + deltas_str = "" + if hasattr(diff, 'deltas') and diff.deltas: + deltas_str = " (" + ", ".join(f"{k}={v:.3f}" for k, v in diff.deltas.items()) + ")" + text_display = ref_text or cand_text or "" + parts.append(f'{_escape(symbol)} "{_escape(_truncate(text_display))}"{_escape(deltas_str)}') + else: + parts.append(f'{_escape(symbol)} {_escape(_truncate(message))}') + + parts.append('
') + return "".join(parts) + + +def _render_diff_plain(diff: Any) -> str: + """Render a single difference as plain text.""" + category, message, ref_text, cand_text = _get_diff_display(diff) + _, _, symbol = _CATEGORY_STYLES.get(category, ("", "", "?")) + + if category == "mismatch" and ref_text and cand_text: + return f' {symbol} ref: "{_truncate(ref_text)}"\n cand: "{_truncate(cand_text)}"' + elif category == "missing" and ref_text: + return f' {symbol} "{_truncate(ref_text)}"' + elif category == "extra" and cand_text: + return f' {symbol} "{_truncate(cand_text)}"' + else: + return f' {symbol} {_truncate(message)}' + + +def _collect_all_diffs(result: StructureComparisonResult) -> List[Tuple[Any, str]]: + """Collect all differences with location labels for the overview table.""" + items: List[Tuple[Any, str]] = [] + for page_num in sorted(result.page_differences.keys()): + for d in result.page_differences[page_num]: + loc = f"Page {page_num}" + if isinstance(d, LineDifference): + idx = d.reference_index if d.reference_index is not None else d.candidate_index + if idx is not None: + loc += f", line {idx}" + items.append((d, loc)) + for d in result.document_differences: + idx = d.ref_index if d.ref_index is not None else d.cand_index + loc = f"line {idx}" if idx is not None else "document" + items.append((d, loc)) + if hasattr(result, 'word_differences'): + for d in result.word_differences: + idx = d.ref_start_index if d.ref_start_index is not None else d.cand_start_index + loc = f"word {idx}" if idx is not None else "document" + items.append((d, loc)) + return items + + +def _get_diff_index(diff: Any) -> int: + """Extract the primary positional index from a difference object.""" + if isinstance(diff, LineDifference): + idx = diff.reference_index if diff.reference_index is not None else diff.candidate_index + return idx if idx is not None else 999999 + elif isinstance(diff, DocumentTextDifference): + idx = diff.ref_index if diff.ref_index is not None else 
diff.cand_index + return idx if idx is not None else 999999 + elif isinstance(diff, DocumentWordDifference): + idx = diff.ref_start_index if diff.ref_start_index is not None else diff.cand_start_index + return idx if idx is not None else 999999 + return 999999 + + +def _group_into_hunks( + differences: Sequence[Any], + context_lines: int, + source_texts: Optional[List[str]] = None, +) -> List[dict]: + """Group contiguous differences into hunks with context. + + Returns list of dicts: {start_index, end_index, differences, context_before, context_after} + """ + if not differences: + return [] + + sorted_diffs = sorted(differences, key=_get_diff_index) + merge_threshold = 2 * context_lines + 1 + + hunks = [] + current_diffs = [sorted_diffs[0]] + current_start = _get_diff_index(sorted_diffs[0]) + current_end = current_start + + for diff in sorted_diffs[1:]: + idx = _get_diff_index(diff) + if idx - current_end <= merge_threshold: + current_diffs.append(diff) + current_end = max(current_end, idx) + else: + # Finalize current hunk + ctx_before = [] + ctx_after = [] + if source_texts: + start = max(0, current_start - context_lines) + ctx_before = source_texts[start:current_start] + end_pos = min(len(source_texts), current_end + context_lines + 1) + ctx_after = source_texts[current_end + 1:end_pos] + hunks.append({ + "start_index": current_start, + "end_index": current_end, + "differences": current_diffs, + "context_before": ctx_before, + "context_after": ctx_after, + }) + current_diffs = [diff] + current_start = idx + current_end = idx + + # Finalize last hunk + ctx_before = [] + ctx_after = [] + if source_texts: + start = max(0, current_start - context_lines) + ctx_before = source_texts[start:current_start] + end_pos = min(len(source_texts), current_end + context_lines + 1) + ctx_after = source_texts[current_end + 1:end_pos] + hunks.append({ + "start_index": current_start, + "end_index": current_end, + "differences": current_diffs, + "context_before": ctx_before, + 
"context_after": ctx_after, + }) + + return hunks + + +def build_structure_report( + result: StructureComparisonResult, + *, + metadata: Optional[ReportMetadata] = None, + context_lines: int = DEFAULT_CONTEXT_LINES, + reference_texts: Optional[List[str]] = None, + candidate_texts: Optional[List[str]] = None, +) -> str: + """Build a consolidated HTML report from a structure comparison result. + + Returns an HTML string suitable for logger.info(msg, html=True). + Returns empty string if result.passed is True. + """ + if result.passed: + return "" + + summary = _compute_summary(result) + parts = [] + + # Outer container — explicit bg+color so report is self-contained in both light/dark mode + parts.append('
') + + # Title + parts.append('
PDF Structure Comparison Report
') + + # Metadata + if metadata: + parts.append('
') + parts.append(f'
Reference: {_escape(metadata.reference_name)}
') + parts.append(f'
Candidate: {_escape(metadata.candidate_name)}
') + mode_str = _escape(metadata.comparison_mode) + page_str = "" + if metadata.page_count_ref is not None or metadata.page_count_cand is not None: + page_str = f' | Pages: {metadata.page_count_ref or "?"} ref / {metadata.page_count_cand or "?"} cand' + parts.append(f'
Mode: {mode_str}{page_str}
') + if metadata.exclusions_applied: + exc_str = ", ".join(_escape(e) for e in metadata.exclusions_applied) + parts.append(f'
Exclusions: {exc_str}
') + parts.append('
') + + # Summary + parts.append('
') + parts.append(f'
{summary.total_differences} difference(s)
') + parts.append('
') + if summary.missing_count: + parts.append(f'{summary.missing_count} missing') + if summary.extra_count: + parts.append(f'{summary.extra_count} extra') + if summary.mismatch_count: + parts.append(f'{summary.mismatch_count} mismatch') + if summary.geometry_count: + parts.append(f'{summary.geometry_count} geometry') + if summary.other_count: + parts.append(f'{summary.other_count} other') + parts.append('
') + + # Differences overview table + all_diffs_for_table = _collect_all_diffs(result) + if all_diffs_for_table: + parts.append('
') + parts.append('') + parts.append('' + '' + '' + '' + '' + '') + for row_idx, (diff, location) in enumerate(all_diffs_for_table, 1): + category, _, ref_text, cand_text = _get_diff_display(diff) + bg, fg, symbol = _CATEGORY_STYLES.get(category, ("#e2e3e5", "#383d41", "?")) + ref_cell = _escape(_truncate(ref_text, 60)) if ref_text else "—" + cand_cell = _escape(_truncate(cand_text, 60)) if cand_text else "—" + parts.append( + f'' + f'' + f'' + f'' + f'' + f'') + parts.append('
#TypeReferenceCandidateLocation
{row_idx}{_escape(symbol)} {_escape(category)}{ref_cell}{cand_cell}{_escape(location)}
') + + # Content sections (hunk detail) + parts.append('
') + + total_hunks = 0 + + # Page-level differences + if result.page_differences: + for page_num in sorted(result.page_differences.keys()): + diffs = result.page_differences[page_num] + hunks = _group_into_hunks(diffs, context_lines, reference_texts) + total_hunks += len(hunks) + parts.append(f'
' + f'Page {page_num} — {len(hunks)} hunk(s), {len(diffs)} difference(s)
') + for i, hunk in enumerate(hunks): + if total_hunks > MAX_HUNKS_BEFORE_COLLAPSE and i > 0: + parts.append(f'
... and more hunks (showing first {MAX_HUNKS_BEFORE_COLLAPSE})
') + break + _render_hunk_to_parts(parts, hunk, i + 1, index_label="line") + + # Document-level differences + if result.document_differences: + hunks = _group_into_hunks(result.document_differences, context_lines, reference_texts) + total_hunks += len(hunks) + parts.append(f'
' + f'Document (text-only) — {len(hunks)} hunk(s), {len(result.document_differences)} difference(s)
') + for i, hunk in enumerate(hunks): + if total_hunks > MAX_HUNKS_BEFORE_COLLAPSE: + remaining = len(hunks) - i + parts.append(f'
... {remaining} more hunk(s) not shown
') + break + _render_hunk_to_parts(parts, hunk, i + 1, index_label="line") + + # Word-level differences + if hasattr(result, 'word_differences') and result.word_differences: + hunks = _group_into_hunks(result.word_differences, context_lines, reference_texts) + total_hunks += len(hunks) + parts.append(f'
' + f'Document (word-level) — {len(hunks)} hunk(s), {len(result.word_differences)} difference(s)
') + rendered = 0 + for i, hunk in enumerate(hunks): + if rendered >= MAX_HUNKS_BEFORE_COLLAPSE: + remaining = len(hunks) - rendered + parts.append(f'
... {remaining} more hunk(s) not shown
') + break + _render_hunk_to_parts(parts, hunk, i + 1, index_label="word") + rendered += 1 + + # Summary line + if result.summary: + parts.append('
') + for entry in result.summary: + parts.append(f'
{_escape(str(entry))}
') + parts.append('
') + + parts.append('
') # close content + parts.append('
') # close outer container + + summary.hunk_count = total_hunks + return "\n".join(parts) + + +def _render_hunk_to_parts(parts: List[str], hunk: dict, hunk_number: int, index_label: str = "line") -> None: + """Render a hunk into the HTML parts list.""" + start = hunk["start_index"] + end = hunk["end_index"] + if start == end: + label = f"{index_label} {start}" + else: + label = f"{index_label}s {start}–{end}" + + parts.append(f'
') + parts.append(f'
Hunk {hunk_number} ({label})
') + + # Context before + if hunk["context_before"]: + ctx = " | ".join(_truncate(t, 40) for t in hunk["context_before"]) + parts.append(f'
... {_escape(ctx)} ...
') + + # Differences + for diff in hunk["differences"]: + parts.append(_render_diff_html(diff)) + + # Context after + if hunk["context_after"]: + ctx = " | ".join(_truncate(t, 40) for t in hunk["context_after"]) + parts.append(f'
... {_escape(ctx)} ...
') + + parts.append('
') + + +def build_structure_report_plain_text( + result: StructureComparisonResult, + *, + metadata: Optional[ReportMetadata] = None, + context_lines: int = DEFAULT_CONTEXT_LINES, + reference_texts: Optional[List[str]] = None, + candidate_texts: Optional[List[str]] = None, +) -> str: + """Build a plain-text version of the consolidated report. + + Returns empty string if result.passed is True. + """ + if result.passed: + return "" + + summary = _compute_summary(result) + lines = [] + + lines.append("=" * 60) + lines.append("PDF Structure Comparison Report") + lines.append("=" * 60) + + if metadata: + lines.append(f"Reference: {metadata.reference_name}") + lines.append(f"Candidate: {metadata.candidate_name}") + lines.append(f"Mode: {metadata.comparison_mode}") + if metadata.exclusions_applied: + lines.append(f"Exclusions: {', '.join(metadata.exclusions_applied)}") + + lines.append("-" * 60) + lines.append(f"{summary.total_differences} difference(s): " + f"{summary.missing_count} missing, {summary.extra_count} extra, " + f"{summary.mismatch_count} mismatch, {summary.geometry_count} geometry, " + f"{summary.other_count} other") + lines.append("-" * 60) + + # Page-level + if result.page_differences: + for page_num in sorted(result.page_differences.keys()): + diffs = result.page_differences[page_num] + hunks = _group_into_hunks(diffs, context_lines, reference_texts) + lines.append(f"\nPage {page_num} -- {len(hunks)} hunk(s), {len(diffs)} difference(s)") + for i, hunk in enumerate(hunks): + if i >= MAX_HUNKS_BEFORE_COLLAPSE: + lines.append(f" ... 
{len(hunks) - i} more hunk(s) not shown") + break + _render_hunk_plain(lines, hunk, i + 1, index_label="line") + + # Document-level + if result.document_differences: + hunks = _group_into_hunks(result.document_differences, context_lines, reference_texts) + lines.append(f"\nDocument (text-only) -- {len(hunks)} hunk(s), {len(result.document_differences)} difference(s)") + for i, hunk in enumerate(hunks): + if i >= MAX_HUNKS_BEFORE_COLLAPSE: + lines.append(f" ... {len(hunks) - i} more hunk(s) not shown") + break + _render_hunk_plain(lines, hunk, i + 1, index_label="line") + + # Word-level + if hasattr(result, 'word_differences') and result.word_differences: + hunks = _group_into_hunks(result.word_differences, context_lines, reference_texts) + lines.append(f"\nDocument (word-level) -- {len(hunks)} hunk(s), {len(result.word_differences)} difference(s)") + for i, hunk in enumerate(hunks): + if i >= MAX_HUNKS_BEFORE_COLLAPSE: + lines.append(f" ... {len(hunks) - i} more hunk(s) not shown") + break + _render_hunk_plain(lines, hunk, i + 1, index_label="word") + + if result.summary: + lines.append("") + for entry in result.summary: + lines.append(f"Note: {entry}") + + lines.append("=" * 60) + return "\n".join(lines) + + +def _render_hunk_plain(lines: List[str], hunk: dict, hunk_number: int, index_label: str = "line") -> None: + """Render a hunk into the plain text lines list.""" + start = hunk["start_index"] + end = hunk["end_index"] + if start == end: + label = f"{index_label} {start}" + else: + label = f"{index_label}s {start}-{end}" + lines.append(f" Hunk {hunk_number} ({label})") + + if hunk["context_before"]: + ctx = " | ".join(_truncate(t, 40) for t in hunk["context_before"]) + lines.append(f" ... {ctx} ...") + + for diff in hunk["differences"]: + lines.append(_render_diff_plain(diff)) + + if hunk["context_after"]: + ctx = " | ".join(_truncate(t, 40) for t in hunk["context_after"]) + lines.append(f" ... 
{ctx} ...") diff --git a/DocTest/TextNormalization.py b/DocTest/TextNormalization.py index 0153841..f8c9edf 100644 --- a/DocTest/TextNormalization.py +++ b/DocTest/TextNormalization.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import Dict, Optional +from typing import Dict, List, Optional, Set, Tuple _LIGATURE_MAP: Dict[str, str] = { @@ -22,6 +22,71 @@ def normalize_ligatures(text: str) -> str: return "".join(_LIGATURE_MAP.get(char, char) for char in text) +_WORD_BOUNDARY_CONNECTORS: Set[str] = frozenset("/\\-") + + +def merge_split_words( + words: List[str], + tokens: "List[WordToken]", + connectors: Set[str] | None = None, +) -> "Tuple[List[str], List[WordToken]]": + """Merge word tokens that were split across PDF line boundaries. + + When text reflows across lines in a PDF, words containing connector + characters (like ``/``, ``-``, ``\\``) can be split into separate tokens. + For example, ``JS2_D48/F16/H8`` may become ``["JS2_D48/F16/", "H8"]`` + when the line break falls after the ``/``. + + This function detects such splits by looking for tokens from consecutive + lines where the preceding token ends with a connector character, and + merges them back into a single token. + + Args: + words: Flat list of word strings. + tokens: Corresponding WordToken provenance objects. + connectors: Set of characters that indicate a word was split. + Defaults to ``_WORD_BOUNDARY_CONNECTORS`` (``/``, ``\\``, ``-``). + + Returns: + Tuple of (merged_words, merged_tokens) with reduced length. + """ + if not words or len(words) <= 1: + return list(words), list(tokens) + + if connectors is None: + connectors = _WORD_BOUNDARY_CONNECTORS + + merged_words: List[str] = [words[0]] + merged_tokens: List[tokens[0].__class__] = [tokens[0]] + + for i in range(1, len(words)): + prev_token = merged_tokens[-1] + curr_token = tokens[i] + prev_word = merged_words[-1] + + # Only merge if tokens are from different lines AND previous word ends with connector. 
+ # Skip standalone connectors (e.g. a bare "-" used as punctuation, not a split word). + if (prev_token.source_line_index != curr_token.source_line_index + and prev_word + and prev_word[-1] in connectors + and len(prev_word) > 1): + # Merge: concatenate words, keep first token's provenance + merged_words[-1] = prev_word + words[i] + # Update token with merged text + from DocTest.PdfStructureModels import WordToken + merged_tokens[-1] = WordToken( + text=merged_words[-1], + source_page=prev_token.source_page, + source_line_index=prev_token.source_line_index, + word_index=prev_token.word_index, + ) + else: + merged_words.append(words[i]) + merged_tokens.append(tokens[i]) + + return merged_words, merged_tokens + + def apply_character_replacements( text: str, replacements: Optional[Dict[str, str]] = None, diff --git a/utest/test_compare_document_words.py b/utest/test_compare_document_words.py new file mode 100644 index 0000000..f579e1a --- /dev/null +++ b/utest/test_compare_document_words.py @@ -0,0 +1,268 @@ +"""Unit tests for compare_document_words() -- ADR-001 Word-Level Token Comparison.""" + +import pytest + +from DocTest.PdfStructureComparator import ( + DocumentWordDifference, + StructureComparisonResult, + compare_document_words, +) +from DocTest.PdfStructureModels import ( + DocumentStructure, + PageStructure, + StructureExtractionConfig, + TextBlock, + TextLine, +) + + +# --------------------------------------------------------------------------- +# Helper +# --------------------------------------------------------------------------- + + +def _make_doc(*page_texts): + """Create a DocumentStructure from lists of line texts per page. + + Usage: _make_doc(["line1", "line2"], ["line3"]) creates 2 pages. + Each positional argument is a list of line-text strings for one page. + All lines are placed in a single block per page. 
+ """ + config = StructureExtractionConfig() + pages = [] + for page_num, lines in enumerate(page_texts): + text_lines = [] + for i, text in enumerate(lines): + text_lines.append( + TextLine(index=i, text=text, bbox=(0.0, 0.0, 100.0, 10.0)) + ) + block = TextBlock(index=0, bbox=(0.0, 0.0, 100.0, 100.0), lines=text_lines) + page = PageStructure( + page_number=page_num, width=612.0, height=792.0, blocks=[block] + ) + pages.append(page) + return DocumentStructure(pages=pages, config=config) + + +# --------------------------------------------------------------------------- +# Tests +# --------------------------------------------------------------------------- + + +def test_identical_content_same_lines(): + """Same text, same lines -> passed=True, no word_differences.""" + ref = _make_doc(["the quick brown fox"]) + cand = _make_doc(["the quick brown fox"]) + result = compare_document_words(ref, cand) + assert result.passed + assert result.word_differences == [] + + +def test_identical_content_different_lines(): + """Same words split across different lines -> passed=True. + + This is the KEY test for reflow tolerance: the words are identical, + only the line breaks differ. 
+ """ + ref = _make_doc(["the quick brown", "fox jumps"]) + cand = _make_doc(["the quick", "brown fox jumps"]) + result = compare_document_words(ref, cand) + assert result.passed + assert result.word_differences == [] + + +def test_identical_content_different_pages(): + """Same words on different pages -> passed=True.""" + ref = _make_doc(["hello world"], ["foo bar"]) + cand = _make_doc(["hello world foo bar"]) + result = compare_document_words(ref, cand) + assert result.passed + assert result.word_differences == [] + + +def test_single_word_replacement(): + """'fox' vs 'cat' -> one word_mismatch difference.""" + ref = _make_doc(["the quick fox"]) + cand = _make_doc(["the quick cat"]) + result = compare_document_words(ref, cand) + assert not result.passed + assert len(result.word_differences) >= 1 + mismatch_diffs = [ + d for d in result.word_differences if d.diff_type == "word_mismatch" + ] + assert len(mismatch_diffs) >= 1 + diff = mismatch_diffs[0] + assert "fox" in diff.ref_words + assert "cat" in diff.cand_words + + +def test_single_word_insertion(): + """Candidate has extra word -> one extra_words difference.""" + ref = _make_doc(["the fox"]) + cand = _make_doc(["the quick fox"]) + result = compare_document_words(ref, cand) + assert not result.passed + extra_diffs = [ + d for d in result.word_differences if d.diff_type == "extra_words" + ] + assert len(extra_diffs) >= 1 + diff = extra_diffs[0] + assert "quick" in diff.cand_words + + +def test_single_word_deletion(): + """Candidate missing a word -> one missing_words difference.""" + ref = _make_doc(["the quick fox"]) + cand = _make_doc(["the fox"]) + result = compare_document_words(ref, cand) + assert not result.passed + missing_diffs = [ + d for d in result.word_differences if d.diff_type == "missing_words" + ] + assert len(missing_diffs) >= 1 + diff = missing_diffs[0] + assert "quick" in diff.ref_words + + +def test_multi_word_replacement(): + """Contiguous block of different words -> one grouped 
mismatch.""" + ref = _make_doc(["the quick brown fox"]) + cand = _make_doc(["the slow red fox"]) + result = compare_document_words(ref, cand) + assert not result.passed + mismatch_diffs = [ + d for d in result.word_differences if d.diff_type == "word_mismatch" + ] + assert len(mismatch_diffs) >= 1 + # The replaced block should be grouped into a single diff + diff = mismatch_diffs[0] + assert diff.ref_words is not None + assert diff.cand_words is not None + assert "quick" in diff.ref_words + assert "brown" in diff.ref_words + assert "slow" in diff.cand_words + assert "red" in diff.cand_words + + +def test_case_sensitive_default(): + """'Hello' vs 'hello' -> mismatch when case_sensitive=True (default).""" + ref = _make_doc(["Hello World"]) + cand = _make_doc(["hello World"]) + result = compare_document_words(ref, cand) + assert not result.passed + assert len(result.word_differences) >= 1 + + +def test_case_insensitive(): + """'Hello' vs 'hello' -> passed=True when case_sensitive=False.""" + ref = _make_doc(["Hello WORLD"]) + cand = _make_doc(["hello world"]) + result = compare_document_words(ref, cand, case_sensitive=False) + assert result.passed + assert result.word_differences == [] + + +def test_both_empty_documents(): + """Both empty -> passed=True.""" + ref = _make_doc() + cand = _make_doc() + result = compare_document_words(ref, cand) + assert result.passed + assert result.word_differences == [] + + +def test_one_empty_one_not(): + """One empty, one with text -> differences reported.""" + ref = _make_doc(["hello world"]) + cand = _make_doc() + result = compare_document_words(ref, cand) + assert not result.passed + assert len(result.word_differences) >= 1 + + +def test_difference_count_includes_word_diffs(): + """result.difference_count() counts word_differences.""" + ref = _make_doc(["the quick fox"]) + cand = _make_doc(["the slow fox"]) + result = compare_document_words(ref, cand) + assert result.difference_count() >= 1 + assert result.difference_count() >= 
len(result.word_differences) + + +def test_word_differences_have_correct_indices(): + """Verify ref_start_index/ref_end_index/cand_start_index/cand_end_index.""" + ref = _make_doc(["a b c d e"]) + cand = _make_doc(["a b x d e"]) # 'c' replaced by 'x' + result = compare_document_words(ref, cand) + assert not result.passed + assert len(result.word_differences) >= 1 + + diff = result.word_differences[0] + # The replaced word 'c' is at index 2 in the reference + assert diff.ref_start_index is not None + assert diff.ref_end_index is not None + assert diff.cand_start_index is not None + assert diff.cand_end_index is not None + # 'c' is the 3rd word (index 2), so ref range should be [2, 3) + assert diff.ref_start_index == 2 + assert diff.ref_end_index == 3 + # 'x' is the 3rd word (index 2), so cand range should be [2, 3) + assert diff.cand_start_index == 2 + assert diff.cand_end_index == 3 + + +def test_reflow_across_lines_and_pages(): + """Complex reflow scenario: identical words, different line/page breaks. + + Reference: + page 0: ["The quick brown fox", "jumps over the"] + page 1: ["lazy dog"] + + Candidate: + page 0: ["The quick", "brown fox jumps"] + page 1: ["over the lazy dog"] + + Should pass because the word sequence is identical. 
+ """ + ref = _make_doc( + ["The quick brown fox", "jumps over the"], + ["lazy dog"], + ) + cand = _make_doc( + ["The quick", "brown fox jumps"], + ["over the lazy dog"], + ) + result = compare_document_words(ref, cand) + assert result.passed + assert result.word_differences == [] + + +def test_result_is_structure_comparison_result(): + """compare_document_words returns a StructureComparisonResult.""" + ref = _make_doc(["hello"]) + cand = _make_doc(["hello"]) + result = compare_document_words(ref, cand) + assert isinstance(result, StructureComparisonResult) + + +def test_word_difference_has_message(): + """Each DocumentWordDifference has a non-empty message.""" + ref = _make_doc(["hello world"]) + cand = _make_doc(["hello earth"]) + result = compare_document_words(ref, cand) + assert not result.passed + for diff in result.word_differences: + assert isinstance(diff.message, str) + assert len(diff.message) > 0 + + +def test_empty_ref_nonempty_cand(): + """Empty reference, non-empty candidate -> extra words reported.""" + ref = _make_doc() + cand = _make_doc(["hello world"]) + result = compare_document_words(ref, cand) + assert not result.passed + extra_diffs = [ + d for d in result.word_differences if d.diff_type == "extra_words" + ] + assert len(extra_diffs) >= 1 diff --git a/utest/test_flatten_document_words.py b/utest/test_flatten_document_words.py new file mode 100644 index 0000000..f977d84 --- /dev/null +++ b/utest/test_flatten_document_words.py @@ -0,0 +1,164 @@ +"""Unit tests for flatten_document_words() -- ADR-001 Word-Level Token Comparison.""" + +import pytest + +from DocTest.PdfStructureModels import ( + DocumentStructure, + PageStructure, + StructureExtractionConfig, + TextBlock, + TextLine, + WordToken, + flatten_document_words, +) + + +# --------------------------------------------------------------------------- +# Helper +# --------------------------------------------------------------------------- + + +def _make_doc(*page_texts): + """Create a 
DocumentStructure from lists of line texts per page. + + Usage: _make_doc(["line1", "line2"], ["line3"]) creates 2 pages. + Each positional argument is a list of line-text strings for one page. + All lines are placed in a single block per page. + """ + config = StructureExtractionConfig() + pages = [] + for page_num, lines in enumerate(page_texts): + text_lines = [] + for i, text in enumerate(lines): + text_lines.append( + TextLine(index=i, text=text, bbox=(0.0, 0.0, 100.0, 10.0)) + ) + block = TextBlock(index=0, bbox=(0.0, 0.0, 100.0, 100.0), lines=text_lines) + page = PageStructure( + page_number=page_num, width=612.0, height=792.0, blocks=[block] + ) + pages.append(page) + return DocumentStructure(pages=pages, config=config) + + +# --------------------------------------------------------------------------- +# Tests +# --------------------------------------------------------------------------- + + +def test_empty_document(): + """Empty DocumentStructure returns ([], []).""" + doc = _make_doc() + words, tokens = flatten_document_words(doc) + assert words == [] + assert tokens == [] + + +def test_single_line_single_word(): + """One line 'hello' produces ['hello'] and one WordToken.""" + doc = _make_doc(["hello"]) + words, tokens = flatten_document_words(doc) + assert words == ["hello"] + assert len(tokens) == 1 + assert tokens[0].text == "hello" + + +def test_single_line_multiple_words(): + """'hello world' produces ['hello', 'world'] and two WordTokens.""" + doc = _make_doc(["hello world"]) + words, tokens = flatten_document_words(doc) + assert words == ["hello", "world"] + assert len(tokens) == 2 + assert tokens[0].text == "hello" + assert tokens[1].text == "world" + + +def test_multiple_lines(): + """Two lines 'foo bar' and 'baz' produce ['foo', 'bar', 'baz'] in order.""" + doc = _make_doc(["foo bar", "baz"]) + words, tokens = flatten_document_words(doc) + assert words == ["foo", "bar", "baz"] + assert len(tokens) == 3 + + +def test_multiple_pages(): + """Words 
from page 0 and page 1 are concatenated in order.""" + doc = _make_doc(["alpha beta"], ["gamma"]) + words, tokens = flatten_document_words(doc) + assert words == ["alpha", "beta", "gamma"] + assert len(tokens) == 3 + + +def test_empty_lines_skipped(): + """Lines with empty text produce no tokens.""" + doc = _make_doc(["hello", "", "world"]) + words, tokens = flatten_document_words(doc) + assert words == ["hello", "world"] + assert len(tokens) == 2 + + +def test_whitespace_only_lines_skipped(): + """Lines with only whitespace produce no tokens (split yields []).""" + doc = _make_doc(["hello", " ", "world"]) + words, tokens = flatten_document_words(doc) + assert words == ["hello", "world"] + assert len(tokens) == 2 + + +def test_provenance_metadata_correct(): + """source_page, source_line_index, and word_index are correct across pages.""" + doc = _make_doc(["a b"], ["c"]) + words, tokens = flatten_document_words(doc) + + # First page, first line, word 0 + assert tokens[0].text == "a" + assert tokens[0].source_page == 0 + assert tokens[0].source_line_index == 0 + assert tokens[0].word_index == 0 + + # First page, first line, word 1 + assert tokens[1].text == "b" + assert tokens[1].source_page == 0 + assert tokens[1].source_line_index == 0 + assert tokens[1].word_index == 1 + + # Second page, first line, word 2 + assert tokens[2].text == "c" + assert tokens[2].source_page == 1 + assert tokens[2].source_line_index == 1 + assert tokens[2].word_index == 2 + + +def test_multiple_spaces_normalized(): + """'hello world' is split to ['hello', 'world'] (str.split normalizes).""" + doc = _make_doc(["hello world"]) + words, tokens = flatten_document_words(doc) + assert words == ["hello", "world"] + assert len(tokens) == 2 + + +def test_word_index_is_global(): + """word_index is sequential across all pages, blocks, and lines.""" + doc = _make_doc(["a b", "c"], ["d e f"]) + words, tokens = flatten_document_words(doc) + assert words == ["a", "b", "c", "d", "e", "f"] + + 
expected_indices = list(range(6)) + actual_indices = [t.word_index for t in tokens] + assert actual_indices == expected_indices + + +def test_word_token_is_frozen(): + """WordToken instances are immutable (frozen dataclass).""" + token = WordToken(text="hello", source_page=0, source_line_index=0, word_index=0) + with pytest.raises(AttributeError): + token.text = "changed" + + +def test_words_and_tokens_have_same_length(): + """The word strings list and tokens list always have the same length.""" + doc = _make_doc(["the quick brown fox", "jumps over"], ["the lazy dog"]) + words, tokens = flatten_document_words(doc) + assert len(words) == len(tokens) + for word, token in zip(words, tokens): + assert word == token.text diff --git a/utest/test_header_footer_detection.py b/utest/test_header_footer_detection.py new file mode 100644 index 0000000..ec210b3 --- /dev/null +++ b/utest/test_header_footer_detection.py @@ -0,0 +1,539 @@ +"""Unit tests for HeaderFooterDetector module (ADR-002). + +Tests cover repetition-based detection of headers/footers, stripping of +detected lines, digit normalization for page numbers, and the convenience +filter_headers_footers function. +""" + +import pytest + +from DocTest.HeaderFooterDetector import ( + DetectionResult, + HeaderFooterConfig, + _normalize_for_grouping, + detect_repeating_headers_footers, + filter_headers_footers, + strip_detected_headers_footers, +) +from DocTest.PdfStructureModels import ( + DocumentStructure, + PageStructure, + StructureExtractionConfig, + TextBlock, + TextLine, + flatten_document_text, +) + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _make_page(page_number, lines_data, width=612, height=792): + """Create a PageStructure from line data. + + Args: + page_number: The 1-based page number. + lines_data: list of (text, y_top, y_bottom) tuples. 
+ Each line gets bbox = (0, y_top, width, y_bottom). + width: Page width in PDF points. + height: Page height in PDF points. + + Returns: + A PageStructure suitable for testing. + """ + text_lines = [] + for i, (text, y_top, y_bottom) in enumerate(lines_data): + text_lines.append( + TextLine( + index=i, + text=text, + bbox=(0.0, float(y_top), float(width), float(y_bottom)), + ) + ) + block = TextBlock(index=0, bbox=(0, 0, width, height), lines=text_lines) + return PageStructure( + page_number=page_number, width=width, height=height, blocks=[block] + ) + + +def _make_doc(*pages): + """Create a DocumentStructure from PageStructure objects.""" + config = StructureExtractionConfig() + return DocumentStructure(pages=list(pages), config=config) + + +# --------------------------------------------------------------------------- +# Normalization helper tests +# --------------------------------------------------------------------------- + + +class TestNormalizeForGrouping: + """Tests for the _normalize_for_grouping helper.""" + + def test_replaces_single_digit(self): + assert _normalize_for_grouping("Page 1") == "Page #" + + def test_replaces_multiple_digit_runs(self): + assert _normalize_for_grouping("Page 1 of 5") == "Page # of #" + + def test_no_digits_unchanged(self): + assert _normalize_for_grouping("ACME Corp") == "ACME Corp" + + def test_multi_digit_run(self): + assert _normalize_for_grouping("2024-01-15") == "#-#-#" + + def test_empty_string(self): + assert _normalize_for_grouping("") == "" + + def test_standalone_page_number(self): + assert _normalize_for_grouping("42") == "#" + + +# --------------------------------------------------------------------------- +# Config tests +# --------------------------------------------------------------------------- + + +class TestHeaderFooterConfig: + """Tests for HeaderFooterConfig properties.""" + + def test_detection_disabled_when_scan_height_zero(self): + """Both scan heights 0 means detection is disabled.""" + config = 
HeaderFooterConfig(header_scan_height=0, footer_scan_height=0) + assert config.enabled is False + + def test_config_enabled_with_header_only(self): + """Detection is enabled when only header_scan_height > 0.""" + config = HeaderFooterConfig(header_scan_height=50, footer_scan_height=0) + assert config.enabled is True + + def test_config_enabled_with_footer_only(self): + """Detection is enabled when only footer_scan_height > 0.""" + config = HeaderFooterConfig(header_scan_height=0, footer_scan_height=50) + assert config.enabled is True + + def test_config_enabled_with_both(self): + """Detection is enabled when both scan heights > 0.""" + config = HeaderFooterConfig(header_scan_height=50, footer_scan_height=50) + assert config.enabled is True + + +# --------------------------------------------------------------------------- +# Detection tests +# --------------------------------------------------------------------------- + + +class TestDetectRepeatingHeadersFooters: + """Tests for detect_repeating_headers_footers.""" + + def test_disabled_config_returns_empty_result(self): + """When config.enabled is False, detection returns empty result.""" + doc = _make_doc( + _make_page(1, [("ACME Corp", 10, 25), ("Body text", 100, 115)]), + _make_page(2, [("ACME Corp", 10, 25), ("More text", 100, 115)]), + ) + config = HeaderFooterConfig(header_scan_height=0, footer_scan_height=0) + result = detect_repeating_headers_footers(doc, config) + assert result.header_keys == frozenset() + assert result.footer_keys == frozenset() + assert result.has_detections is False + + def test_detects_identical_header_on_all_pages(self): + """Identical text in header region on all pages is detected.""" + doc = _make_doc( + _make_page(1, [("ACME Corp", 10, 25), ("Body page 1", 100, 115)]), + _make_page(2, [("ACME Corp", 10, 25), ("Body page 2", 100, 115)]), + _make_page(3, [("ACME Corp", 10, 25), ("Body page 3", 100, 115)]), + ) + config = HeaderFooterConfig(header_scan_height=50, repeat_threshold=2) + 
result = detect_repeating_headers_footers(doc, config) + assert "ACME Corp" in result.header_keys + assert result.has_detections is True + + def test_does_not_detect_non_repeating_text(self): + """Unique text in header region across pages is not detected.""" + doc = _make_doc( + _make_page(1, [("Title A", 10, 25), ("Body 1", 100, 115)]), + _make_page(2, [("Title B", 10, 25), ("Body 2", 100, 115)]), + _make_page(3, [("Title C", 10, 25), ("Body 3", 100, 115)]), + ) + config = HeaderFooterConfig(header_scan_height=50, repeat_threshold=2) + result = detect_repeating_headers_footers(doc, config) + assert result.header_keys == frozenset() + assert result.has_detections is False + + def test_detects_header_with_page_numbers(self): + """Page-number variants normalize to the same key and are detected.""" + doc = _make_doc( + _make_page(1, [("Page 1 of 5", 10, 25), ("Body 1", 100, 115)]), + _make_page(2, [("Page 2 of 5", 10, 25), ("Body 2", 100, 115)]), + _make_page(3, [("Page 3 of 5", 10, 25), ("Body 3", 100, 115)]), + ) + config = HeaderFooterConfig(header_scan_height=50, repeat_threshold=2) + result = detect_repeating_headers_footers(doc, config) + assert "Page # of #" in result.header_keys + + def test_respects_repeat_threshold_below(self): + """Text repeating on fewer pages than threshold is not detected.""" + doc = _make_doc( + _make_page(1, [("Header", 10, 25), ("Body 1", 100, 115)]), + _make_page(2, [("Header", 10, 25), ("Body 2", 100, 115)]), + _make_page(3, [("Header", 10, 25), ("Body 3", 100, 115)]), + _make_page(4, [("Unique", 10, 25), ("Body 4", 100, 115)]), + _make_page(5, [("Unique2", 10, 25), ("Body 5", 100, 115)]), + ) + config = HeaderFooterConfig(header_scan_height=50, repeat_threshold=4) + result = detect_repeating_headers_footers(doc, config) + # "Header" only on 3 pages, threshold is 4 + assert "Header" not in result.header_keys + + def test_respects_repeat_threshold_at_boundary(self): + """Text repeating on exactly threshold pages is detected.""" + doc 
= _make_doc( + _make_page(1, [("Header", 10, 25), ("Body 1", 100, 115)]), + _make_page(2, [("Header", 10, 25), ("Body 2", 100, 115)]), + _make_page(3, [("Header", 10, 25), ("Body 3", 100, 115)]), + _make_page(4, [("Unique", 10, 25), ("Body 4", 100, 115)]), + _make_page(5, [("Unique2", 10, 25), ("Body 5", 100, 115)]), + ) + config = HeaderFooterConfig(header_scan_height=50, repeat_threshold=3) + result = detect_repeating_headers_footers(doc, config) + assert "Header" in result.header_keys + + def test_single_page_no_detection(self): + """Single page document never reaches threshold=2.""" + doc = _make_doc( + _make_page(1, [("Header", 10, 25), ("Body", 100, 115)]), + ) + config = HeaderFooterConfig(header_scan_height=50, repeat_threshold=2) + result = detect_repeating_headers_footers(doc, config) + assert result.header_keys == frozenset() + assert result.has_detections is False + + def test_footer_detection(self): + """Text in footer region repeating across pages is detected.""" + # Page height = 792, footer_scan_height = 50 -> boundary at 742 + # Lines at y_bottom=770 are past 742 -> in footer region + doc = _make_doc( + _make_page(1, [("Body 1", 100, 115), ("Copyright 2024", 755, 770)]), + _make_page(2, [("Body 2", 100, 115), ("Copyright 2024", 755, 770)]), + _make_page(3, [("Body 3", 100, 115), ("Copyright 2024", 755, 770)]), + ) + config = HeaderFooterConfig(footer_scan_height=50, repeat_threshold=2) + result = detect_repeating_headers_footers(doc, config) + assert "Copyright #" in result.footer_keys + assert result.has_detections is True + + def test_header_and_footer_simultaneously(self): + """Both header and footer can be detected independently.""" + doc = _make_doc( + _make_page(1, [("ACME Corp", 10, 25), ("Body 1", 400, 415), ("Page 1", 760, 775)]), + _make_page(2, [("ACME Corp", 10, 25), ("Body 2", 400, 415), ("Page 2", 760, 775)]), + _make_page(3, [("ACME Corp", 10, 25), ("Body 3", 400, 415), ("Page 3", 760, 775)]), + ) + config = HeaderFooterConfig( + 
header_scan_height=50, footer_scan_height=50, repeat_threshold=2 + ) + result = detect_repeating_headers_footers(doc, config) + assert "ACME Corp" in result.header_keys + assert "Page #" in result.footer_keys + + def test_standalone_page_number_detection(self): + """Standalone page numbers like '1', '2', '3' normalize to '#'.""" + doc = _make_doc( + _make_page(1, [("1", 10, 25), ("Body 1", 100, 115)]), + _make_page(2, [("2", 10, 25), ("Body 2", 100, 115)]), + _make_page(3, [("3", 10, 25), ("Body 3", 100, 115)]), + ) + config = HeaderFooterConfig(header_scan_height=50, repeat_threshold=2) + result = detect_repeating_headers_footers(doc, config) + assert "#" in result.header_keys + + def test_threshold_greater_than_page_count(self): + """When threshold exceeds page count, nothing can be detected.""" + doc = _make_doc( + _make_page(1, [("Header", 10, 25), ("Body 1", 100, 115)]), + _make_page(2, [("Header", 10, 25), ("Body 2", 100, 115)]), + _make_page(3, [("Header", 10, 25), ("Body 3", 100, 115)]), + ) + config = HeaderFooterConfig(header_scan_height=50, repeat_threshold=5) + result = detect_repeating_headers_footers(doc, config) + assert result.header_keys == frozenset() + assert result.has_detections is False + + def test_line_outside_scan_region_not_counted(self): + """Text at y > header_scan_height is not counted as a header candidate.""" + doc = _make_doc( + _make_page(1, [("ACME Corp", 60, 75), ("Body 1", 100, 115)]), + _make_page(2, [("ACME Corp", 60, 75), ("Body 2", 100, 115)]), + _make_page(3, [("ACME Corp", 60, 75), ("Body 3", 100, 115)]), + ) + config = HeaderFooterConfig(header_scan_height=50, repeat_threshold=2) + result = detect_repeating_headers_footers(doc, config) + # y_top=60 >= header_scan_height=50, so not in header region + assert "ACME Corp" not in result.header_keys + + +# --------------------------------------------------------------------------- +# Stripping tests +# --------------------------------------------------------------------------- + 
+ +class TestStripDetectedHeadersFooters: + """Tests for strip_detected_headers_footers.""" + + def test_strips_detected_headers_preserves_body(self): + """Detected header lines are removed; body lines remain.""" + doc = _make_doc( + _make_page(1, [("ACME Corp", 10, 25), ("Body line 1", 100, 115), ("Body line 2", 200, 215)]), + _make_page(2, [("ACME Corp", 10, 25), ("Body line 3", 100, 115)]), + ) + config = HeaderFooterConfig(header_scan_height=50, repeat_threshold=2) + detection = detect_repeating_headers_footers(doc, config) + result = strip_detected_headers_footers(doc, detection, config) + + # All body lines preserved + all_texts = flatten_document_text(result) + assert "Body line 1" in all_texts + assert "Body line 2" in all_texts + assert "Body line 3" in all_texts + # Header removed + assert "ACME Corp" not in all_texts + + def test_body_text_matching_header_not_stripped(self): + """Same text in body region is preserved even if it matches a header key.""" + doc = _make_doc( + _make_page(1, [("ACME Corp", 10, 25), ("ACME Corp", 400, 415)]), + _make_page(2, [("ACME Corp", 10, 25), ("Other body", 400, 415)]), + _make_page(3, [("ACME Corp", 10, 25), ("More body", 400, 415)]), + ) + config = HeaderFooterConfig(header_scan_height=50, repeat_threshold=2) + detection = detect_repeating_headers_footers(doc, config) + result = strip_detected_headers_footers(doc, detection, config) + + # Page 1: header "ACME Corp" at y=10 removed, body "ACME Corp" at y=400 preserved + page1_texts = [] + for block in result.pages[0].blocks: + for line in block.lines: + page1_texts.append(line.text) + assert "ACME Corp" in page1_texts # The body-region instance survives + + def test_strips_page_number_variants(self): + """Different page-number variants sharing the same key are all stripped.""" + doc = _make_doc( + _make_page(1, [("Page 1 of 5", 10, 25), ("Body A", 100, 115)]), + _make_page(2, [("Page 2 of 5", 10, 25), ("Body B", 100, 115)]), + ) + config = 
HeaderFooterConfig(header_scan_height=50, repeat_threshold=2) + detection = detect_repeating_headers_footers(doc, config) + result = strip_detected_headers_footers(doc, detection, config) + + all_texts = flatten_document_text(result) + assert "Page 1 of 5" not in all_texts + assert "Page 2 of 5" not in all_texts + assert "Body A" in all_texts + assert "Body B" in all_texts + + def test_re_indexing_after_strip(self): + """After stripping, remaining lines have contiguous indices starting at 0.""" + doc = _make_doc( + _make_page(1, [ + ("Header", 10, 25), + ("Line A", 100, 115), + ("Line B", 200, 215), + ("Line C", 300, 315), + ]), + _make_page(2, [ + ("Header", 10, 25), + ("Line D", 100, 115), + ]), + ) + config = HeaderFooterConfig(header_scan_height=50, repeat_threshold=2) + detection = detect_repeating_headers_footers(doc, config) + result = strip_detected_headers_footers(doc, detection, config) + + # Page 1 should have lines indexed 0, 1, 2 + page1_indices = [ + line.index for block in result.pages[0].blocks for line in block.lines + ] + assert page1_indices == [0, 1, 2] + + # Page 2 should have line indexed 0 + page2_indices = [ + line.index for block in result.pages[1].blocks for line in block.lines + ] + assert page2_indices == [0] + + def test_empty_blocks_removed_after_strip(self): + """A block whose only line is a header gets removed entirely.""" + # Create a page with two blocks: one with only a header, one with body + header_line = TextLine(index=0, text="Header", bbox=(0.0, 10.0, 612.0, 25.0)) + body_line = TextLine(index=1, text="Body text", bbox=(0.0, 100.0, 612.0, 115.0)) + header_block = TextBlock(index=0, bbox=(0, 0, 612, 30), lines=[header_line]) + body_block = TextBlock(index=1, bbox=(0, 90, 612, 120), lines=[body_line]) + page1 = PageStructure(page_number=1, width=612, height=792, blocks=[header_block, body_block]) + + header_line2 = TextLine(index=0, text="Header", bbox=(0.0, 10.0, 612.0, 25.0)) + body_line2 = TextLine(index=1, text="More text", 
bbox=(0.0, 100.0, 612.0, 115.0)) + header_block2 = TextBlock(index=0, bbox=(0, 0, 612, 30), lines=[header_line2]) + body_block2 = TextBlock(index=1, bbox=(0, 90, 612, 120), lines=[body_line2]) + page2 = PageStructure(page_number=2, width=612, height=792, blocks=[header_block2, body_block2]) + + doc = _make_doc(page1, page2) + config = HeaderFooterConfig(header_scan_height=50, repeat_threshold=2) + detection = detect_repeating_headers_footers(doc, config) + result = strip_detected_headers_footers(doc, detection, config) + + # Each page should have only 1 block (body_block), header_block removed + for page in result.pages: + assert len(page.blocks) == 1 + assert page.blocks[0].lines[0].text != "Header" + + def test_strips_footer_preserves_header_region(self): + """Footer stripping does not affect header-region text.""" + doc = _make_doc( + _make_page(1, [("Title", 10, 25), ("Body", 400, 415), ("Footer", 760, 775)]), + _make_page(2, [("Title", 10, 25), ("Body 2", 400, 415), ("Footer", 760, 775)]), + _make_page(3, [("Title", 10, 25), ("Body 3", 400, 415), ("Footer", 760, 775)]), + ) + # Only detect footer, not header + config = HeaderFooterConfig(header_scan_height=0, footer_scan_height=50, repeat_threshold=2) + detection = detect_repeating_headers_footers(doc, config) + result = strip_detected_headers_footers(doc, detection, config) + + all_texts = flatten_document_text(result) + # Footer should be removed + assert "Footer" not in all_texts + # Header-region text preserved (not scanned as header since header_scan_height=0) + assert all_texts.count("Title") == 3 + + def test_no_detections_returns_original_structure(self): + """When detection has no results, strip returns the original structure.""" + doc = _make_doc( + _make_page(1, [("Unique A", 10, 25), ("Body", 100, 115)]), + _make_page(2, [("Unique B", 10, 25), ("Body 2", 100, 115)]), + ) + detection = DetectionResult(header_keys=frozenset(), footer_keys=frozenset()) + config = 
HeaderFooterConfig(header_scan_height=50, repeat_threshold=2) + result = strip_detected_headers_footers(doc, detection, config) + assert result is doc # Identity check: same object returned + + +# --------------------------------------------------------------------------- +# Convenience function tests +# --------------------------------------------------------------------------- + + +class TestFilterHeadersFooters: + """Tests for the filter_headers_footers convenience function.""" + + def test_filter_headers_footers_end_to_end(self): + """filter_headers_footers produces same result as detect + strip.""" + doc = _make_doc( + _make_page(1, [("ACME Corp", 10, 25), ("Body 1", 100, 115), ("Page 1", 760, 775)]), + _make_page(2, [("ACME Corp", 10, 25), ("Body 2", 100, 115), ("Page 2", 760, 775)]), + _make_page(3, [("ACME Corp", 10, 25), ("Body 3", 100, 115), ("Page 3", 760, 775)]), + ) + config = HeaderFooterConfig( + header_scan_height=50, footer_scan_height=50, repeat_threshold=2 + ) + + # Manual two-step + detection = detect_repeating_headers_footers(doc, config) + expected = strip_detected_headers_footers(doc, detection, config) + + # Convenience one-step + actual = filter_headers_footers(doc, config) + + # Compare text content + assert flatten_document_text(actual) == flatten_document_text(expected) + + def test_filter_disabled_returns_same_object(self): + """When config.enabled is False, the exact same object is returned.""" + doc = _make_doc( + _make_page(1, [("Header", 10, 25), ("Body", 100, 115)]), + _make_page(2, [("Header", 10, 25), ("Body 2", 100, 115)]), + ) + config = HeaderFooterConfig(header_scan_height=0, footer_scan_height=0) + result = filter_headers_footers(doc, config) + assert result is doc + + +# --------------------------------------------------------------------------- +# Key scenario: page without header content preserved +# --------------------------------------------------------------------------- + + +class 
TestPageWithoutHeaderContentPreserved: + """The key scenario: a page that lacks the repeating header must not + have its body text incorrectly removed.""" + + def test_page_without_header_content_preserved(self): + """Page 2 has 'HEADER' but page 3 starts with different body text at + the same y-position. That body text must NOT be removed.""" + doc = _make_doc( + _make_page(1, [ + ("HEADER", 10, 25), + ("Body page 1", 100, 115), + ]), + _make_page(2, [ + ("HEADER", 10, 25), + ("Body page 2", 100, 115), + ]), + _make_page(3, [ + # No header line -- body text starts at y=10, same as header + ("Important content", 10, 25), + ("Body page 3", 100, 115), + ]), + ) + config = HeaderFooterConfig(header_scan_height=50, repeat_threshold=2) + detection = detect_repeating_headers_footers(doc, config) + + # "HEADER" detected as header key + assert "HEADER" in detection.header_keys + + result = strip_detected_headers_footers(doc, detection, config) + all_texts = flatten_document_text(result) + + # "HEADER" removed from pages 1 and 2 + assert "HEADER" not in all_texts + # "Important content" on page 3 preserved (different key) + assert "Important content" in all_texts + # All body text preserved + assert "Body page 1" in all_texts + assert "Body page 2" in all_texts + assert "Body page 3" in all_texts + + def test_page_without_footer_content_preserved(self): + """Symmetric case: a page missing the footer has its body text at + the bottom preserved.""" + doc = _make_doc( + _make_page(1, [ + ("Body 1", 100, 115), + ("FOOTER", 760, 775), + ]), + _make_page(2, [ + ("Body 2", 100, 115), + ("FOOTER", 760, 775), + ]), + _make_page(3, [ + ("Body 3", 100, 115), + # Different text in footer region + ("Final remarks", 760, 775), + ]), + ) + config = HeaderFooterConfig(footer_scan_height=50, repeat_threshold=2) + detection = detect_repeating_headers_footers(doc, config) + + assert "FOOTER" in detection.footer_keys + + result = strip_detected_headers_footers(doc, detection, config) + 
all_texts = flatten_document_text(result) + + assert "FOOTER" not in all_texts + assert "Final remarks" in all_texts + assert "Body 1" in all_texts + assert "Body 2" in all_texts + assert "Body 3" in all_texts diff --git a/utest/test_spatial_word_sorting.py b/utest/test_spatial_word_sorting.py new file mode 100644 index 0000000..f716a23 --- /dev/null +++ b/utest/test_spatial_word_sorting.py @@ -0,0 +1,611 @@ +"""Unit tests for build_page_structure_from_words() and the spatial_word_sorting config flag.""" + +import pytest + +from DocTest.PdfStructureModels import ( + DocumentStructure, + PageStructure, + StructureExtractionConfig, + TextBlock, + TextLine, + build_page_structure, + build_page_structure_from_words, + flatten_document_words, +) + + +# --------------------------------------------------------------------------- +# Helper +# --------------------------------------------------------------------------- + + +def _make_word_tuple(text, x0, y0, x1, y1, block_no=0, line_no=0, word_no=0): + """Return a tuple in PyMuPDF ``get_text('words')`` format. + + Format: (x0, y0, x1, y1, "word", block_no, line_no, word_no) + """ + return (x0, y0, x1, y1, text, block_no, line_no, word_no) + + +# --------------------------------------------------------------------------- +# 1. 
# Empty / None inputs
# ---------------------------------------------------------------------------


def test_empty_words_list():
    """An empty word list yields a block-free page that keeps the given size."""
    result = build_page_structure_from_words(
        0,
        [],
        page_width=612.0,
        page_height=792.0,
    )

    assert isinstance(result, PageStructure)
    assert result.page_number == 0
    assert result.blocks == []
    assert result.width == 612.0
    assert result.height == 792.0


def test_none_words_list():
    """Passing ``None`` in place of the word list also yields a block-free page."""
    result = build_page_structure_from_words(
        0,
        None,
        page_width=612.0,
        page_height=792.0,
    )

    assert isinstance(result, PageStructure)
    assert result.blocks == []


# ---------------------------------------------------------------------------
# 2. Single word
# ---------------------------------------------------------------------------


def test_single_word():
    """A lone word ends up as one block with one line carrying one span."""
    lone_word = _make_word_tuple("hello", 10.0, 100.0, 50.0, 112.0)
    result = build_page_structure_from_words(
        0,
        [lone_word],
        page_width=612.0,
        page_height=792.0,
    )

    assert len(result.blocks) == 1
    only_block = result.blocks[0]
    assert only_block.line_count == 1
    assert only_block.lines[0].text == "hello"
    assert len(only_block.lines[0].spans) == 1
    assert only_block.lines[0].spans[0].text == "hello"


# ---------------------------------------------------------------------------
# 3.
# Single line, multiple words
# ---------------------------------------------------------------------------


def test_single_line_multiple_words():
    """Shuffled words on one baseline come back as a single x0-ordered line."""
    # (text, x0, x1, word_no); every word spans y=100..112.
    layout = [
        ("world", 60.0, 110.0, 1),
        ("hello", 10.0, 50.0, 0),
        ("!", 120.0, 130.0, 2),
    ]
    words = [
        _make_word_tuple(text, left, 100.0, right, 112.0, word_no=number)
        for text, left, right, number in layout
    ]
    result = build_page_structure_from_words(
        0, words, page_width=612.0, page_height=792.0
    )

    assert len(result.blocks) == 1
    assert result.blocks[0].lines[0].text == "hello world !"


# ---------------------------------------------------------------------------
# 4. Multiple lines
# ---------------------------------------------------------------------------


def test_multiple_lines():
    """Words on distinct baselines become separate blocks, ordered top to bottom."""
    # Fed in the order second (y~200), first (y~100), third (y~300) on purpose.
    specs = [
        ("second", 10.0, 200.0, 80.0, 212.0),
        ("first", 10.0, 100.0, 60.0, 112.0),
        ("third", 10.0, 300.0, 70.0, 312.0),
    ]
    words = [_make_word_tuple(*spec) for spec in specs]
    result = build_page_structure_from_words(
        0, words, page_width=612.0, page_height=792.0
    )

    assert len(result.blocks) == 3
    observed = [blk.lines[0].text for blk in result.blocks]
    assert observed == ["first", "second", "third"]


# ---------------------------------------------------------------------------
# 5. Multi-column layout
# ---------------------------------------------------------------------------


def test_multi_column_layout():
    """Cells of a 3-column, 2-row grid are read row by row.

    Words from different columns that share a vertical position are expected
    to land in the same line, ordered left to right -- the core scenario for
    spatial sorting.
    """
    # (column id, x0, x1) and (row id, y0, y1); rows are emitted outermost so
    # the input order is C1R1, C2R1, C3R1, C1R2, C2R2, C3R2.
    columns = [(1, 10.0, 60.0), (2, 210.0, 260.0), (3, 410.0, 460.0)]
    rows = [(1, 100.0, 112.0), (2, 130.0, 142.0)]
    words = [
        _make_word_tuple(f"C{col}R{row}", left, top, right, bottom)
        for row, top, bottom in rows
        for col, left, right in columns
    ]
    result = build_page_structure_from_words(
        0, words, page_width=612.0, page_height=792.0
    )

    assert len(result.blocks) == 2
    assert result.blocks[0].lines[0].text == "C1R1 C2R1 C3R1"
    assert result.blocks[1].lines[0].text == "C1R2 C2R2 C3R2"


# ---------------------------------------------------------------------------
# 6. Mixed font sizes (adaptive tolerance)
# ---------------------------------------------------------------------------


def test_mixed_font_sizes():
    """Two words of different height but nearby midpoints share one line.

    The expectation follows the adaptive tolerance this suite assumes for the
    function under test: ``min(min_height, word_height) * 0.5``.
    """
    # small: height 12, midpoint 106; big: height 20, midpoint 110.
    # Tolerance = 12 * 0.5 = 6.0 and |106 - 110| = 4.0 < 6.0, so one line.
    small_word = _make_word_tuple("small", 10.0, 100.0, 60.0, 112.0)
    big_word = _make_word_tuple("big", 70.0, 100.0, 140.0, 120.0)
    result = build_page_structure_from_words(
        0, [small_word, big_word], page_width=612.0, page_height=792.0
    )

    assert len(result.blocks) == 1
    assert result.blocks[0].lines[0].text == "small big"


def test_mixed_font_sizes_separate_lines():
    """Midpoints further apart than the adaptive tolerance split into two lines."""
    # Both words are 10 high (tolerance 5.0); midpoints 105 and 120 differ by
    # 15.0 > 5.0, so they must not be merged.
    upper = _make_word_tuple("line1", 10.0, 100.0, 60.0, 110.0)
    lower = _make_word_tuple("line2", 10.0, 115.0, 60.0, 125.0)
    result = build_page_structure_from_words(
        0, [upper, lower], page_width=612.0, page_height=792.0
    )

    assert len(result.blocks) == 2


# ---------------------------------------------------------------------------
# 7. Text normalization
# ---------------------------------------------------------------------------


def test_text_normalization_applied():
    """Edge stripping, whitespace collapsing and ligature folding all take effect."""
    normalizing = StructureExtractionConfig(
        normalize_ligatures=True,
        strip_line_edges=True,
        collapse_whitespace=True,
    )
    # NOTE(review): the padding inside " hello " is reproduced as it appears in
    # the whitespace-collapsed diff -- confirm the width against the real file.
    padded = _make_word_tuple(" hello ", 10.0, 100.0, 60.0, 112.0)
    # "\ufb01" is the single-codepoint "fi" ligature.
    ligature = _make_word_tuple("\ufb01nd", 70.0, 100.0, 120.0, 112.0)
    result = build_page_structure_from_words(
        0,
        [padded, ligature],
        config=normalizing,
        page_width=612.0,
        page_height=792.0,
    )

    assert len(result.blocks) == 1
    assert result.blocks[0].lines[0].text == "hello find"


# ---------------------------------------------------------------------------
# 8.
# Config hash includes spatial_word_sorting
# ---------------------------------------------------------------------------


def test_config_hash_includes_spatial():
    """Flipping only ``spatial_word_sorting`` must change the config's hash."""
    disabled, enabled = (
        StructureExtractionConfig(spatial_word_sorting=flag)
        for flag in (False, True)
    )
    assert hash(disabled) != hash(enabled)


def test_config_hash_same_when_equal():
    """Two configs built with identical settings hash to the same value."""
    first, second = (
        StructureExtractionConfig(spatial_word_sorting=True) for _ in range(2)
    )
    assert hash(first) == hash(second)


# ---------------------------------------------------------------------------
# 9. Page dimensions from explicit args
# ---------------------------------------------------------------------------


def test_page_dimensions_from_args():
    """Explicit ``page_width`` / ``page_height`` are taken over unchanged."""
    explicit_size = {"page_width": 500.0, "page_height": 700.0}
    result = build_page_structure_from_words(0, [], **explicit_size)

    assert result.width == 500.0
    assert result.height == 700.0


# ---------------------------------------------------------------------------
# 10.
# Page dimensions from image_shape + dpi
# ---------------------------------------------------------------------------


def test_page_dimensions_from_image_shape():
    """Zero page dimensions fall back to a size derived from image_shape and dpi."""
    # image_shape is (height_px, width_px, channels). At 72 DPI one pixel is
    # one point: width = 720 * 72 / 72, height = 1080 * 72 / 72.
    fallback_inputs = {
        "page_width": 0.0,
        "page_height": 0.0,
        "dpi": 72,
        "image_shape": (1080, 720, 3),
    }
    result = build_page_structure_from_words(0, [], **fallback_inputs)

    assert result.width == 720.0
    assert result.height == 1080.0


def test_page_dimensions_from_image_shape_with_higher_dpi():
    """The fallback scales pixels to points as ``px * 72 / dpi``."""
    # 1440 px wide at 144 DPI -> 1440 * 72 / 144 = 720 points (height likewise).
    fallback_inputs = {
        "page_width": 0.0,
        "page_height": 0.0,
        "dpi": 144,
        "image_shape": (2160, 1440, 3),
    }
    result = build_page_structure_from_words(0, [], **fallback_inputs)

    assert result.width == 720.0
    assert result.height == 1080.0


# ---------------------------------------------------------------------------
# 11.
# Drop empty lines
# ---------------------------------------------------------------------------


def test_drop_empty_lines():
    """With ``drop_empty_lines=True`` a word that normalizes to nothing vanishes."""
    dropping = StructureExtractionConfig(strip_line_edges=True, drop_empty_lines=True)
    # NOTE(review): the blank literal is reproduced as it appears in the
    # whitespace-collapsed diff -- confirm its width against the real file.
    blank = _make_word_tuple(" ", 10.0, 100.0, 60.0, 112.0)  # empty after strip
    kept = _make_word_tuple("real", 10.0, 200.0, 60.0, 212.0)
    result = build_page_structure_from_words(
        0,
        [blank, kept],
        config=dropping,
        page_width=612.0,
        page_height=792.0,
    )

    assert len(result.blocks) == 1
    assert result.blocks[0].lines[0].text == "real"


def test_keep_empty_lines_when_disabled():
    """With dropping, collapsing and stripping all off, a blank word keeps its line."""
    keeping = StructureExtractionConfig(
        strip_line_edges=False,
        collapse_whitespace=False,
        drop_empty_lines=False,
    )
    blank = _make_word_tuple(" ", 10.0, 100.0, 60.0, 112.0)
    kept = _make_word_tuple("real", 10.0, 200.0, 60.0, 212.0)
    result = build_page_structure_from_words(
        0,
        [blank, kept],
        config=keeping,
        page_width=612.0,
        page_height=792.0,
    )

    assert len(result.blocks) == 2


# ---------------------------------------------------------------------------
# 12.
# Bbox is union of word bboxes
# ---------------------------------------------------------------------------


def test_bbox_is_union_of_word_bboxes():
    """A line's bbox spans the extremes of every word placed on that line."""
    left_word = _make_word_tuple("left", 10.0, 100.0, 50.0, 112.0)
    right_word = _make_word_tuple("right", 200.0, 98.0, 260.0, 115.0)
    unrounded = StructureExtractionConfig(round_precision=None)
    result = build_page_structure_from_words(
        0,
        [left_word, right_word],
        config=unrounded,
        page_width=612.0,
        page_height=792.0,
    )

    assert len(result.blocks) == 1
    line_bbox = result.blocks[0].lines[0].bbox
    # Expected union, coordinate by coordinate:
    #   x0 = min(10.0, 200.0), y0 = min(100.0, 98.0),
    #   x1 = max(50.0, 260.0), y1 = max(112.0, 115.0)
    for position, expected in enumerate((10.0, 98.0, 260.0, 115.0)):
        assert line_bbox[position] == expected


# ---------------------------------------------------------------------------
# 13. Round precision applied
# ---------------------------------------------------------------------------


def test_round_precision_applied():
    """``round_precision=2`` rounds every bbox coordinate to two decimals."""
    raw_word = _make_word_tuple("word", 10.12345, 100.6789, 50.99999, 112.11111)
    result = build_page_structure_from_words(
        0,
        [raw_word],
        config=StructureExtractionConfig(round_precision=2),
        page_width=612.0,
        page_height=792.0,
    )

    line_bbox = result.blocks[0].lines[0].bbox
    assert line_bbox == (10.12, 100.68, 51.0, 112.11)


def test_round_precision_none_no_rounding():
    """``round_precision=None`` leaves bbox coordinates exactly as supplied."""
    raw_word = _make_word_tuple("word", 10.12345, 100.6789, 50.99999, 112.11111)
    result = build_page_structure_from_words(
        0,
        [raw_word],
        config=StructureExtractionConfig(round_precision=None),
        page_width=612.0,
        page_height=792.0,
    )

    line_bbox = result.blocks[0].lines[0].bbox
    assert line_bbox == (10.12345, 100.6789, 50.99999, 112.11111)


# ---------------------------------------------------------------------------
# 14.
# Words sorted left to right within a line
# ---------------------------------------------------------------------------


def test_words_sorted_left_to_right_within_line():
    """Input order is irrelevant: words on a line are emitted by ascending x0."""
    # (text, x0, x1) supplied as C, A, B; all on the y=100..112 baseline.
    shuffled = [("C", 200.0, 220.0), ("A", 10.0, 30.0), ("B", 100.0, 120.0)]
    words = [
        _make_word_tuple(text, left, 100.0, right, 112.0)
        for text, left, right in shuffled
    ]
    result = build_page_structure_from_words(
        0, words, page_width=612.0, page_height=792.0
    )

    assert len(result.blocks) == 1
    assert result.blocks[0].lines[0].text == "A B C"


# ---------------------------------------------------------------------------
# 15. Spatial vs block: same simple text
# ---------------------------------------------------------------------------


def test_spatial_vs_block_same_simple_text():
    """Both extraction paths agree on the word sequence of a one-column page.

    The same two-line text is fed to ``build_page_structure`` as a PDF dict
    and to ``build_page_structure_from_words`` as word tuples; splitting the
    resulting line texts on whitespace must give identical lists.
    """

    def _dict_line(text, top, bottom):
        # One single-span line entry of the dict-style input.
        return {
            "bbox": (10.0, top, 200.0, bottom),
            "spans": [{"text": text, "font": "Arial", "size": 12.0}],
        }

    def _page_tokens(page):
        # Whitespace-split every line text, keeping block and line order.
        return [
            token
            for block in page.blocks
            for line in block.lines
            for token in line.text.split()
        ]

    # Dict-style input for build_page_structure: one text block, two lines.
    pdf_dict = {
        "width": 612.0,
        "height": 792.0,
        "blocks": [
            {
                "type": 0,
                "bbox": (10.0, 100.0, 200.0, 145.0),
                "lines": [
                    _dict_line("hello world", 100.0, 112.0),
                    _dict_line("foo bar", 130.0, 142.0),
                ],
            }
        ],
    }

    # Equivalent word tuples; trailing numbers are block_no, line_no, word_no.
    word_specs = [
        ("hello", 10.0, 100.0, 50.0, 112.0, 0, 0, 0),
        ("world", 55.0, 100.0, 100.0, 112.0, 0, 0, 1),
        ("foo", 10.0, 130.0, 40.0, 142.0, 0, 1, 0),
        ("bar", 45.0, 130.0, 80.0, 142.0, 0, 1, 1),
    ]
    word_tuples = [_make_word_tuple(*spec) for spec in word_specs]

    shared_config = StructureExtractionConfig()
    from_dict = build_page_structure(0, pdf_dict, config=shared_config)
    from_words = build_page_structure_from_words(
        0,
        word_tuples,
        config=shared_config,
        page_width=612.0,
        page_height=792.0,
    )

    assert _page_tokens(from_dict) == _page_tokens(from_words)


# ---------------------------------------------------------------------------
# 16. Integration with flatten_document_words
# ---------------------------------------------------------------------------


def test_integration_with_flatten_document_words():
    """Spatially built pages feed ``flatten_document_words`` with correct indices."""
    shared_config = StructureExtractionConfig()
    # Page 0 reads "page one", page 1 reads "page two"; same geometry on both.
    pages = [
        build_page_structure_from_words(
            page_number,
            [
                _make_word_tuple("page", 10.0, 100.0, 50.0, 112.0),
                _make_word_tuple(second_word, 55.0, 100.0, 90.0, 112.0),
            ],
            config=shared_config,
            page_width=612.0,
            page_height=792.0,
        )
        for page_number, second_word in enumerate(("one", "two"))
    ]
    document = DocumentStructure(pages=pages, config=shared_config)

    flat_words, tokens = flatten_document_words(document)

    assert flat_words == ["page", "one", "page", "two"]
    assert len(tokens) == 4
    assert tokens[0].source_page == 0
    assert tokens[2].source_page == 1
    assert tokens[0].word_index == 0
    assert tokens[3].word_index == 3


# ---------------------------------------------------------------------------
# 17.
Character replacements applied +# --------------------------------------------------------------------------- + + +def test_character_replacements_applied(): + """Character replacements are applied to word text during normalization.""" + config = StructureExtractionConfig( + character_replacements={"\u00A0": " ", "\u2013": "-"}, + ) + # Non-breaking space within a word, en-dash in another + words = [ + _make_word_tuple("hello\u00A0world", 10.0, 100.0, 100.0, 112.0), + _make_word_tuple("2020\u20132021", 110.0, 100.0, 200.0, 112.0), + ] + page = build_page_structure_from_words( + 0, words, config=config, page_width=612.0, page_height=792.0, + ) + + assert len(page.blocks) == 1 + line_text = page.blocks[0].lines[0].text + # NBSP replaced with space, then words joined + # "hello world" becomes two parts after collapse_whitespace: "hello" "world" + # so the full text depends on how the joining works + assert "\u00A0" not in line_text + assert "\u2013" not in line_text + assert "2020-2021" in line_text + + +# --------------------------------------------------------------------------- +# Additional edge cases +# --------------------------------------------------------------------------- + + +def test_block_index_and_line_index_increment(): + """Block index and global line index are sequential.""" + words = [ + _make_word_tuple("line1", 10.0, 100.0, 60.0, 112.0), + _make_word_tuple("line2", 10.0, 200.0, 60.0, 212.0), + _make_word_tuple("line3", 10.0, 300.0, 60.0, 312.0), + ] + page = build_page_structure_from_words(0, words, page_width=612.0, page_height=792.0) + + assert len(page.blocks) == 3 + for i, block in enumerate(page.blocks): + assert block.index == i + assert block.lines[0].index == i + + +def test_page_number_is_preserved(): + """The page_number argument is stored in the result.""" + page = build_page_structure_from_words(42, [], page_width=612.0, page_height=792.0) + assert page.page_number == 42 + + +def test_block_bbox_equals_line_bbox(): + """Since each 
block has exactly one line, the block bbox should match the line bbox.""" + words = [ + _make_word_tuple("hello", 10.0, 100.0, 50.0, 112.0), + _make_word_tuple("world", 55.0, 100.0, 100.0, 112.0), + ] + page = build_page_structure_from_words(0, words, page_width=612.0, page_height=792.0) + + assert len(page.blocks) == 1 + assert page.blocks[0].bbox == page.blocks[0].lines[0].bbox + + +def test_line_count_property(): + """PageStructure.line_count aggregates across all blocks.""" + words = [ + _make_word_tuple("a", 10.0, 100.0, 30.0, 112.0), + _make_word_tuple("b", 10.0, 200.0, 30.0, 212.0), + _make_word_tuple("c", 10.0, 300.0, 30.0, 312.0), + ] + page = build_page_structure_from_words(0, words, page_width=612.0, page_height=792.0) + + assert page.line_count == 3 + + +def test_spans_contain_full_line_text(): + """Each line has exactly one span whose text matches the line text.""" + words = [ + _make_word_tuple("alpha", 10.0, 100.0, 60.0, 112.0), + _make_word_tuple("beta", 70.0, 100.0, 120.0, 112.0), + ] + page = build_page_structure_from_words(0, words, page_width=612.0, page_height=792.0) + + line = page.blocks[0].lines[0] + assert len(line.spans) == 1 + assert line.spans[0].text == line.text + assert line.spans[0].font is None + assert line.spans[0].size == 0.0 + + +def test_fonts_set_is_empty(): + """Spatial word extraction does not have font info, so fonts set is empty.""" + words = [_make_word_tuple("test", 10.0, 100.0, 50.0, 112.0)] + page = build_page_structure_from_words(0, words, page_width=612.0, page_height=792.0) + + assert page.blocks[0].lines[0].fonts == set() + + +def test_whitespace_replacement_used(): + """The whitespace_replacement from config is used to join words.""" + config = StructureExtractionConfig(whitespace_replacement="|") + words = [ + _make_word_tuple("a", 10.0, 100.0, 30.0, 112.0), + _make_word_tuple("b", 40.0, 100.0, 60.0, 112.0), + ] + page = build_page_structure_from_words( + 0, words, config=config, page_width=612.0, 
page_height=792.0, + ) + + assert page.blocks[0].lines[0].text == "a|b" + + +def test_default_config_used_when_none(): + """When config is None, a default StructureExtractionConfig is used.""" + words = [_make_word_tuple("hello", 10.0, 100.0, 50.0, 112.0)] + page = build_page_structure_from_words(0, words, config=None, page_width=612.0, page_height=792.0) + + assert len(page.blocks) == 1 + assert page.blocks[0].lines[0].text == "hello" diff --git a/utest/test_structure_report.py b/utest/test_structure_report.py new file mode 100644 index 0000000..0b8ef8e --- /dev/null +++ b/utest/test_structure_report.py @@ -0,0 +1,875 @@ +"""Comprehensive unit tests for DocTest.StructureReportBuilder (ADR-003). + +Tests cover: + - Passing results returning empty strings + - Single difference types (missing, extra, mismatch, geometry) + - Hunk grouping (adjacent, separated, merge boundary) + - Context lines with/without reference_texts + - Summary statistics + - Document-level and word-level differences + - Text truncation + - HTML escaping (XSS safety) + - Large results with hunk collapse + - Metadata rendering + - Plain-text report structure + - Internal helpers (_classify_diff_type, _group_into_hunks, _escape, _truncate) +""" + +import pytest + +from DocTest.PdfStructureComparator import ( + DocumentTextDifference, + DocumentWordDifference, + LineDifference, + StructureComparisonResult, +) +from DocTest.StructureReportBuilder import ( + MAX_HUNKS_BEFORE_COLLAPSE, + MAX_TEXT_DISPLAY_LENGTH, + ReportMetadata, + ReportSummary, + _classify_diff_type, + _collect_all_diffs, + _compute_summary, + _escape, + _group_into_hunks, + _truncate, + build_structure_report, + build_structure_report_plain_text, +) + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _make_passing_result(): + """Return a StructureComparisonResult with passed=True.""" + return 
StructureComparisonResult() + + +def _make_result_with_page_diffs(diffs, page=1): + """Return a failing StructureComparisonResult with the given LineDifferences.""" + result = StructureComparisonResult() + for d in diffs: + result.add_difference(d) + return result + + +def _make_line_diff(diff_type, *, page=1, ref_text=None, cand_text=None, + deltas=None, reference_index=None, candidate_index=None, + message=None): + """Convenience factory for LineDifference.""" + if message is None: + message = f"Synthetic {diff_type}" + return LineDifference( + page=page, + diff_type=diff_type, + message=message, + ref_text=ref_text, + cand_text=cand_text, + deltas=deltas, + reference_index=reference_index, + candidate_index=candidate_index, + ) + + +# =========================================================================== +# 1 & 2 - Passing result returns empty string +# =========================================================================== + + +class TestPassingResult: + + def test_html_report_empty_for_passing_result(self): + result = _make_passing_result() + assert result.passed is True + html = build_structure_report(result) + assert html == "" + + def test_plain_report_empty_for_passing_result(self): + result = _make_passing_result() + plain = build_structure_report_plain_text(result) + assert plain == "" + + +# =========================================================================== +# 3-6 - Single differences +# =========================================================================== + + +class TestSingleDifferences: + + def test_html_report_single_missing_line(self): + diff = _make_line_diff( + "missing_line", + ref_text="vanished line", + reference_index=0, + ) + result = _make_result_with_page_diffs([diff]) + html = build_structure_report(result) + + assert "#f8d7da" in html, "Missing line should use red background #f8d7da" + assert "-" in html, "Missing line should display '-' symbol" + assert "vanished line" in html + + def 
test_html_report_single_extra_line(self): + diff = _make_line_diff( + "extra_line", + cand_text="new line appeared", + candidate_index=0, + ) + result = _make_result_with_page_diffs([diff]) + html = build_structure_report(result) + + assert "#d4edda" in html, "Extra line should use green background #d4edda" + assert "+" in html, "Extra line should display '+' symbol" + assert "new line appeared" in html + + def test_html_report_single_text_mismatch(self): + diff = _make_line_diff( + "text_mismatch", + ref_text="foo", + cand_text="bar", + reference_index=0, + candidate_index=0, + ) + result = _make_result_with_page_diffs([diff]) + html = build_structure_report(result) + + assert "#fff3cd" in html, "Text mismatch should use yellow background #fff3cd" + assert "ref:" in html, "Text mismatch should show 'ref:' label" + assert "cand:" in html, "Text mismatch should show 'cand:' label" + assert "foo" in html + assert "bar" in html + + def test_html_report_single_geometry_mismatch(self): + diff = _make_line_diff( + "geometry_mismatch", + ref_text="shifted text", + deltas={"left": 5.0}, + reference_index=0, + ) + result = _make_result_with_page_diffs([diff]) + html = build_structure_report(result) + + assert "#e2e3e5" in html, "Geometry mismatch should use grey background #e2e3e5" + # The delta symbol U+0394 + assert "\u0394" in html or "Δ" in html or "Δ" in html, \ + "Geometry mismatch should display delta symbol" + + +# =========================================================================== +# 7-9 - Grouping / Hunks +# =========================================================================== + + +class TestHunkGrouping: + + def test_adjacent_diffs_grouped_into_one_hunk(self): + """5 consecutive LineDifferences at indices 10-14 produce 1 hunk.""" + diffs = [ + _make_line_diff("missing_line", ref_text=f"line {i}", + reference_index=i) + for i in range(10, 15) + ] + result = _make_result_with_page_diffs(diffs) + html = build_structure_report(result) + + assert "Hunk 
1" in html + assert "Hunk 2" not in html + + def test_separated_diffs_produce_separate_hunks(self): + """Diffs at indices 5 and 50 produce two separate hunks.""" + diff_a = _make_line_diff("missing_line", ref_text="early", + reference_index=5) + diff_b = _make_line_diff("extra_line", cand_text="late", + candidate_index=50) + result = _make_result_with_page_diffs([diff_a, diff_b]) + html = build_structure_report(result) + + assert "Hunk 1" in html + assert "Hunk 2" in html + + def test_gap_at_merge_boundary(self): + """context_lines=3: merge_threshold = 2*3+1 = 7. + + Diffs at index 10 and 17 (gap=7) -> merged into 1 hunk. + Diffs at index 10 and 18 (gap=8) -> 2 separate hunks. + """ + # Gap = 7 => 1 hunk + d1 = _make_line_diff("missing_line", ref_text="a", reference_index=10) + d2 = _make_line_diff("missing_line", ref_text="b", reference_index=17) + result_merged = _make_result_with_page_diffs([d1, d2]) + html_merged = build_structure_report(result_merged, context_lines=3) + assert "Hunk 1" in html_merged + assert "Hunk 2" not in html_merged + + # Gap = 8 => 2 hunks + d3 = _make_line_diff("missing_line", ref_text="a", reference_index=10) + d4 = _make_line_diff("missing_line", ref_text="b", reference_index=18) + result_split = _make_result_with_page_diffs([d3, d4]) + html_split = build_structure_report(result_split, context_lines=3) + assert "Hunk 1" in html_split + assert "Hunk 2" in html_split + + +# =========================================================================== +# 10-11 - Context +# =========================================================================== + + +class TestContext: + + def test_context_shown_when_texts_provided(self): + """When reference_texts is provided, context words appear in HTML.""" + ref_texts = [f"word_{i}" for i in range(20)] + diff = _make_line_diff("missing_line", ref_text="word_10", + reference_index=10) + result = _make_result_with_page_diffs([diff]) + html = build_structure_report(result, reference_texts=ref_texts, + 
context_lines=3) + + # Context before should include words near index 10 + assert "word_7" in html or "word_8" in html or "word_9" in html, \ + "Context before the diff should be visible" + # Context after + assert "word_11" in html or "word_12" in html or "word_13" in html, \ + "Context after the diff should be visible" + + def test_no_context_when_texts_not_provided(self): + """Without reference_texts, no context divs with '...' appear.""" + diff = _make_line_diff("missing_line", ref_text="gone", + reference_index=10) + result = _make_result_with_page_diffs([diff]) + html = build_structure_report(result, reference_texts=None) + + # The "..." context wrapper should not appear + # (the only "..." might come from truncation, but there should be + # no context div with the pattern ...word...) + assert "color:#999" not in html + + +# =========================================================================== +# 12-13 - Summary statistics +# =========================================================================== + + +class TestSummaryStatistics: + + def test_summary_counts_correct(self): + """Mix of diff types yields correct ReportSummary counts.""" + result = StructureComparisonResult() + result.add_difference(_make_line_diff("missing_line", ref_text="a", + reference_index=0)) + result.add_difference(_make_line_diff("missing_line", ref_text="b", + reference_index=1)) + result.add_difference(_make_line_diff("extra_line", cand_text="c", + candidate_index=2)) + result.add_difference(_make_line_diff("text_mismatch", ref_text="d", + cand_text="e", + reference_index=3)) + result.add_difference(_make_line_diff("geometry_mismatch", + ref_text="f", + deltas={"left": 1.0}, + reference_index=4)) + + summary = _compute_summary(result) + + assert summary.missing_count == 2 + assert summary.extra_count == 1 + assert summary.mismatch_count == 1 + assert summary.geometry_count == 1 + assert summary.other_count == 0 + assert summary.total_differences == 5 + + def 
test_summary_includes_word_diffs(self): + """Word differences are counted in summary statistics.""" + result = StructureComparisonResult() + result.add_word_difference(DocumentWordDifference( + diff_type="missing_words", + message="words gone", + ref_words=["hello"], + ref_start_index=0, + ref_end_index=1, + )) + result.add_word_difference(DocumentWordDifference( + diff_type="extra_words", + message="words added", + cand_words=["world"], + cand_start_index=0, + cand_end_index=1, + )) + result.add_word_difference(DocumentWordDifference( + diff_type="word_mismatch", + message="words changed", + ref_words=["old"], + cand_words=["new"], + ref_start_index=5, + ref_end_index=6, + cand_start_index=5, + cand_end_index=6, + )) + + summary = _compute_summary(result) + + assert summary.missing_count == 1 + assert summary.extra_count == 1 + assert summary.mismatch_count == 1 + assert summary.total_differences == 3 + + +# =========================================================================== +# 14-15 - Document-level and word-level +# =========================================================================== + + +class TestDocumentAndWordLevel: + + def test_document_level_diffs_in_report(self): + """DocumentTextDifference items produce 'Document (text-only)' section.""" + result = StructureComparisonResult() + result.add_document_difference(DocumentTextDifference( + diff_type="missing_text", + message="Text missing: hello", + ref_text="hello", + ref_index=0, + )) + + html = build_structure_report(result) + assert "Document (text-only)" in html + + def test_word_level_diffs_in_report(self): + """DocumentWordDifference items produce 'Document (word-level)' section.""" + result = StructureComparisonResult() + result.add_word_difference(DocumentWordDifference( + diff_type="word_mismatch", + message="Word changed", + ref_words=["alpha"], + cand_words=["beta"], + ref_start_index=0, + ref_end_index=1, + cand_start_index=0, + cand_end_index=1, + )) + + html = 
build_structure_report(result) + assert "Document (word-level)" in html + + +# =========================================================================== +# 16 - Truncation +# =========================================================================== + + +class TestTruncation: + + def test_long_text_truncated(self): + """Diff with 500-char ref_text is truncated in HTML output.""" + long_text = "x" * 500 + diff = _make_line_diff("missing_line", ref_text=long_text, + reference_index=0) + result = _make_result_with_page_diffs([diff]) + html = build_structure_report(result) + + # The full 500-char text should NOT appear in the report + assert long_text not in html + # Instead the truncated version with "..." should + assert "..." in html + # The output should contain at most MAX_TEXT_DISPLAY_LENGTH chars + # of the original text (minus 3 for "...") + truncated = long_text[:MAX_TEXT_DISPLAY_LENGTH - 3] + "..." + assert _escape(truncated) in html + + +# =========================================================================== +# 17 - HTML safety +# =========================================================================== + + +class TestHTMLSafety: + + def test_html_special_chars_escaped(self): + """XSS payload in diff text is escaped, not rendered raw.""" + xss = "" + diff = _make_line_diff("missing_line", ref_text=xss, + reference_index=0) + result = _make_result_with_page_diffs([diff]) + html = build_structure_report(result) + + assert "