Skip to content

Commit 1c9f6d2

Browse files
committed
fix(#73): introduce ExtractionWarning type; wire date-propagation and credit-card-skip events
1 parent ff6a61c commit 1c9f6d2

8 files changed

Lines changed: 157 additions & 26 deletions

File tree

packages/parser-core/src/bankstatements_core/domain/models/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -3,6 +3,7 @@
33
from __future__ import annotations
44

55
from bankstatements_core.domain.models.extraction_result import ExtractionResult
6+
from bankstatements_core.domain.models.extraction_warning import ExtractionWarning
67
from bankstatements_core.domain.models.transaction import Transaction
78

8-
__all__ = ["Transaction", "ExtractionResult"]
9+
__all__ = ["Transaction", "ExtractionResult", "ExtractionWarning"]

packages/parser-core/src/bankstatements_core/domain/models/extraction_result.py

Lines changed: 5 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -5,6 +5,7 @@
55
from dataclasses import dataclass, field
66
from pathlib import Path
77

8+
from bankstatements_core.domain.models.extraction_warning import ExtractionWarning
89
from bankstatements_core.domain.models.transaction import Transaction
910

1011

@@ -17,12 +18,13 @@ class ExtractionResult:
1718
page_count: Total number of pages in the source PDF
1819
iban: IBAN found in the document header, or None if not detected
1920
source_file: Path to the source PDF file
20-
warnings: Document-level non-fatal events (e.g. "credit card detected,
21-
skipped"). Distinct from per-row Transaction.extraction_warnings.
21+
warnings: Document-level non-fatal events (e.g. credit card detected,
22+
skipped). Distinct from per-row Transaction.extraction_warnings.
23+
In-memory only — not written to output files.
2224
"""
2325

2426
transactions: list[Transaction]
2527
page_count: int
2628
iban: str | None
2729
source_file: Path
28-
warnings: list[str] = field(default_factory=list)
30+
warnings: list[ExtractionWarning] = field(default_factory=list)

packages/parser-core/src/bankstatements_core/domain/models/extraction_warning.py

Lines changed: 43 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,43 @@
1+
"""ExtractionWarning domain model for structured pipeline warning events."""
2+
3+
from __future__ import annotations
4+
5+
from dataclasses import dataclass, field
6+
7+
# Machine-readable warning codes
8+
CODE_DATE_PROPAGATED = "DATE_PROPAGATED"
9+
CODE_CREDIT_CARD_SKIPPED = "CREDIT_CARD_SKIPPED"
10+
11+
12+
@dataclass
class ExtractionWarning:
    """A structured warning event produced during PDF extraction.

    Attributes:
        code: Machine-readable identifier (use CODE_* constants).
        message: Human-readable description of the event.
        page: Page number the warning relates to, or None if document-level.

    Examples:
        >>> w = ExtractionWarning(code=CODE_DATE_PROPAGATED,
        ...     message="date propagated from previous row ('01 Jan 2024')")
        >>> w.code
        'DATE_PROPAGATED'
    """

    code: str
    message: str
    page: int | None = field(default=None)

    def to_dict(self) -> dict[str, str | int | None]:
        """Serialise to a plain dict for JSON encoding.

        Returns:
            A dict with exactly the keys "code", "message" and "page";
            "page" is None when no page was recorded.
        """
        return {"code": self.code, "message": self.message, "page": self.page}

    @classmethod
    def from_dict(cls, data: dict) -> ExtractionWarning:
        """Deserialise from a plain dict.

        Args:
            data: Dict with required "code" and "message" keys and an
                optional "page" key.

        Returns:
            A new ExtractionWarning built from the dict's values.

        Raises:
            KeyError: If "code" or "message" is missing from ``data``.
        """
        return cls(
            code=data["code"],
            message=data["message"],
            # "page" is optional in the serialised form; absent means None.
            page=data.get("page"),
        )

packages/parser-core/src/bankstatements_core/domain/models/transaction.py

Lines changed: 21 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -10,6 +10,7 @@
1010
from decimal import Decimal, InvalidOperation
1111

1212
from bankstatements_core.domain.currency import strip_currency_symbols
13+
from bankstatements_core.domain.models.extraction_warning import ExtractionWarning
1314

1415

1516
@dataclass
@@ -52,7 +53,7 @@ class Transaction:
5253
additional_fields: dict[str, str] = field(default_factory=dict)
5354
source_page: int | None = None
5455
confidence_score: float = 1.0
55-
extraction_warnings: list[str] = field(default_factory=list)
56+
extraction_warnings: list[ExtractionWarning] = field(default_factory=list)
5657

5758
def is_debit(self) -> bool:
5859
"""Check if transaction is a debit (money out).
@@ -246,9 +247,22 @@ def from_dict(cls, data: dict[str, str | None]) -> Transaction:
246247
raw_confidence = data.get("confidence_score")
247248
confidence_score = float(raw_confidence) if raw_confidence is not None else 1.0
248249
raw_warnings = data.get("extraction_warnings")
249-
extraction_warnings = (
250-
json.loads(raw_warnings) if raw_warnings is not None else []
251-
)
250+
if raw_warnings is not None:
251+
parsed = (
252+
json.loads(raw_warnings)
253+
if isinstance(raw_warnings, str)
254+
else raw_warnings
255+
)
256+
extraction_warnings = [
257+
(
258+
ExtractionWarning.from_dict(w)
259+
if isinstance(w, dict)
260+
else ExtractionWarning(code="UNKNOWN", message=str(w))
261+
)
262+
for w in parsed
263+
]
264+
else:
265+
extraction_warnings = []
252266

253267
return cls(
254268
date=date or "",
@@ -313,7 +327,9 @@ def to_dict(self, currency_symbol: str = "€") -> dict[str, str | None]:
313327
str(self.source_page) if self.source_page is not None else None
314328
)
315329
result["confidence_score"] = str(self.confidence_score)
316-
result["extraction_warnings"] = json.dumps(self.extraction_warnings)
330+
result["extraction_warnings"] = json.dumps(
331+
[w.to_dict() for w in self.extraction_warnings]
332+
)
317333

318334
# Add any additional fields
319335
result.update(self.additional_fields)

packages/parser-core/src/bankstatements_core/extraction/pdf_extractor.py

Lines changed: 10 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -15,6 +15,10 @@
1515

1616
from bankstatements_core.domain import ExtractionResult
1717
from bankstatements_core.domain.converters import dicts_to_transactions
18+
from bankstatements_core.domain.models.extraction_warning import (
19+
CODE_CREDIT_CARD_SKIPPED,
20+
ExtractionWarning,
21+
)
1822
from bankstatements_core.extraction.iban_extractor import IBANExtractor
1923
from bankstatements_core.extraction.page_header_analyser import PageHeaderAnalyser
2024
from bankstatements_core.extraction.row_builder import RowBuilder
@@ -112,7 +116,12 @@ def extract(self, pdf_path: Path) -> ExtractionResult:
112116
page_count=len(pdf.pages),
113117
iban=None,
114118
source_file=pdf_path,
115-
warnings=["credit card statement detected, skipped"],
119+
warnings=[
120+
ExtractionWarning(
121+
code=CODE_CREDIT_CARD_SKIPPED,
122+
message="credit card statement detected, skipped",
123+
)
124+
],
116125
)
117126

118127
if iban is None and page_num == 1:

packages/parser-core/src/bankstatements_core/extraction/row_post_processor.py

Lines changed: 10 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -6,11 +6,16 @@
66

77
from __future__ import annotations
88

9+
import json
910
import logging
1011
import re
1112
from datetime import datetime
1213
from typing import TYPE_CHECKING
1314

15+
from bankstatements_core.domain.models.extraction_warning import (
16+
CODE_DATE_PROPAGATED,
17+
ExtractionWarning,
18+
)
1419
from bankstatements_core.extraction.column_identifier import ColumnTypeIdentifier
1520

1621
if TYPE_CHECKING:
@@ -87,6 +92,11 @@ def process(self, row: dict, current_date: str) -> str:
8792
if not current_date:
8893
current_date = fallback_date
8994
self._last_source = "propagated"
95+
warning = ExtractionWarning(
96+
code=CODE_DATE_PROPAGATED,
97+
message=f"date propagated from previous row ('{fallback_date}')",
98+
)
99+
row["extraction_warnings"] = json.dumps([warning.to_dict()])
90100

91101
# Metadata tagging
92102
row["Filename"] = self._filename

packages/parser-core/tests/domain/test_transaction.py

Lines changed: 59 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -6,6 +6,10 @@
66

77
import pytest
88

9+
from bankstatements_core.domain.models.extraction_warning import (
10+
CODE_DATE_PROPAGATED,
11+
ExtractionWarning,
12+
)
913
from bankstatements_core.domain.models.transaction import Transaction
1014

1115

@@ -602,17 +606,21 @@ def test_extraction_warnings_defaults_to_empty_list(self):
602606
assert tx.extraction_warnings == []
603607

604608
def test_extraction_warnings_can_be_set(self):
605-
"""TXEN-02: Transaction(extraction_warnings=['missing balance']) stores value."""
609+
"""TXEN-02: Transaction(extraction_warnings=[ExtractionWarning(...)]) stores value."""
610+
w = ExtractionWarning(
611+
code=CODE_DATE_PROPAGATED,
612+
message="date propagated from previous row ('01 Jan 2024')",
613+
)
606614
tx = Transaction(
607615
date="01/01/2024",
608616
details="Test",
609617
debit=None,
610618
credit="10.00",
611619
balance="100.00",
612620
filename="test.pdf",
613-
extraction_warnings=["missing balance"],
621+
extraction_warnings=[w],
614622
)
615-
assert tx.extraction_warnings == ["missing balance"]
623+
assert tx.extraction_warnings == [w]
616624

617625
def test_extraction_warnings_no_shared_mutable_default(self):
618626
"""TXEN-02: Two Transaction() instances have separate extraction_warnings lists."""
@@ -632,21 +640,36 @@ def test_extraction_warnings_no_shared_mutable_default(self):
632640
balance="120.00",
633641
filename="test.pdf",
634642
)
635-
tx1.extraction_warnings.append("warning")
643+
tx1.extraction_warnings.append(
644+
ExtractionWarning(code=CODE_DATE_PROPAGATED, message="test")
645+
)
636646
assert tx2.extraction_warnings == []
637647

638648
def test_to_dict_extraction_warnings_serialises_as_json_string(self):
639-
"""TXEN-02: to_dict() with extraction_warnings=['missing balance'] → JSON string."""
649+
"""TXEN-02: to_dict() with an ExtractionWarning → JSON string of list of dicts."""
650+
w = ExtractionWarning(
651+
code=CODE_DATE_PROPAGATED,
652+
message="date propagated from previous row ('01 Jan 2024')",
653+
)
640654
tx = Transaction(
641655
date="01/01/2024",
642656
details="Test",
643657
debit=None,
644658
credit="10.00",
645659
balance="100.00",
646660
filename="test.pdf",
647-
extraction_warnings=["missing balance"],
661+
extraction_warnings=[w],
648662
)
649-
assert tx.to_dict()["extraction_warnings"] == '["missing balance"]'
663+
import json
664+
665+
serialised = json.loads(tx.to_dict()["extraction_warnings"])
666+
assert serialised == [
667+
{
668+
"code": CODE_DATE_PROPAGATED,
669+
"message": "date propagated from previous row ('01 Jan 2024')",
670+
"page": None,
671+
}
672+
]
650673

651674
def test_to_dict_extraction_warnings_empty_serialises_as_json_array(self):
652675
"""TXEN-02: to_dict() with extraction_warnings=[] → '[]'."""
@@ -661,16 +684,25 @@ def test_to_dict_extraction_warnings_empty_serialises_as_json_array(self):
661684
assert tx.to_dict()["extraction_warnings"] == "[]"
662685

663686
def test_from_dict_extraction_warnings_parses_json_string(self):
664-
"""TXEN-02: from_dict({'extraction_warnings': '["missing balance"]'}) → list."""
687+
"""TXEN-02: from_dict with a JSON-encoded ExtractionWarning → list[ExtractionWarning]."""
688+
import json
689+
690+
w = {
691+
"code": CODE_DATE_PROPAGATED,
692+
"message": "date propagated from previous row ('01 Jan 2024')",
693+
"page": None,
694+
}
665695
tx = Transaction.from_dict(
666696
{
667697
"Date": "01/01/2024",
668698
"Details": "Test",
669699
"Filename": "test.pdf",
670-
"extraction_warnings": '["missing balance"]',
700+
"extraction_warnings": json.dumps([w]),
671701
}
672702
)
673-
assert tx.extraction_warnings == ["missing balance"]
703+
assert len(tx.extraction_warnings) == 1
704+
assert isinstance(tx.extraction_warnings[0], ExtractionWarning)
705+
assert tx.extraction_warnings[0].code == CODE_DATE_PROPAGATED
674706

675707
def test_from_dict_extraction_warnings_absent_defaults_to_empty_list(self):
676708
"""TXEN-02: from_dict({}) (key absent) → extraction_warnings == []."""
@@ -684,26 +716,35 @@ def test_from_dict_extraction_warnings_absent_defaults_to_empty_list(self):
684716

685717
def test_extraction_warnings_roundtrip(self):
686718
"""TXEN-02: from_dict(tx.to_dict()) preserves extraction_warnings."""
719+
w = ExtractionWarning(
720+
code=CODE_DATE_PROPAGATED,
721+
message="date propagated from previous row ('01 Jan 2024')",
722+
)
687723
original = Transaction(
688724
date="01/01/2024",
689725
details="Test",
690726
debit=None,
691727
credit="10.00",
692728
balance="100.00",
693729
filename="test.pdf",
694-
extraction_warnings=["missing balance"],
730+
extraction_warnings=[w],
695731
)
696732
restored = Transaction.from_dict(original.to_dict())
697-
assert restored.extraction_warnings == ["missing balance"]
733+
assert len(restored.extraction_warnings) == 1
734+
assert restored.extraction_warnings[0].code == CODE_DATE_PROPAGATED
735+
assert restored.extraction_warnings[0].message == w.message
698736

699737
def test_extraction_warnings_not_in_additional_fields(self):
700738
"""TXEN-02: extraction_warnings JSON key is gated by standard_keys — not absorbed into additional_fields."""
739+
import json
740+
741+
w = {"code": CODE_DATE_PROPAGATED, "message": "test", "page": None}
701742
tx = Transaction.from_dict(
702743
{
703744
"Date": "01/01/2024",
704745
"Details": "Test",
705746
"Filename": "test.pdf",
706-
"extraction_warnings": '["x"]',
747+
"extraction_warnings": json.dumps([w]),
707748
}
708749
)
709750
assert "extraction_warnings" not in tx.additional_fields
@@ -822,12 +863,15 @@ def test_full_roundtrip_all_three_enrichment_fields(self):
822863
filename="statement.pdf",
823864
source_page=3,
824865
confidence_score=0.8,
825-
extraction_warnings=["missing balance"],
866+
extraction_warnings=[
867+
ExtractionWarning(code=CODE_DATE_PROPAGATED, message="missing balance")
868+
],
826869
)
827870
restored = Transaction.from_dict(original.to_dict())
828871
assert restored.source_page == 3
829872
assert restored.confidence_score == 0.8
830-
assert restored.extraction_warnings == ["missing balance"]
873+
assert len(restored.extraction_warnings) == 1
874+
assert restored.extraction_warnings[0].code == CODE_DATE_PROPAGATED
831875

832876
def test_backward_compat_old_dict_without_new_keys(self):
833877
"""TXEN-04: Old dict without new keys → from_dict() succeeds with defaults."""

packages/parser-core/tests/test_credit_card_detection.py

Lines changed: 7 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -8,6 +8,10 @@
88
import pytest
99

1010
from bankstatements_core.domain import ExtractionResult
11+
from bankstatements_core.domain.models.extraction_warning import (
12+
CODE_CREDIT_CARD_SKIPPED,
13+
ExtractionWarning,
14+
)
1115
from bankstatements_core.extraction.pdf_extractor import PDFTableExtractor
1216

1317
# Test columns configuration
@@ -359,4 +363,6 @@ def test_credit_card_early_return_produces_extraction_result_with_warning(
359363
assert len(result.transactions) == 0
360364
assert result.iban is None
361365
assert len(result.warnings) > 0
362-
assert "credit card" in result.warnings[0].lower()
366+
assert isinstance(result.warnings[0], ExtractionWarning)
367+
assert result.warnings[0].code == CODE_CREDIT_CARD_SKIPPED
368+
assert "credit card" in result.warnings[0].message.lower()

0 commit comments

Comments
 (0)