Skip to content

Commit f978baa

Browse files
authored
Merge pull request #39 from longieirl/fix/29-classifier-registry
fix(#29): add ClassifierRegistry with explicit priorities
2 parents f4ce3c0 + dee335a commit f978baa

2 files changed

Lines changed: 184 additions & 26 deletions

File tree

packages/parser-core/src/bankstatements_core/extraction/row_classifiers.py

Lines changed: 68 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99

1010
import re
1111
from abc import ABC, abstractmethod
12+
from typing import Sequence
1213

1314
from bankstatements_core.extraction.column_identifier import ColumnTypeIdentifier
1415

@@ -349,37 +350,78 @@ def _do_classify(
349350
return "metadata"
350351

351352

353+
class ClassifierRegistry:
354+
"""Wires a RowClassifier chain from an explicit priority list.
355+
356+
Priorities are integers; lower = checked first.
357+
No two classifiers may share a priority (raises ValueError at construction).
358+
build_chain() returns the standard RowClassifier head — callers are unchanged.
359+
"""
360+
361+
def __init__(
362+
self,
363+
classifiers: Sequence[tuple[int, type[RowClassifier]]],
364+
) -> None:
365+
"""
366+
Args:
367+
classifiers: Sequence of (priority, classifier_class) pairs.
368+
369+
Raises:
370+
ValueError: If any two classifiers share the same priority.
371+
TypeError: If a class is not a RowClassifier subclass.
372+
"""
373+
seen_priorities: dict[int, str] = {}
374+
for priority, cls in classifiers:
375+
if not (isinstance(cls, type) and issubclass(cls, RowClassifier)):
376+
raise TypeError(f"{cls!r} is not a RowClassifier subclass")
377+
if priority in seen_priorities:
378+
raise ValueError(
379+
f"priority {priority} already assigned to "
380+
f"{seen_priorities[priority]}, cannot also assign to "
381+
f"{cls.__name__}"
382+
)
383+
seen_priorities[priority] = cls.__name__
384+
385+
self._classifiers: list[tuple[int, type[RowClassifier]]] = sorted(
386+
classifiers, key=lambda x: x[0]
387+
)
388+
389+
def build_chain(self) -> RowClassifier:
390+
"""Instantiate classifiers in priority order, wire set_next(), return head."""
391+
instances = [cls() for _, cls in self._classifiers]
392+
for i in range(len(instances) - 1):
393+
instances[i].set_next(instances[i + 1])
394+
return instances[0]
395+
396+
def get_priority_order(self) -> list[tuple[int, str]]:
397+
"""Return [(priority, class_name), ...] in execution order."""
398+
return [(priority, cls.__name__) for priority, cls in self._classifiers]
399+
400+
352401
def create_row_classifier_chain() -> RowClassifier:
353402
"""
354403
Create and configure the chain of row classifiers.
355404
356-
The order matters - classifiers are applied in sequence:
357-
1. HeaderMetadataClassifier - Detect column headers first
358-
2. AdministrativeClassifier - Detect administrative entries
359-
3. ReferenceCodeClassifier - Detect reference codes
360-
4. FXContinuationClassifier - Detect FX continuation lines
361-
5. TimestampMetadataClassifier - Detect timestamps
362-
6. TransactionClassifier - Detect transactions
363-
7. DefaultMetadataClassifier - Catch-all for anything else
405+
Priority order (0 = checked first):
406+
0 HeaderMetadataClassifier column headers and field labels
407+
1 AdministrativeClassifier — BALANCE FORWARD, Lending @
408+
2 ReferenceCodeClassifier — IE123456 patterns
409+
3 FXContinuationClassifier FX rates, fees, exchange lines
410+
4 TimestampMetadataClassifier — 01JAN2023 TIME 14:30
411+
5 TransactionClassifier — debit/credit/date combinations
412+
6 DefaultMetadataClassifier — catch-all fallback
364413
365414
Returns:
366415
The head of the classifier chain
367416
"""
368-
# Create classifiers
369-
header_classifier = HeaderMetadataClassifier()
370-
admin_classifier = AdministrativeClassifier()
371-
reference_classifier = ReferenceCodeClassifier()
372-
fx_classifier = FXContinuationClassifier()
373-
timestamp_classifier = TimestampMetadataClassifier()
374-
transaction_classifier = TransactionClassifier()
375-
default_classifier = DefaultMetadataClassifier()
376-
377-
# Build the chain
378-
header_classifier.set_next(admin_classifier)
379-
admin_classifier.set_next(reference_classifier)
380-
reference_classifier.set_next(fx_classifier)
381-
fx_classifier.set_next(timestamp_classifier)
382-
timestamp_classifier.set_next(transaction_classifier)
383-
transaction_classifier.set_next(default_classifier)
384-
385-
return header_classifier
417+
return ClassifierRegistry(
418+
[
419+
(0, HeaderMetadataClassifier),
420+
(1, AdministrativeClassifier),
421+
(2, ReferenceCodeClassifier),
422+
(3, FXContinuationClassifier),
423+
(4, TimestampMetadataClassifier),
424+
(5, TransactionClassifier),
425+
(6, DefaultMetadataClassifier),
426+
]
427+
).build_chain()

packages/parser-core/tests/extraction/test_row_classifiers.py

Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66

77
from bankstatements_core.extraction.row_classifiers import (
88
AdministrativeClassifier,
9+
ClassifierRegistry,
910
DefaultMetadataClassifier,
1011
FXContinuationClassifier,
1112
HeaderMetadataClassifier,
@@ -390,3 +391,118 @@ def test_looks_like_date(self):
390391
assert classifier._looks_like_date("15 December") is True # Full month name
391392
assert classifier._looks_like_date("01JAN2023") is True
392393
assert classifier._looks_like_date("Not a date") is False
394+
395+
396+
class TestClassifierRegistry:
397+
"""Tests for ClassifierRegistry."""
398+
399+
def test_classifier_priority_order(self):
400+
"""get_priority_order() reflects the declared priority sequence."""
401+
registry = ClassifierRegistry(
402+
[
403+
(0, HeaderMetadataClassifier),
404+
(1, AdministrativeClassifier),
405+
(2, ReferenceCodeClassifier),
406+
(3, FXContinuationClassifier),
407+
(4, TimestampMetadataClassifier),
408+
(5, TransactionClassifier),
409+
(6, DefaultMetadataClassifier),
410+
]
411+
)
412+
order = registry.get_priority_order()
413+
assert order[0] == (0, "HeaderMetadataClassifier")
414+
assert order[5] == (5, "TransactionClassifier")
415+
assert order[6] == (6, "DefaultMetadataClassifier")
416+
417+
def test_duplicate_priority_raises(self):
418+
"""Duplicate priorities raise ValueError at construction time."""
419+
with pytest.raises(ValueError, match="priority 0 already assigned"):
420+
ClassifierRegistry(
421+
[
422+
(0, HeaderMetadataClassifier),
423+
(0, TransactionClassifier),
424+
]
425+
)
426+
427+
def test_non_classifier_subclass_raises(self):
428+
"""Passing a non-RowClassifier class raises TypeError."""
429+
with pytest.raises(TypeError):
430+
ClassifierRegistry([(0, object)]) # type: ignore[list-item]
431+
432+
def test_build_chain_returns_head(self):
433+
"""build_chain() returns a RowClassifier instance."""
434+
registry = ClassifierRegistry(
435+
[
436+
(0, HeaderMetadataClassifier),
437+
(1, DefaultMetadataClassifier),
438+
]
439+
)
440+
head = registry.build_chain()
441+
assert isinstance(head, RowClassifier)
442+
assert isinstance(head, HeaderMetadataClassifier)
443+
444+
def test_priorities_sorted_regardless_of_input_order(self):
445+
"""Registry sorts by priority even if input is unordered."""
446+
registry = ClassifierRegistry(
447+
[
448+
(5, TransactionClassifier),
449+
(0, HeaderMetadataClassifier),
450+
(6, DefaultMetadataClassifier),
451+
]
452+
)
453+
order = registry.get_priority_order()
454+
assert order[0] == (0, "HeaderMetadataClassifier")
455+
assert order[1] == (5, "TransactionClassifier")
456+
assert order[2] == (6, "DefaultMetadataClassifier")
457+
458+
@pytest.mark.parametrize(
459+
"row,expected,reason",
460+
[
461+
(
462+
{"Date": "date", "Details": "Purchase", "Debit €": "50.00"},
463+
"metadata",
464+
"HeaderMetadata (0) beats Transaction (5) for header-like date value",
465+
),
466+
(
467+
{
468+
"Date": "",
469+
"Details": "BALANCE FORWARD",
470+
"Debit €": "",
471+
"Credit €": "",
472+
"Balance €": "",
473+
},
474+
"administrative",
475+
"Administrative (1) beats Transaction (5) for BALANCE FORWARD with no balance",
476+
),
477+
(
478+
{"Date": "", "Details": "0.828571", "Debit €": "", "Credit €": ""},
479+
"continuation",
480+
"FXContinuation (3) beats Transaction (5) for exchange-rate-only rows",
481+
),
482+
],
483+
)
484+
def test_ambiguous_row_priority(self, row, expected, reason):
485+
"""Ambiguous rows resolve to the highest-priority (lowest number) classifier."""
486+
chain = create_row_classifier_chain()
487+
assert chain.classify(row, TEST_COLUMNS) == expected, reason
488+
489+
def test_wrong_order_produces_wrong_result(self):
490+
"""Documents that priority order is not arbitrary — regression guard."""
491+
wrong_order_chain = ClassifierRegistry(
492+
[
493+
(0, TransactionClassifier),
494+
(1, HeaderMetadataClassifier),
495+
(2, AdministrativeClassifier),
496+
(3, ReferenceCodeClassifier),
497+
(4, FXContinuationClassifier),
498+
(5, TimestampMetadataClassifier),
499+
(6, DefaultMetadataClassifier),
500+
]
501+
).build_chain()
502+
503+
ambiguous = {"Date": "date", "Details": "Purchase", "Debit €": "50.00"}
504+
# With Transaction first, it wins over Header
505+
assert wrong_order_chain.classify(ambiguous, TEST_COLUMNS) == "transaction"
506+
# Confirms the correct chain must put HeaderMetadata first
507+
correct_chain = create_row_classifier_chain()
508+
assert correct_chain.classify(ambiguous, TEST_COLUMNS) == "metadata"

0 commit comments

Comments
 (0)