Skip to content

Commit 09589e8

Browse files
authored
Merge pull request #36 from longieirl/feat/32-scoring-config-injectable
feat(#32): make ScoringConfig injectable in TemplateDetector
2 parents f1e9ef1 + 60df9ad commit 09589e8

27 files changed

Lines changed: 1474 additions & 557 deletions

packages/parser-core/src/bankstatements_core/extraction/boundary_detector.py

Lines changed: 12 additions & 68 deletions
Original file line number | Diff line number | Diff line change
@@ -13,6 +13,11 @@
1313
RowClassifier,
1414
create_row_classifier_chain,
1515
)
16+
from bankstatements_core.extraction.word_utils import (
17+
assign_words_to_columns,
18+
calculate_column_coverage,
19+
group_words_by_y,
20+
)
1621

1722
logger = logging.getLogger(__name__)
1823

@@ -97,7 +102,8 @@ def detect_boundary(self, words: list[dict]) -> int:
97102
return self.fallback_bottom_y
98103

99104
# Phase 0: Group words and find transaction positions
100-
lines = self._group_words_by_y(words)
105+
filtered_words = [w for w in words if w["top"] >= self.table_top_y]
106+
lines = group_words_by_y(filtered_words)
101107
if not lines:
102108
return self.fallback_bottom_y
103109

@@ -156,23 +162,6 @@ def detect_boundary(self, words: list[dict]) -> int:
156162
logger.debug("No clear table end detected - using fallback boundary")
157163
return self.fallback_bottom_y
158164

159-
def _group_words_by_y(self, words: list[dict]) -> dict[float, list[dict]]:
160-
"""
161-
Group words by Y-coordinate (rounded).
162-
163-
Args:
164-
words: List of word dictionaries with 'top', 'x0', 'text' keys
165-
166-
Returns:
167-
Dictionary mapping Y-coordinate to list of words at that Y
168-
"""
169-
lines: dict[float, list[dict]] = {}
170-
for w in words:
171-
if w["top"] >= self.table_top_y:
172-
y_key = round(w["top"], 0)
173-
lines.setdefault(y_key, []).append(w)
174-
return lines
175-
176165
def _find_transaction_positions(
177166
self, lines: dict[float, list[dict]], sorted_y_coords: list[float]
178167
) -> tuple[list[float], float | None]:
@@ -190,7 +179,7 @@ def _find_transaction_positions(
190179
last_transaction_y = None
191180

192181
for y_coord in sorted_y_coords:
193-
row = self._build_row_from_words(lines[y_coord])
182+
row = assign_words_to_columns(lines[y_coord], self.columns)
194183

195184
if any(row.values()):
196185
row_type = self._row_classifier.classify(row, self.columns)
@@ -278,7 +267,7 @@ def _detect_by_spatial_gaps(
278267
post_gap_transactions = 0
279268

280269
for y_coord in post_gap_y_coords:
281-
row = self._build_row_from_words(lines[y_coord])
270+
row = assign_words_to_columns(lines[y_coord], self.columns)
282271

283272
if (
284273
any(row.values())
@@ -320,11 +309,11 @@ def _detect_by_structure_breakdown(
320309
if last_transaction_y is not None and y_coord <= last_transaction_y:
321310
continue
322311

323-
row = self._build_row_from_words(lines[y_coord])
312+
row = assign_words_to_columns(lines[y_coord], self.columns)
324313

325314
if any(row.values()):
326315
# Check if this row has any structure (data in expected columns)
327-
column_coverage = self._calculate_column_coverage([row])
316+
column_coverage = calculate_column_coverage([row], self.columns)
328317
if column_coverage < 0.3: # Less than 30% of columns have data
329318
structure_breakdown_count += 1
330319
else:
@@ -369,7 +358,7 @@ def _detect_by_consecutive_non_transactions(
369358
if last_transaction_y is not None and y_coord <= last_transaction_y:
370359
continue
371360

372-
row = self._build_row_from_words(lines[y_coord])
361+
row = assign_words_to_columns(lines[y_coord], self.columns)
373362

374363
if any(row.values()):
375364
row_type = self._row_classifier.classify(row, self.columns)
@@ -391,48 +380,3 @@ def _detect_by_consecutive_non_transactions(
391380
)
392381

393382
return None
394-
395-
def _build_row_from_words(self, words: list[dict]) -> dict[str, str]:
396-
"""
397-
Build a row dictionary from words by assigning to columns.
398-
399-
Args:
400-
words: List of words at the same Y-coordinate
401-
402-
Returns:
403-
Dictionary mapping column names to concatenated text
404-
"""
405-
row = dict.fromkeys(self.columns, "")
406-
407-
for w in words:
408-
x0 = w["x0"]
409-
text = w["text"]
410-
for col, (xmin, xmax) in self.columns.items():
411-
if xmin <= x0 < xmax:
412-
row[col] += text + " "
413-
break
414-
415-
return {k: v.strip() for k, v in row.items()}
416-
417-
def _calculate_column_coverage(self, rows: list[dict[str, str]]) -> float:
418-
"""
419-
Calculate what percentage of columns have data in the given rows.
420-
421-
Args:
422-
rows: List of row dictionaries
423-
424-
Returns:
425-
Float between 0.0 and 1.0 representing column coverage
426-
"""
427-
if not rows:
428-
return 0.0
429-
430-
total_columns = len(self.columns)
431-
columns_with_data = set()
432-
433-
for row in rows:
434-
for col_name, value in row.items():
435-
if value and value.strip():
436-
columns_with_data.add(col_name)
437-
438-
return len(columns_with_data) / total_columns if total_columns > 0 else 0.0

packages/parser-core/src/bankstatements_core/extraction/extraction_facade.py

Lines changed: 4 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -11,6 +11,7 @@
1111
from typing import TYPE_CHECKING
1212

1313
from bankstatements_core.config.column_config import DEFAULT_COLUMNS
14+
from bankstatements_core.domain import ExtractionResult
1415
from bankstatements_core.extraction.extraction_params import TABLE_BOTTOM_Y, TABLE_TOP_Y
1516

1617
if TYPE_CHECKING:
@@ -70,7 +71,7 @@ def extract_tables_from_pdf(
7071
enable_page_validation: bool | None = None,
7172
enable_header_check: bool | None = None,
7273
template: "BankTemplate" | None = None,
73-
) -> tuple[list[dict], int, str | None]:
74+
) -> ExtractionResult:
7475
"""
7576
Extract table data from PDF within specified bounds (facade function).
7677
@@ -87,7 +88,8 @@ def extract_tables_from_pdf(
8788
template: Optional BankTemplate to use for extraction configuration
8889
8990
Returns:
90-
Tuple of (extracted rows, number of pages, IBAN if found)
91+
ExtractionResult containing extracted transactions, page count, IBAN,
92+
source file path, and any document-level warnings
9193
"""
9294
from bankstatements_core.extraction.pdf_extractor import PDFTableExtractor
9395

packages/parser-core/src/bankstatements_core/extraction/pdf_extractor.py

Lines changed: 18 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -13,6 +13,8 @@
1313
if TYPE_CHECKING:
1414
from bankstatements_core.domain.protocols.pdf_reader import IPDFReader
1515

16+
from bankstatements_core.domain import ExtractionResult
17+
from bankstatements_core.domain.converters import dicts_to_transactions
1618
from bankstatements_core.extraction.iban_extractor import IBANExtractor
1719
from bankstatements_core.extraction.page_header_analyser import PageHeaderAnalyser
1820
from bankstatements_core.extraction.row_builder import RowBuilder
@@ -72,14 +74,15 @@ def __init__(
7274
else:
7375
self._pdf_reader = pdf_reader
7476

75-
def extract(self, pdf_path: Path) -> tuple[list[dict], int, str | None]:
77+
def extract(self, pdf_path: Path) -> ExtractionResult:
7678
"""Extract table data from PDF file.
7779
7880
Args:
7981
pdf_path: Path to the PDF file
8082
8183
Returns:
82-
Tuple of (extracted rows, total page count, IBAN if found)
84+
ExtractionResult containing extracted transactions, page count,
85+
IBAN if found, source file path, and any document-level warnings
8386
"""
8487
rows: list[dict] = []
8588
iban = None
@@ -104,7 +107,13 @@ def extract(self, pdf_path: Path) -> tuple[list[dict], int, str | None]:
104107
f"Credit card statement detected in {pdf_path.name}. "
105108
f"Credit card statements are not currently supported. Skipping file."
106109
)
107-
return [], len(pdf.pages), None
110+
return ExtractionResult(
111+
transactions=[],
112+
page_count=len(pdf.pages),
113+
iban=None,
114+
source_file=pdf_path,
115+
warnings=["credit card statement detected, skipped"],
116+
)
108117

109118
if iban is None and page_num == 1:
110119
iban = self._header_analyser.extract_iban(page)
@@ -124,7 +133,12 @@ def extract(self, pdf_path: Path) -> tuple[list[dict], int, str | None]:
124133

125134
rows.extend(page_processor.process_page(page_rows))
126135

127-
return rows, len(pdf.pages), iban
136+
return ExtractionResult(
137+
transactions=dicts_to_transactions(rows),
138+
page_count=len(pdf.pages),
139+
iban=iban,
140+
source_file=pdf_path,
141+
)
128142

129143
def _extract_page(self, page: Any, page_num: int) -> list[dict] | None:
130144
"""Extract rows from a single page.

packages/parser-core/src/bankstatements_core/extraction/row_builder.py

Lines changed: 9 additions & 27 deletions
Original file line number | Diff line number | Diff line change
@@ -9,6 +9,11 @@
99
import logging
1010
from typing import TYPE_CHECKING
1111

12+
from bankstatements_core.extraction.word_utils import (
13+
assign_words_to_columns,
14+
group_words_by_y,
15+
)
16+
1217
if TYPE_CHECKING:
1318
from bankstatements_core.extraction.row_classifiers import RowClassifier
1419

@@ -31,8 +36,6 @@ def __init__(
3136
) -> None:
3237
self._columns = columns
3338
self._row_classifier = row_classifier
34-
self._column_names = list(columns.keys())
35-
self._rightmost_column = self._column_names[-1] if self._column_names else None
3639

3740
def build_rows(self, words: list[dict]) -> list[dict]:
3841
"""Group words by Y position, assign to columns, return transaction/continuation rows.
@@ -43,35 +46,14 @@ def build_rows(self, words: list[dict]) -> list[dict]:
4346
Returns:
4447
List of row dictionaries classified as 'transaction' or 'continuation'
4548
"""
46-
lines: dict[float, list[dict]] = {}
47-
for w in words:
48-
y_key = round(w["top"], 0)
49-
lines.setdefault(y_key, []).append(w)
50-
49+
lines = group_words_by_y(words)
5150
page_rows = []
5251
for _, line_words in sorted(lines.items()):
53-
row = dict.fromkeys(self._columns, "")
54-
55-
for w in line_words:
56-
x0 = w["x0"]
57-
x1 = w.get("x1", x0 + max(len(w["text"]) * 3, 10))
58-
text = w["text"]
59-
60-
for col, (xmin, xmax) in self._columns.items():
61-
if col == self._rightmost_column:
62-
if xmin <= x0 and x1 <= xmax:
63-
row[col] += text + " "
64-
break
65-
else:
66-
if xmin <= x0 < xmax:
67-
row[col] += text + " "
68-
break
69-
70-
row = {k: v.strip() for k, v in row.items()}
71-
52+
row = assign_words_to_columns(
53+
line_words, self._columns, strict_rightmost=True
54+
)
7255
if any(row.values()):
7356
row_type = self._row_classifier.classify(row, self._columns)
7457
if row_type in ["transaction", "continuation"]:
7558
page_rows.append(row)
76-
7759
return page_rows

0 commit comments

Comments
 (0)