1313 RowClassifier ,
1414 create_row_classifier_chain ,
1515)
16+ from bankstatements_core .extraction .word_utils import (
17+ assign_words_to_columns ,
18+ calculate_column_coverage ,
19+ group_words_by_y ,
20+ )
1621
1722logger = logging .getLogger (__name__ )
1823
@@ -97,7 +102,8 @@ def detect_boundary(self, words: list[dict]) -> int:
97102 return self .fallback_bottom_y
98103
99104 # Phase 0: Group words and find transaction positions
100- lines = self ._group_words_by_y (words )
105+ filtered_words = [w for w in words if w ["top" ] >= self .table_top_y ]
106+ lines = group_words_by_y (filtered_words )
101107 if not lines :
102108 return self .fallback_bottom_y
103109
@@ -156,23 +162,6 @@ def detect_boundary(self, words: list[dict]) -> int:
156162 logger .debug ("No clear table end detected - using fallback boundary" )
157163 return self .fallback_bottom_y
158164
159- def _group_words_by_y (self , words : list [dict ]) -> dict [float , list [dict ]]:
160- """
161- Group words by Y-coordinate (rounded).
162-
163- Args:
164- words: List of word dictionaries with 'top', 'x0', 'text' keys
165-
166- Returns:
167- Dictionary mapping Y-coordinate to list of words at that Y
168- """
169- lines : dict [float , list [dict ]] = {}
170- for w in words :
171- if w ["top" ] >= self .table_top_y :
172- y_key = round (w ["top" ], 0 )
173- lines .setdefault (y_key , []).append (w )
174- return lines
175-
176165 def _find_transaction_positions (
177166 self , lines : dict [float , list [dict ]], sorted_y_coords : list [float ]
178167 ) -> tuple [list [float ], float | None ]:
@@ -190,7 +179,7 @@ def _find_transaction_positions(
190179 last_transaction_y = None
191180
192181 for y_coord in sorted_y_coords :
193- row = self . _build_row_from_words (lines [y_coord ])
182+ row = assign_words_to_columns (lines [y_coord ], self . columns )
194183
195184 if any (row .values ()):
196185 row_type = self ._row_classifier .classify (row , self .columns )
@@ -278,7 +267,7 @@ def _detect_by_spatial_gaps(
278267 post_gap_transactions = 0
279268
280269 for y_coord in post_gap_y_coords :
281- row = self . _build_row_from_words (lines [y_coord ])
270+ row = assign_words_to_columns (lines [y_coord ], self . columns )
282271
283272 if (
284273 any (row .values ())
@@ -320,11 +309,11 @@ def _detect_by_structure_breakdown(
320309 if last_transaction_y is not None and y_coord <= last_transaction_y :
321310 continue
322311
323- row = self . _build_row_from_words (lines [y_coord ])
312+ row = assign_words_to_columns (lines [y_coord ], self . columns )
324313
325314 if any (row .values ()):
326315 # Check if this row has any structure (data in expected columns)
327- column_coverage = self . _calculate_column_coverage ([row ])
316+ column_coverage = calculate_column_coverage ([row ], self . columns )
328317 if column_coverage < 0.3 : # Less than 30% of columns have data
329318 structure_breakdown_count += 1
330319 else :
@@ -369,7 +358,7 @@ def _detect_by_consecutive_non_transactions(
369358 if last_transaction_y is not None and y_coord <= last_transaction_y :
370359 continue
371360
372- row = self . _build_row_from_words (lines [y_coord ])
361+ row = assign_words_to_columns (lines [y_coord ], self . columns )
373362
374363 if any (row .values ()):
375364 row_type = self ._row_classifier .classify (row , self .columns )
@@ -391,48 +380,3 @@ def _detect_by_consecutive_non_transactions(
391380 )
392381
393382 return None
394-
395- def _build_row_from_words (self , words : list [dict ]) -> dict [str , str ]:
396- """
397- Build a row dictionary from words by assigning to columns.
398-
399- Args:
400- words: List of words at the same Y-coordinate
401-
402- Returns:
403- Dictionary mapping column names to concatenated text
404- """
405- row = dict .fromkeys (self .columns , "" )
406-
407- for w in words :
408- x0 = w ["x0" ]
409- text = w ["text" ]
410- for col , (xmin , xmax ) in self .columns .items ():
411- if xmin <= x0 < xmax :
412- row [col ] += text + " "
413- break
414-
415- return {k : v .strip () for k , v in row .items ()}
416-
417- def _calculate_column_coverage (self , rows : list [dict [str , str ]]) -> float :
418- """
419- Calculate what percentage of columns have data in the given rows.
420-
421- Args:
422- rows: List of row dictionaries
423-
424- Returns:
425- Float between 0.0 and 1.0 representing column coverage
426- """
427- if not rows :
428- return 0.0
429-
430- total_columns = len (self .columns )
431- columns_with_data = set ()
432-
433- for row in rows :
434- for col_name , value in row .items ():
435- if value and value .strip ():
436- columns_with_data .add (col_name )
437-
438- return len (columns_with_data ) / total_columns if total_columns > 0 else 0.0
0 commit comments