diff --git a/doctr/models/recognition/utils.py b/doctr/models/recognition/utils.py index 35c80d5ddb..a5070ad68b 100644 --- a/doctr/models/recognition/utils.py +++ b/doctr/models/recognition/utils.py @@ -28,45 +28,54 @@ def merge_strings(a: str, b: str, overlap_ratio: float) -> str: 'abcdefgh' """ seq_len = min(len(a), len(b)) - if seq_len <= 1: # One sequence is empty or will be after cropping in next step, return both to keep data + if seq_len <= 1: return a + b - a_crop, b_crop = a[:-1], b[1:] # Remove last letter of "a" and first of "b", because they might be cut off + a_crop = a[:-1] + b_crop = b[1:] max_overlap = min(len(a_crop), len(b_crop)) - # Compute Hamming distances for all possible overlaps - scores = [Hamming.distance(a_crop[-i:], b_crop[:i], processor=None) for i in range(1, max_overlap + 1)] + # Expected overlap length, adjusted for cropping and index (computed once, before branching) + expected_overlap = round(len(b) * overlap_ratio) - 3 - # Find zero-score matches - zero_matches = [i for i, score in enumerate(scores) if score == 0] + # Compute Hamming distances for all possible overlaps and collect zero-score matches in a single loop + scores = [] + zero_matches = [] + for i in range(1, max_overlap + 1): + score = Hamming.distance(a_crop[-i:], b_crop[:i], processor=None) + scores.append(score) + if score == 0: + zero_matches.append(i - 1) - expected_overlap = round(len(b) * overlap_ratio) - 3 # adjust for cropping and index - - # Case 1: One perfect match - exactly one zero score - just merge there if len(zero_matches) == 1: i = zero_matches[0] return a_crop + b_crop[i + 1 :] - # Case 2: Multiple perfect matches - likely due to repeated characters. - # Use the estimated overlap length to choose the match closest to the expected alignment.
elif len(zero_matches) > 1: + # Multiple perfect matches: choose the one closest to the expected overlap best_i = min(zero_matches, key=lambda x: abs(x - expected_overlap)) return a_crop + b_crop[best_i + 1 :] - # Case 3: Absence of zero scores indicates that the same character in the image was recognized differently OR that - # the overlap was too small and we just need to merge the crops fully if expected_overlap < -1: return a + b elif expected_overlap < 0: return a_crop + b_crop - # Find best overlap by minimizing Hamming distance + distance from expected overlap size - combined_scores = [score + abs(i - expected_overlap) for i, score in enumerate(scores)] - best_i = combined_scores.index(min(combined_scores)) - return a_crop + b_crop[best_i + 1 :] + # Find the first overlap minimizing Hamming distance + distance from expected overlap size, in one pass + min_score = None + min_idx = -1 + for i, score in enumerate(scores): + combined = score + abs(i - expected_overlap) + if (min_score is None) or (combined < min_score): + min_score = combined + min_idx = i + + return a_crop + b_crop[min_idx + 1 :] -def merge_multi_strings(seq_list: list[str], overlap_ratio: float, last_overlap_ratio: float) -> str: +def merge_multi_strings( + seq_list: list[str], overlap_ratio: float, last_overlap_ratio: float +) -> str: """ Merges consecutive string sequences with overlapping characters.