Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 27 additions & 18 deletions doctr/models/recognition/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,45 +28,54 @@ def merge_strings(a: str, b: str, overlap_ratio: float) -> str:
'abcdefgh'
"""
seq_len = min(len(a), len(b))
if seq_len <= 1: # One sequence is empty or will be after cropping in next step, return both to keep data
if seq_len <= 1:
return a + b

a_crop, b_crop = a[:-1], b[1:] # Remove last letter of "a" and first of "b", because they might be cut off
a_crop = a[:-1]
b_crop = b[1:]
max_overlap = min(len(a_crop), len(b_crop))

# Compute Hamming distances for all possible overlaps
scores = [Hamming.distance(a_crop[-i:], b_crop[:i], processor=None) for i in range(1, max_overlap + 1)]
# Precompute expected_overlap outside of branching:
expected_overlap = round(len(b) * overlap_ratio) - 3

# Find zero-score matches
zero_matches = [i for i, score in enumerate(scores) if score == 0]
# Build scores and zero_matches together in a single loop (zero_matches stores 0-based indices into scores)
scores = []
zero_matches = []
for i in range(1, max_overlap + 1):
score = Hamming.distance(a_crop[-i:], b_crop[:i], processor=None)
scores.append(score)
if score == 0:
zero_matches.append(i - 1)

expected_overlap = round(len(b) * overlap_ratio) - 3 # adjust for cropping and index

# Case 1: One perfect match - exactly one zero score - just merge there
if len(zero_matches) == 1:
i = zero_matches[0]
return a_crop + b_crop[i + 1 :]

# Case 2: Multiple perfect matches - likely due to repeated characters.
# Use the estimated overlap length to choose the match closest to the expected alignment.
elif len(zero_matches) > 1:
# min() with a key function selects the closest match directly, without building an intermediate list
best_i = min(zero_matches, key=lambda x: abs(x - expected_overlap))
return a_crop + b_crop[best_i + 1 :]

# Case 3: Absence of zero scores indicates that the same character in the image was recognized differently OR that
# the overlap was too small and we just need to merge the crops fully
if expected_overlap < -1:
return a + b
elif expected_overlap < 0:
return a_crop + b_crop

# Find best overlap by minimizing Hamming distance + distance from expected overlap size
combined_scores = [score + abs(i - expected_overlap) for i, score in enumerate(scores)]
best_i = combined_scores.index(min(combined_scores))
return a_crop + b_crop[best_i + 1 :]
# Track the minimum combined score and its index in one pass (strict "<" keeps the first minimum on ties)
min_score = None
min_idx = -1
for i, score in enumerate(scores):
combined = score + abs(i - expected_overlap)
if (min_score is None) or (combined < min_score):
min_score = combined
min_idx = i

return a_crop + b_crop[min_idx + 1 :]


def merge_multi_strings(seq_list: list[str], overlap_ratio: float, last_overlap_ratio: float) -> str:
def merge_multi_strings(
seq_list: list[str], overlap_ratio: float, last_overlap_ratio: float
) -> str:
"""
Merges consecutive string sequences with overlapping characters.

Expand Down