Skip to content

Commit c76f1d4

Browse files
authored
Merge pull request #970 from jeanas/autojunk
Fix two issues with fuzzy matching
2 parents 08af5e2 + c8b7ac5 commit c76f1d4

File tree

2 files changed

+60
-12
lines changed

2 files changed

+60
-12
lines changed

babel/messages/catalog.py

Lines changed: 36 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -14,8 +14,9 @@
1414
from collections import OrderedDict
1515
from collections.abc import Iterable, Iterator
1616
from copy import copy
17-
from difflib import get_close_matches
17+
from difflib import SequenceMatcher
1818
from email import message_from_string
19+
from heapq import nlargest
1920
from typing import TYPE_CHECKING
2021

2122
from babel import __version__ as VERSION
@@ -31,6 +32,31 @@
3132

3233
__all__ = ['Message', 'Catalog', 'TranslationError']
3334

35+
def get_close_matches(word, possibilities, n=3, cutoff=0.6):
    """Return the best "good enough" matches for ``word``.

    A modified version of ``difflib.get_close_matches``. The only functional
    difference is that the ``SequenceMatcher`` is created with
    ``autojunk=False``, to work around
    https://github.com/python/cpython/issues/90825.

    :param word: the sequence (typically a string) to find matches for
    :param possibilities: an iterable of candidate sequences to compare
                          against ``word``
    :param n: the maximum number of matches to return; must be > 0
    :param cutoff: the minimum similarity ratio, in ``[0.0, 1.0]``, that a
                   candidate must reach to be returned
    :return: a list of at most ``n`` candidates, most similar first
    :raises ValueError: if ``n`` is not > 0 or ``cutoff`` is outside
                        ``[0.0, 1.0]``
    """
    if not n > 0:
        raise ValueError(f"n must be > 0: {n!r}")
    if not 0.0 <= cutoff <= 1.0:
        raise ValueError(f"cutoff must be in [0.0, 1.0]: {cutoff!r}")

    scored = []
    matcher = SequenceMatcher(autojunk=False)  # the one deviation from difflib.py
    matcher.set_seq2(word)
    for candidate in possibilities:
        matcher.set_seq1(candidate)
        # The two quick ratios are upper bounds on ratio(); check them first
        # so that clearly dissimilar candidates are rejected before the
        # expensive exact computation.
        if matcher.real_quick_ratio() < cutoff or matcher.quick_ratio() < cutoff:
            continue
        score = matcher.ratio()
        if score >= cutoff:
            scored.append((score, candidate))

    # Keep the n best scorers, best first; ties are ordered by the candidate
    # itself because whole (score, candidate) tuples are compared.
    best = nlargest(n, scored)
    # Strip the scores, leaving only the matched candidates.
    return [candidate for _score, candidate in best]
59+
3460

3561
PYTHON_FORMAT = re.compile(r'''
3662
\%
@@ -803,10 +829,13 @@ def update(
803829
# Prepare for fuzzy matching
804830
fuzzy_candidates = []
805831
if not no_fuzzy_matching:
806-
fuzzy_candidates = {
807-
self._key_for(msgid): messages[msgid].context
808-
for msgid in messages if msgid and messages[msgid].string
809-
}
832+
fuzzy_candidates = {}
833+
for msgid in messages:
834+
if msgid and messages[msgid].string:
835+
key = self._key_for(msgid)
836+
ctxt = messages[msgid].context
837+
modified_key = key.lower().strip()
838+
fuzzy_candidates[modified_key] = (key, ctxt)
810839
fuzzy_matches = set()
811840

812841
def _merge(message: Message, oldkey: tuple[str, str] | str, newkey: tuple[str, str] | str) -> None:
@@ -861,8 +890,8 @@ def _merge(message: Message, oldkey: tuple[str, str] | str, newkey: tuple[str, s
861890
matches = get_close_matches(matchkey.lower().strip(),
862891
fuzzy_candidates.keys(), 1)
863892
if matches:
864-
newkey = matches[0]
865-
newctxt = fuzzy_candidates[newkey]
893+
modified_key = matches[0]
894+
newkey, newctxt = fuzzy_candidates[modified_key]
866895
if newctxt is not None:
867896
newkey = newkey, newctxt
868897
_merge(message, newkey, key)

tests/messages/test_catalog.py

Lines changed: 24 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -121,16 +121,16 @@ def test_update_message_updates_comments(self):
121121

122122
def test_update_fuzzy_matching_with_case_change(self):
123123
cat = catalog.Catalog()
124-
cat.add('foo', 'Voh')
124+
cat.add('FOO', 'Voh')
125125
cat.add('bar', 'Bahr')
126126
tmpl = catalog.Catalog()
127-
tmpl.add('Foo')
127+
tmpl.add('foo')
128128
cat.update(tmpl)
129129
assert len(cat.obsolete) == 1
130-
assert 'foo' not in cat
130+
assert 'FOO' not in cat
131131

132-
assert cat['Foo'].string == 'Voh'
133-
assert cat['Foo'].fuzzy is True
132+
assert cat['foo'].string == 'Voh'
133+
assert cat['foo'].fuzzy is True
134134

135135
def test_update_fuzzy_matching_with_char_change(self):
136136
cat = catalog.Catalog()
@@ -209,6 +209,25 @@ def test_update_fuzzy_matching_no_cascading(self):
209209
assert cat['fooo'].string == 'Vohe'
210210
assert cat['fooo'].fuzzy is True
211211

212+
def test_update_fuzzy_matching_long_string(self):
    # A long message id whose distinguishing marker moves from the front to
    # the back must still be picked up as a fuzzy match by ``update``
    # instead of being marked obsolete.
    lipsum = (
        "Lorem Ipsum is simply dummy text of the printing and typesetting "
        "industry. Lorem Ipsum has been the industry's standard dummy text ever "
        "since the 1500s, when an unknown printer took a galley of type and "
        "scrambled it to make a type specimen book. It has survived not only "
        "five centuries, but also the leap into electronic typesetting, "
        "remaining essentially unchanged. It was popularised in the 1960s with "
        "the release of Letraset sheets containing Lorem Ipsum passages, and "
        "more recently with desktop publishing software like Aldus PageMaker "
        "including versions of Lorem Ipsum."
    )
    old_msgid = "ZZZZZZ " + lipsum
    new_msgid = lipsum + " ZZZZZZ"

    translated = catalog.Catalog()
    translated.add(old_msgid, "foo")
    template = catalog.Catalog()
    template.add(new_msgid)

    translated.update(template)

    assert translated[new_msgid].fuzzy is True
    assert len(translated.obsolete) == 0
230+
212231
def test_update_without_fuzzy_matching(self):
213232
cat = catalog.Catalog()
214233
cat.add('fo', 'Voh')

0 commit comments

Comments
 (0)