Ignore digit-related sentences

common-voice · Feb 26, 2019 · 85216e5 · 85216e5
1 parent 7f37c78
commit 85216e5
Show file tree

Hide file tree

Showing 4 changed files with 5 additions and 36 deletions.
diff --git a/setup.cfg b/setup.cfg
@@ -29,7 +29,6 @@ setup_requires = pyscaffold>=3.1a0,<3.2a0
 # Add here dependencies of your project (semicolon/line-separated), e.g.
 install_requires = pandas
                    swifter
-                   num2words==0.5.9
 # The usage of test_requires is discouraged, see `Dependency Management` docs
 # tests_require = pytest; pytest-cov
 # Require a specific Python version, e.g. Python 2.7 or >= 3.4

diff --git a/src/corporacreator/preprocessors/fr.py b/src/corporacreator/preprocessors/fr.py
@@ -1,8 +1,7 @@
 import re
 
-from corporacreator.utils import maybe_normalize, replace_numbers, FIND_MULTIPLE_SPACES_REG
+from corporacreator.utils import maybe_normalize, FIND_MULTIPLE_SPACES_REG
 
-FIND_ORDINAL_REG = re.compile(r"(\d+)([ème|éme|ieme|ier|iere]+)")
 
 SPELLED_ACRONYMS = {
     'ANPE',
@@ -68,7 +67,6 @@ def fr(client_id, sentence):
       (str): Cleaned up sentence. Returning None or a `str` of whitespace flags the sentence as invalid.
     """
     text = maybe_normalize(sentence, mapping=FR_NORMALIZATIONS + [REPLACE_SPELLED_ACRONYMS])
-    text = replace_numbers(text, locale='fr', ordinal_regex=FIND_ORDINAL_REG)
     # TODO: restore this once we are clear on which punctuation marks should be kept or removed
     # text = FIND_PUNCTUATIONS_REG.sub(' ', text)
     text = FIND_MULTIPLE_SPACES_REG.sub(' ', text)

diff --git a/src/corporacreator/utils.py b/src/corporacreator/utils.py
@@ -4,41 +4,10 @@
 from num2words import num2words
 
 
-NUMS_REGEX = re.compile(r"(\d+,?\u00A0?\d+)|(\d+\w+)|(\d)+")
 FIND_MULTIPLE_SPACES_REG = re.compile(r'\s{2,}')
 FIND_PUNCTUATIONS_REG = re.compile(r"[/°\-,;!?.()\[\]*…—«»]")
 
 
-def get_numbers(text):
-    return NUMS_REGEX.split(text)
-
-
-def replace_numbers(inp: str, locale: str, ordinal_regex: Pattern = None):
-    finalinp = ''
-    for e in get_numbers(inp):
-        if not e:
-            continue
-        newinp = e
-        try:
-            ee = ''.join(e.split())
-            if int(e) >= 0:
-                newinp = num2words(int(ee), lang=locale)
-        except ValueError:
-            try:
-                ee = ''.join(e.replace(',', '.').split())
-                if float(ee):
-                    newinp = num2words(float(ee), lang=locale)
-            except ValueError:
-                if ordinal_regex:
-                    matches = ordinal_regex.match(e)
-                    if matches:
-                        newinp = num2words(int(matches.group(1)), ordinal=True, lang=locale)
-
-        finalinp += newinp
-
-    return finalinp
-
-
 def maybe_normalize(value: str, mapping):
     for norm in mapping:
         if isinstance(norm[0], str):

diff --git a/tests/test_preprocessors.py b/tests/test_preprocessors.py
@@ -31,4 +31,7 @@
 ])
 def test_preprocessor(locale, client_id, sentence, expected):
     preprocessor = getattr(preprocessors, locale.replace('-', ''))
-    assert expected == preprocessor(client_id, preprocessors.common(sentence))
+    is_valid, sentence = preprocessors.common(sentence)
+    if not is_valid:
+        pytest.skip('not supported right now (see https://github.com/mozilla/CorporaCreator/pull/87#issuecomment-466296310 for more info)')
+    assert expected == preprocessor(client_id, sentence)