Skip to content

Commit

Permalink
ignore digit related sentences
Browse files Browse the repository at this point in the history
  • Loading branch information
nicolaspanel committed Feb 26, 2019
1 parent 7f37c78 commit 85216e5
Show file tree
Hide file tree
Showing 4 changed files with 5 additions and 36 deletions.
1 change: 0 additions & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,6 @@ setup_requires = pyscaffold>=3.1a0,<3.2a0
# Add here dependencies of your project (semicolon/line-separated), e.g.
install_requires = pandas
swifter
num2words==0.5.9
# The usage of test_requires is discouraged, see `Dependency Management` docs
# tests_require = pytest; pytest-cov
# Require a specific Python version, e.g. Python 2.7 or >= 3.4
Expand Down
4 changes: 1 addition & 3 deletions src/corporacreator/preprocessors/fr.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
import re

from corporacreator.utils import maybe_normalize, replace_numbers, FIND_MULTIPLE_SPACES_REG
from corporacreator.utils import maybe_normalize, FIND_MULTIPLE_SPACES_REG

# French ordinal written with digits: group 1 is the number, group 2 the suffix
# ("1er", "1ère", "2e", "2ème", "3ieme", ...).
# The suffixes are spelled out as an alternation (longest first). A character
# class such as `[ème|...]+` would accept any run of those letters, including
# "|", and so would treat e.g. "2m" as an ordinal.
# NOTE: the pattern is not anchored at the end, so a token that merely starts
# with a suffix (e.g. "5euros") still matches on its leading "e".
FIND_ORDINAL_REG = re.compile(
    r"(\d+)(ième|ieme|ière|iere|ier|ème|éme|eme|ère|ere|er|re|e)"
)

SPELLED_ACRONYMS = {
'ANPE',
Expand Down Expand Up @@ -68,7 +67,6 @@ def fr(client_id, sentence):
(str): Cleaned up sentence. Returning None or a `str` of whitespace flags the sentence as invalid.
"""
text = maybe_normalize(sentence, mapping=FR_NORMALIZATIONS + [REPLACE_SPELLED_ACRONYMS])
text = replace_numbers(text, locale='fr', ordinal_regex=FIND_ORDINAL_REG)
# TODO: restore this once we are clear on which punctuation marks should be kept or removed
# text = FIND_PUNCTUATIONS_REG.sub(' ', text)
text = FIND_MULTIPLE_SPACES_REG.sub(' ', text)
Expand Down
31 changes: 0 additions & 31 deletions src/corporacreator/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,41 +4,10 @@
from num2words import num2words


# Number-like spans, tried in this order:
#   1. digits, an optional comma, an optional no-break space (U+00A0), digits;
#   2. digits immediately followed by word characters;
#   3. one or more digits (the group keeps only the last digit matched).
NUMS_REGEX = re.compile(r"(\d+,?\u00A0?\d+)|(\d+\w+)|(\d)+")
# Runs of two or more whitespace characters.
FIND_MULTIPLE_SPACES_REG = re.compile(r'\s{2,}')
# Any single character out of the listed punctuation marks.
FIND_PUNCTUATIONS_REG = re.compile(r"[/°\-,;!?.()\[\]*…—«»]")


def get_numbers(text):
    """Split ``text`` around every match of ``NUMS_REGEX``.

    Because the pattern contains capturing groups, the returned list
    interleaves the text between matches with the captured groups; a group
    that took no part in a given match shows up as ``None``.
    """
    pieces = NUMS_REGEX.split(text)
    return pieces


def replace_numbers(inp: str, locale: str, ordinal_regex: Pattern = None):
    """Spell out the numbers found in ``inp`` with ``num2words``.

    The text is cut into pieces by ``get_numbers``. Each piece is tried, in
    order, as an integer, as a decimal number (a comma is accepted as the
    decimal separator) and finally as an ordinal through ``ordinal_regex``.
    A piece that fits none of these is copied through unchanged.

    Args:
        inp: Text to process.
        locale: Language code passed to ``num2words`` as ``lang``.
        ordinal_regex: Optional compiled pattern whose first group captures
            the digits of an ordinal. When omitted, ordinals are left as is.

    Returns:
        str: ``inp`` with every recognised number replaced by words.
    """
    pieces = []
    for token in get_numbers(inp):
        # The split yields None / '' for groups that did not take part in a
        # match and for empty gaps between matches.
        if not token:
            continue
        spoken = token
        # Drop inner whitespace (including no-break spaces used as thousands
        # separators) before parsing, so "1 000" is read as the integer 1000
        # rather than falling through to the decimal branch.
        compact = ''.join(token.split())
        try:
            spoken = num2words(int(compact), lang=locale)
        except ValueError:
            try:
                # Convert whenever the piece parses as a float; a value of
                # zero ("0,0") is a number like any other.
                value = float(compact.replace(',', '.'))
                spoken = num2words(value, lang=locale)
            except ValueError:
                if ordinal_regex:
                    found = ordinal_regex.match(token)
                    if found:
                        spoken = num2words(int(found.group(1)), ordinal=True, lang=locale)
        pieces.append(spoken)

    return ''.join(pieces)


def maybe_normalize(value: str, mapping):
for norm in mapping:
if isinstance(norm[0], str):
Expand Down
5 changes: 4 additions & 1 deletion tests/test_preprocessors.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,4 +31,7 @@
])
def test_preprocessor(locale, client_id, sentence, expected):
preprocessor = getattr(preprocessors, locale.replace('-', ''))
assert expected == preprocessor(client_id, preprocessors.common(sentence))
is_valid, sentence = preprocessors.common(sentence)
if not is_valid:
pytest.skip('not supported right now (see https://github.com/mozilla/CorporaCreator/pull/87#issuecomment-466296310 for more info)')
assert expected == preprocessor(client_id, sentence)

0 comments on commit 85216e5

Please sign in to comment.