guess_punct_space: remove whitespace before punctuation

lopuhin · lopuhin · commit e8333574d19f · 2017-05-29T12:34:18.000+03:00
This is similar to webstruct.utils.smart_joins (https://github.com/scrapinghub/webstruct/blob/5a3f39e2ec78a04ca021a12dff58f66686d86251/webstruct/utils.py#L61), but it is applied only at tag boundaries. This mode is only slightly slower than the default.
diff --git a/html_text/html_text.py b/html_text/html_text.py
@@ -41,18 +41,36 @@ def parse_html(html):
     return lxml.html.fromstring(html.encode('utf8'), parser=parser)
 
 
-_whitespace = re.compile('\s+')
+_whitespace = re.compile(r'\s+')
+_trailing_whitespace = re.compile(r'\s$')
+_punct_after = re.compile(r'[,:;.!?"\)]')
+_punct_before = re.compile(r'[\(]')
 
 
-def selector_to_text(sel):
+def selector_to_text(sel, guess_punct_space=False):
     """ Convert a cleaned selector to text.
     Almost the same as xpath normalize-space, but this also
     adds spaces between inline elements (like <span>) which are
     often used as block elements in html markup.
     """
-    fragments = (_whitespace.sub(' ', x.strip())
-                 for x in sel.xpath('//text()').extract())
-    return ' '.join(x for x in fragments if x)
+    if guess_punct_space:
+
+        def fragments():
+            prev = None
+            for text in sel.xpath('//text()').extract():
+                if prev is not None and (_trailing_whitespace.search(prev)
+                                         or (not _punct_after.match(text) and
+                                             not _punct_before.match(prev))):
+                    yield ' '
+                yield text
+                prev = text
+
+        return _whitespace.sub(' ', ''.join(fragments()).strip())
+
+    else:
+        fragments = (_whitespace.sub(' ', x.strip())
+                     for x in sel.xpath('//text()').extract())
+        return ' '.join(x for x in fragments if x)
 
 
 def cleaned_selector(html):
@@ -70,10 +88,11 @@ def cleaned_selector(html):
     return sel
 
 
-def extract_text(html, encoding='utf8'):
+def extract_text(html, guess_punct_space=False):
     """
     Convert html to text.
 
     html should be a unicode string or an already parsed lxml.html element.
     """
-    return selector_to_text(cleaned_selector(html))
+    sel = cleaned_selector(html)
+    return selector_to_text(sel, guess_punct_space=guess_punct_space)
diff --git a/tests/test_html_text.py b/tests/test_html_text.py
@@ -1,30 +1,49 @@
 # -*- coding: utf-8 -*-
+import pytest
 
 from html_text import extract_text, parse_html
 
 
-def test_extract_text():
+@pytest.fixture(params=[{'guess_punct_space': True},
+                        {'guess_punct_space': False}])
+def all_options(request):
+    return request.param
+
+
+def test_extract_text(all_options):
     html = u'<html><style>.div {}</style><body><p>Hello,   world!</body></html>'
-    assert extract_text(html) == u'Hello, world!'
+    assert extract_text(html, **all_options) == u'Hello, world!'
 
 
-def test_declared_encoding():
+def test_declared_encoding(all_options):
     html = (u'<?xml version="1.0" encoding="utf-8" ?>'
             u'<html><style>.div {}</style>'
             u'<body>Hello,   world!</p></body></html>')
-    assert extract_text(html) == u'Hello, world!'
+    assert extract_text(html, **all_options) == u'Hello, world!'
 
 
-def test_empty():
-    assert extract_text(u'') == ''
+def test_empty(all_options):
+    assert extract_text(u'', **all_options) == ''
 
 
-def test_extract_text_from_tree():
+def test_extract_text_from_tree(all_options):
     html = u'<html><style>.div {}</style><body><p>Hello,   world!</body></html>'
     tree = parse_html(html)
-    assert extract_text(tree) == u'Hello, world!'
+    assert extract_text(tree, **all_options) == u'Hello, world!'
+
+
+def test_inline_tags_whitespace(all_options):
+    html = u'<span>field</span><span>value  of</span><span></span>'
+    assert extract_text(html, **all_options) == u'field value of'
+
+
+def test_punct_whitespace():
+    html = u'<div><span>field</span>, and more</div>'
+    assert extract_text(html) == u'field , and more'
 
 
-def test_inline_tags_whitespace():
-    html = u'<span>field</span><span>value</span>'
-    assert extract_text(html) == u'field value'
+def test_punct_whitespace_preserved():
+    html = (u'<div><span>по</span><span>ле</span>, and  ,  '
+            u'<span>more </span>!<span>now</div>(<b>boo</b>)')
+    assert (extract_text(html, guess_punct_space=True) ==
+            u'по ле, and , more ! now (boo)')