Skip to content

Commit e833357

Browse files
committed
guess_punct_space: remove whitespace before punct
This is similar to webstruct.utils.smart_joins (https://github.com/scrapinghub/webstruct/blob/5a3f39e2ec78a04ca021a12dff58f66686d86251/webstruct/utils.py#L61), but it is applied only at tag boundaries. This mode is only slightly slower than the default.
1 parent 43f1bd4 commit e833357

File tree

2 files changed

+56
-18
lines changed

2 files changed

+56
-18
lines changed

html_text/html_text.py

Lines changed: 26 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -41,18 +41,36 @@ def parse_html(html):
4141
return lxml.html.fromstring(html.encode('utf8'), parser=parser)
4242

4343

44-
_whitespace = re.compile('\s+')
44+
_whitespace = re.compile(r'\s+')
45+
_trailing_whitespace = re.compile(r'\s$')
46+
_punct_after = re.compile(r'[,:;.!?"\)]')
47+
_punct_before = re.compile(r'[\(]')
4548

4649

47-
def selector_to_text(sel):
50+
def selector_to_text(sel, guess_punct_space=False):
4851
""" Convert a cleaned selector to text.
4952
Almost the same as xpath normalize-space, but this also
5053
adds spaces between inline elements (like <span>) which are
5154
often used as block elements in html markup.
5255
"""
53-
fragments = (_whitespace.sub(' ', x.strip())
54-
for x in sel.xpath('//text()').extract())
55-
return ' '.join(x for x in fragments if x)
56+
if guess_punct_space:
57+
58+
def fragments():
59+
prev = None
60+
for text in sel.xpath('//text()').extract():
61+
if prev is not None and (_trailing_whitespace.search(prev)
62+
or (not _punct_after.match(text) and
63+
not _punct_before.match(prev))):
64+
yield ' '
65+
yield text
66+
prev = text
67+
68+
return _whitespace.sub(' ', ''.join(fragments()).strip())
69+
70+
else:
71+
fragments = (_whitespace.sub(' ', x.strip())
72+
for x in sel.xpath('//text()').extract())
73+
return ' '.join(x for x in fragments if x)
5674

5775

5876
def cleaned_selector(html):
@@ -70,10 +88,11 @@ def cleaned_selector(html):
7088
return sel
7189

7290

73-
def extract_text(html, encoding='utf8'):
91+
def extract_text(html, guess_punct_space=False):
7492
"""
7593
Convert html to text.
7694
7795
html should be a unicode string or an already parsed lxml.html element.
7896
"""
79-
return selector_to_text(cleaned_selector(html))
97+
sel = cleaned_selector(html)
98+
return selector_to_text(sel, guess_punct_space=guess_punct_space)

tests/test_html_text.py

Lines changed: 30 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,30 +1,49 @@
11
# -*- coding: utf-8 -*-
2+
import pytest
23

34
from html_text import extract_text, parse_html
45

56

6-
def test_extract_text():
7+
@pytest.fixture(params=[{'guess_punct_space': True},
8+
{'guess_punct_space': False}])
9+
def all_options(request):
10+
return request.param
11+
12+
13+
def test_extract_text(all_options):
714
html = u'<html><style>.div {}</style><body><p>Hello, world!</body></html>'
8-
assert extract_text(html) == u'Hello, world!'
15+
assert extract_text(html, **all_options) == u'Hello, world!'
916

1017

11-
def test_declared_encoding():
18+
def test_declared_encoding(all_options):
1219
html = (u'<?xml version="1.0" encoding="utf-8" ?>'
1320
u'<html><style>.div {}</style>'
1421
u'<body>Hello, world!</p></body></html>')
15-
assert extract_text(html) == u'Hello, world!'
22+
assert extract_text(html, **all_options) == u'Hello, world!'
1623

1724

18-
def test_empty():
19-
assert extract_text(u'') == ''
25+
def test_empty(all_options):
26+
assert extract_text(u'', **all_options) == ''
2027

2128

22-
def test_extract_text_from_tree():
29+
def test_extract_text_from_tree(all_options):
2330
html = u'<html><style>.div {}</style><body><p>Hello, world!</body></html>'
2431
tree = parse_html(html)
25-
assert extract_text(tree) == u'Hello, world!'
32+
assert extract_text(tree, **all_options) == u'Hello, world!'
33+
34+
35+
def test_inline_tags_whitespace(all_options):
36+
html = u'<span>field</span><span>value of</span><span></span>'
37+
assert extract_text(html, **all_options) == u'field value of'
38+
39+
40+
def test_punct_whitespace():
41+
html = u'<div><span>field</span>, and more</div>'
42+
assert extract_text(html) == u'field , and more'
2643

2744

28-
def test_inline_tags_whitespace():
29-
html = u'<span>field</span><span>value</span>'
30-
assert extract_text(html) == u'field value'
45+
def test_punct_whitespace_preserved():
46+
html = (u'<div><span>по</span><span>ле</span>, and , '
47+
u'<span>more </span>!<span>now</div>(<b>boo</b>)')
48+
assert (extract_text(html, guess_punct_space=True) ==
49+
u'по ле, and , more ! now (boo)')

0 commit comments

Comments
 (0)