Skip to content

Commit cf48523

Browse files
authored
Merge pull request #2 from TeamHG-Memex/inline-tags-spaces
Fix unwanted joins for inline tags
2 parents c7ebb57 + 1fb2ec4 commit cf48523

File tree

3 files changed

+73
-12
lines changed

3 files changed

+73
-12
lines changed

README.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,10 @@ How is html_text different from ``.xpath('//text()')`` from LXML
2525
or ``.get_text()`` from Beautiful Soup?
2626
Text extracted with ``html_text`` does not contain inline styles,
2727
javascript, comments and other text that is not normally visible to the users.
28+
It normalizes whitespace, but is also smarter than ``.xpath('normalize-space()')``,
29+
adding spaces around inline elements too
30+
(which are often used as block elements in html markup),
31+
and tries to avoid adding extra spaces for punctuation.
2832

2933
Apart from just getting text from the page (e.g. for display or search),
3034
one intended usage of this library is for machine learning (feature extraction).

html_text/html_text.py

Lines changed: 37 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
11
# -*- coding: utf-8 -*-
2+
import re
3+
24
import lxml
35
import lxml.etree
46
from lxml.html.clean import Cleaner
@@ -39,10 +41,33 @@ def parse_html(html):
3941
return lxml.html.fromstring(html.encode('utf8'), parser=parser)
4042

4143

42-
def selector_to_text(sel):
44+
_whitespace = re.compile(r'\s+')
45+
_has_trailing_whitespace = re.compile(r'\s$').search
46+
_has_punct_after = re.compile(r'^[,:;.!?"\)]').search
47+
_has_punct_before = re.compile(r'\($').search
48+
49+
50+
def selector_to_text(sel, guess_punct_space=True):
4351
""" Convert a cleaned selector to text.
52+
See html_text.extract_text docstring for description of the approach and options.
4453
"""
45-
return sel.xpath('normalize-space()').extract_first('')
54+
if guess_punct_space:
55+
56+
def fragments():
57+
prev = None
58+
for text in sel.xpath('//text()').extract():
59+
if prev is not None and (_has_trailing_whitespace(prev)
60+
or (not _has_punct_after(text) and
61+
not _has_punct_before(prev))):
62+
yield ' '
63+
yield text
64+
prev = text
65+
66+
return _whitespace.sub(' ', ''.join(fragments()).strip())
67+
68+
else:
69+
fragments = (x.strip() for x in sel.xpath('//text()').extract())
70+
return _whitespace.sub(' ', ' '.join(x for x in fragments if x))
4671

4772

4873
def cleaned_selector(html):
@@ -60,10 +85,18 @@ def cleaned_selector(html):
6085
return sel
6186

6287

63-
def extract_text(html, encoding='utf8'):
88+
def extract_text(html, guess_punct_space=True):
6489
"""
6590
Convert html to text.
91+
Almost the same as normalize-space xpath, but this also
92+
adds spaces between inline elements (like <span>) which are
93+
often used as block elements in html markup.
94+
95+
When guess_punct_space is True (default), no extra whitespace is added
96+
for punctuation. This has a slight (around 10%) performance overhead
97+
and is just a heuristic.
6698
6799
html should be a unicode string or an already parsed lxml.html element.
68100
"""
69-
return selector_to_text(cleaned_selector(html))
101+
sel = cleaned_selector(html)
102+
return selector_to_text(sel, guess_punct_space=guess_punct_space)

tests/test_html_text.py

Lines changed: 32 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,25 +1,49 @@
11
# -*- coding: utf-8 -*-
2+
import pytest
23

34
from html_text import extract_text, parse_html
45

56

6-
def test_extract_text():
7+
@pytest.fixture(params=[{'guess_punct_space': True},
8+
{'guess_punct_space': False}])
9+
def all_options(request):
10+
return request.param
11+
12+
13+
def test_extract_text(all_options):
714
html = u'<html><style>.div {}</style><body><p>Hello, world!</body></html>'
8-
assert extract_text(html) == u'Hello, world!'
15+
assert extract_text(html, **all_options) == u'Hello, world!'
916

1017

11-
def test_declared_encoding():
18+
def test_declared_encoding(all_options):
1219
html = (u'<?xml version="1.0" encoding="utf-8" ?>'
1320
u'<html><style>.div {}</style>'
1421
u'<body>Hello, world!</p></body></html>')
15-
assert extract_text(html) == u'Hello, world!'
22+
assert extract_text(html, **all_options) == u'Hello, world!'
1623

1724

18-
def test_empty():
19-
assert extract_text(u'') == ''
25+
def test_empty(all_options):
26+
assert extract_text(u'', **all_options) == ''
2027

2128

22-
def test_extract_text_from_tree():
29+
def test_extract_text_from_tree(all_options):
2330
html = u'<html><style>.div {}</style><body><p>Hello, world!</body></html>'
2431
tree = parse_html(html)
25-
assert extract_text(tree) == u'Hello, world!'
32+
assert extract_text(tree, **all_options) == u'Hello, world!'
33+
34+
35+
def test_inline_tags_whitespace(all_options):
36+
html = u'<span>field</span><span>value of</span><span></span>'
37+
assert extract_text(html, **all_options) == u'field value of'
38+
39+
40+
def test_punct_whitespace():
41+
html = u'<div><span>field</span>, and more</div>'
42+
assert extract_text(html, guess_punct_space=False) == u'field , and more'
43+
44+
45+
def test_punct_whitespace_preserved():
46+
html = (u'<div><span>по</span><span>ле</span>, and , '
47+
u'<span>more </span>!<span>now</div>a (<b>boo</b>)')
48+
assert (extract_text(html, guess_punct_space=True) ==
49+
u'по ле, and , more ! now a (boo)')

0 commit comments

Comments
 (0)