diff --git a/README.rst b/README.rst
index 4c24d79..19e14d1 100644
--- a/README.rst
+++ b/README.rst
@@ -25,6 +25,10 @@ How is html_text different from ``.xpath('//text()')`` from LXML
or ``.get_text()`` from Beautiful Soup?
Text extracted with ``html_text`` does not contain inline styles,
javascript, comments and other text that is not normally visible to the users.
+It normalizes whitespace, but is also smarter than ``.xpath('normalize-space()')``,
+adding spaces around inline elements too
+(which are often used as block elements in html markup),
+and tries to avoid adding extra spaces for punctuation.
Install
diff --git a/html_text/html_text.py b/html_text/html_text.py
index 532c3ac..56d220e 100644
--- a/html_text/html_text.py
+++ b/html_text/html_text.py
@@ -1,4 +1,6 @@
# -*- coding: utf-8 -*-
+import re
+
import lxml
import lxml.etree
from lxml.html.clean import Cleaner
@@ -39,10 +41,33 @@ def parse_html(html):
return lxml.html.fromstring(html.encode('utf8'), parser=parser)
-def selector_to_text(sel):
+_whitespace = re.compile(r'\s+')
+_has_trailing_whitespace = re.compile(r'\s$').search
+_has_punct_after = re.compile(r'^[,:;.!?"\)]').search
+_has_punct_before = re.compile(r'\($').search
+
+
+def selector_to_text(sel, guess_punct_space=True):
""" Convert a cleaned selector to text.
+ See html_text.extract_text docstring for description of the approach and options.
"""
- return sel.xpath('normalize-space()').extract_first('')
+ if guess_punct_space:
+
+ def fragments():
+ prev = None
+ for text in sel.xpath('//text()').extract():
+ if prev is not None and (_has_trailing_whitespace(prev)
+ or (not _has_punct_after(text) and
+ not _has_punct_before(prev))):
+ yield ' '
+ yield text
+ prev = text
+
+ return _whitespace.sub(' ', ''.join(fragments()).strip())
+
+ else:
+ fragments = (x.strip() for x in sel.xpath('//text()').extract())
+ return _whitespace.sub(' ', ' '.join(x for x in fragments if x))
def cleaned_selector(html):
@@ -60,10 +85,18 @@ def cleaned_selector(html):
return sel
-def extract_text(html, encoding='utf8'):
+def extract_text(html, guess_punct_space=True):
"""
Convert html to text.
+ Almost the same as normalize-space xpath, but this also
+    adds spaces between inline elements (like <span>) which are
+ often used as block elements in html markup.
+
+ When guess_punct_space is True (default), no extra whitespace is added
+ for punctuation. This has a slight (around 10%) performance overhead
+ and is just a heuristic.
html should be a unicode string or an already parsed lxml.html element.
"""
- return selector_to_text(cleaned_selector(html))
+ sel = cleaned_selector(html)
+ return selector_to_text(sel, guess_punct_space=guess_punct_space)
diff --git a/tests/test_html_text.py b/tests/test_html_text.py
index eedba9a..1205da7 100644
--- a/tests/test_html_text.py
+++ b/tests/test_html_text.py
@@ -1,25 +1,49 @@
# -*- coding: utf-8 -*-
+import pytest
from html_text import extract_text, parse_html
-def test_extract_text():
+@pytest.fixture(params=[{'guess_punct_space': True},
+ {'guess_punct_space': False}])
+def all_options(request):
+ return request.param
+
+
+def test_extract_text(all_options):
     html = u'<html><body><p> Hello,  world!</p></body></html>'
- assert extract_text(html) == u'Hello, world!'
+ assert extract_text(html, **all_options) == u'Hello, world!'
-def test_declared_encoding():
+def test_declared_encoding(all_options):
     html = (u'<?xml version="1.0" encoding="utf-8" ?>'
             u'<html><head><title>'
             u'Hello,  world!</title></head></html>')
     tree = parse_html(html)
-    assert extract_text(tree) == u'Hello, world!'
+    assert extract_text(tree, **all_options) == u'Hello, world!'
+
+
+def test_inline_tags_whitespace(all_options):
+    html = u'<span>field</span><span>value of</span>'
+    assert extract_text(html, **all_options) == u'field value of'
+
+
+def test_punct_whitespace():
+    html = u'