diff --git a/README.rst b/README.rst index 4c24d79..19e14d1 100644 --- a/README.rst +++ b/README.rst @@ -25,6 +25,10 @@ How is html_text different from ``.xpath('//text()')`` from LXML or ``.get_text()`` from Beautiful Soup? Text extracted with ``html_text`` does not contain inline styles, javascript, comments and other text that is not normally visible to the users. +It normalizes whitespace, but is also smarter than ``.xpath('normalize-space()')``, +adding spaces around inline elements too +(which are often used as block elements in html markup), +and tries to avoid adding extra spaces for punctuation. Install diff --git a/html_text/html_text.py b/html_text/html_text.py index 532c3ac..56d220e 100644 --- a/html_text/html_text.py +++ b/html_text/html_text.py @@ -1,4 +1,6 @@ # -*- coding: utf-8 -*- +import re + import lxml import lxml.etree from lxml.html.clean import Cleaner @@ -39,10 +41,33 @@ def parse_html(html): return lxml.html.fromstring(html.encode('utf8'), parser=parser) -def selector_to_text(sel): +_whitespace = re.compile(r'\s+') +_has_trailing_whitespace = re.compile(r'\s$').search +_has_punct_after = re.compile(r'^[,:;.!?"\)]').search +_has_punct_before = re.compile(r'\($').search + + +def selector_to_text(sel, guess_punct_space=True): """ Convert a cleaned selector to text. + See html_text.extract_text docstring for description of the approach and options. 
""" - return sel.xpath('normalize-space()').extract_first('') + if guess_punct_space: + + def fragments(): + prev = None + for text in sel.xpath('//text()').extract(): + if prev is not None and (_has_trailing_whitespace(prev) + or (not _has_punct_after(text) and + not _has_punct_before(prev))): + yield ' ' + yield text + prev = text + + return _whitespace.sub(' ', ''.join(fragments()).strip()) + + else: + fragments = (x.strip() for x in sel.xpath('//text()').extract()) + return _whitespace.sub(' ', ' '.join(x for x in fragments if x)) def cleaned_selector(html): @@ -60,10 +85,18 @@ def cleaned_selector(html): return sel -def extract_text(html, encoding='utf8'): +def extract_text(html, guess_punct_space=True): """ Convert html to text. + Almost the same as normalize-space xpath, but this also + adds spaces between inline elements (like ) which are + often used as block elements in html markup. + + When guess_punct_space is True (default), no extra whitespace is added + for punctuation. This has a slight (around 10%) performance overhead + and is just a heuristic. html should be a unicode string or an already parsed lxml.html element. """ - return selector_to_text(cleaned_selector(html)) + sel = cleaned_selector(html) + return selector_to_text(sel, guess_punct_space=guess_punct_space) diff --git a/tests/test_html_text.py b/tests/test_html_text.py index eedba9a..1205da7 100644 --- a/tests/test_html_text.py +++ b/tests/test_html_text.py @@ -1,25 +1,49 @@ # -*- coding: utf-8 -*- +import pytest from html_text import extract_text, parse_html -def test_extract_text(): +@pytest.fixture(params=[{'guess_punct_space': True}, + {'guess_punct_space': False}]) +def all_options(request): + return request.param + + +def test_extract_text(all_options): html = u'

Hello, world!' - assert extract_text(html) == u'Hello, world!' + assert extract_text(html, **all_options) == u'Hello, world!' -def test_declared_encoding(): +def test_declared_encoding(all_options): html = (u'' u'' u'Hello, world!

') - assert extract_text(html) == u'Hello, world!' + assert extract_text(html, **all_options) == u'Hello, world!' -def test_empty(): - assert extract_text(u'') == '' +def test_empty(all_options): + assert extract_text(u'', **all_options) == '' -def test_extract_text_from_tree(): +def test_extract_text_from_tree(all_options): html = u'

Hello, world!' tree = parse_html(html) - assert extract_text(tree) == u'Hello, world!' + assert extract_text(tree, **all_options) == u'Hello, world!' + + +def test_inline_tags_whitespace(all_options): + html = u'fieldvalue of' + assert extract_text(html, **all_options) == u'field value of' + + +def test_punct_whitespace(): + html = u'

field, and more
' + assert extract_text(html, guess_punct_space=False) == u'field , and more' + + +def test_punct_whitespace_preserved(): + html = (u'
поле, and , ' + u'more !now
a (boo)') + assert (extract_text(html, guess_punct_space=True) == + u'по ле, and , more ! now a (boo)')