|
1 | 1 | # -*- coding: utf-8 -*- |
| 2 | +import pytest |
2 | 3 |
|
3 | 4 | from html_text import extract_text, parse_html |
4 | 5 |
|
5 | 6 |
|
6 | | -def test_extract_text(): |
| 7 | +@pytest.fixture(params=[{'guess_punct_space': True}, |
| 8 | + {'guess_punct_space': False}]) |
| 9 | +def all_options(request): |
| 10 | + return request.param |
| 11 | + |
| 12 | + |
| 13 | +def test_extract_text(all_options): |
7 | 14 | html = u'<html><style>.div {}</style><body><p>Hello, world!</body></html>' |
8 | | - assert extract_text(html) == u'Hello, world!' |
| 15 | + assert extract_text(html, **all_options) == u'Hello, world!' |
9 | 16 |
|
10 | 17 |
|
11 | | -def test_declared_encoding(): |
| 18 | +def test_declared_encoding(all_options): |
12 | 19 | html = (u'<?xml version="1.0" encoding="utf-8" ?>' |
13 | 20 | u'<html><style>.div {}</style>' |
14 | 21 | u'<body>Hello, world!</p></body></html>') |
15 | | - assert extract_text(html) == u'Hello, world!' |
| 22 | + assert extract_text(html, **all_options) == u'Hello, world!' |
16 | 23 |
|
17 | 24 |
|
18 | | -def test_empty(): |
19 | | - assert extract_text(u'') == '' |
| 25 | +def test_empty(all_options): |
| 26 | + assert extract_text(u'', **all_options) == '' |
20 | 27 |
|
21 | 28 |
|
22 | | -def test_extract_text_from_tree(): |
| 29 | +def test_extract_text_from_tree(all_options): |
23 | 30 | html = u'<html><style>.div {}</style><body><p>Hello, world!</body></html>' |
24 | 31 | tree = parse_html(html) |
25 | | - assert extract_text(tree) == u'Hello, world!' |
| 32 | + assert extract_text(tree, **all_options) == u'Hello, world!' |
| 33 | + |
| 34 | + |
| 35 | +def test_inline_tags_whitespace(all_options): |
| 36 | + html = u'<span>field</span><span>value of</span><span></span>' |
| 37 | + assert extract_text(html, **all_options) == u'field value of' |
| 38 | + |
| 39 | + |
| 40 | +def test_punct_whitespace(): |
| 41 | + html = u'<div><span>field</span>, and more</div>' |
| 42 | + assert extract_text(html, guess_punct_space=False) == u'field , and more' |
| 43 | + |
| 44 | + |
| 45 | +def test_punct_whitespace_preserved(): |
| 46 | + html = (u'<div><span>по</span><span>ле</span>, and , ' |
| 47 | + u'<span>more </span>!<span>now</div>a (<b>boo</b>)') |
| 48 | + assert (extract_text(html, guess_punct_space=True) == |
| 49 | + u'по ле, and , more ! now a (boo)') |
0 commit comments