Skip to content

Commit e602972

Browse files
authored
fix: reformat chipper table element to match standard format (#279)
This PR addresses [CORE-2485](https://unstructured-ai.atlassian.net/browse/CORE-2485): - the table's HTML representation is now stored in the `text_as_html` attribute - the `text` attribute now contains only the text, without any HTML tags - adds a new utility function to strip tags from HTML strings [CORE-2485]: https://unstructured-ai.atlassian.net/browse/CORE-2485?atlOrigin=eyJpIjoiNWRkNTljNzYxNjVmNDY3MDlhMDU5Y2ZhYzA5YTRkZjUiLCJwIjoiZ2l0aHViLWNvbS1KU1cifQ
1 parent 76f9900 commit e602972

File tree

6 files changed

+68
-6
lines changed

6 files changed

+68
-6
lines changed

CHANGELOG.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,10 @@
1-
## 0.7.11-dev3
1+
## 0.7.11
22

33
* chore: remove logger info for chipper since it's private
44
* fix: update broken slack invite link in chipper logger info
55
* enhancement: Improve error message when # images extracted doesn't match # page layouts.
66
* fix: use automatic mixed precision on GPU for Chipper
7+
* fix: chipper Table elements now match other layout models' Table element format: html representation is stored in `text_as_html` attribute and `text` attribute stores text without html tags
78

89
## 0.7.10
910

test_unstructured_inference/models/test_chippermodel.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -243,4 +243,5 @@ def test_run_chipper_v2():
243243
img = Image.open("sample-docs/easy_table.jpg")
244244
elements = model(img)
245245
tables = [el for el in elements if el.type == "Table"]
246-
assert tables
246+
assert all(table.text_as_html.startswith("<table>") for table in tables)
247+
assert all("<table>" not in table.text for table in tables)

test_unstructured_inference/test_utils.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
LazyEvaluateInfo,
1515
annotate_layout_elements,
1616
pad_image_with_background_color,
17+
strip_tags,
1718
write_image,
1819
)
1920

@@ -147,3 +148,17 @@ def test_pad_image_with_background_color(mock_pil_image):
147148
def test_pad_image_with_invalid_input(mock_pil_image):
148149
with pytest.raises(ValueError, match="Can not pad an image with negative space!"):
149150
pad_image_with_background_color(mock_pil_image, -1)
151+
152+
153+
@pytest.mark.parametrize(
    ("html", "text"),
    [
        ("<table>Table</table>", "Table"),
        # legacy character references written without the terminating semicolon
        ("<table>y&ltx, x&gtz</table>", "y<x, x>z"),
        # properly terminated character reference
        ("<td>a &amp; b</td>", "a & b"),
        # tag with attributes and no closing tag
        ("<table format=foo>Table", "Table"),
        # nested tags: only the text nodes survive, in document order
        ("<table><tr><td>a</td><td>b</td></tr></table>", "ab"),
        # input without any markup is returned unchanged
        ("plain text", "plain text"),
        # empty input yields empty output
        ("", ""),
    ],
)
def test_strip_tags(html, text):
    """strip_tags drops every tag and decodes character references."""
    assert strip_tags(html) == text
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.7.11-dev3" # pragma: no cover
1+
__version__ = "0.7.11" # pragma: no cover

unstructured_inference/models/chipper.py

Lines changed: 20 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,10 @@
1818
from unstructured_inference.inference.elements import Rectangle
1919
from unstructured_inference.inference.layoutelement import LayoutElement
2020
from unstructured_inference.logger import logger
21-
from unstructured_inference.models.unstructuredmodel import UnstructuredElementExtractionModel
22-
from unstructured_inference.utils import LazyDict
21+
from unstructured_inference.models.unstructuredmodel import (
22+
UnstructuredElementExtractionModel,
23+
)
24+
from unstructured_inference.utils import LazyDict, strip_tags
2325

2426
MODEL_TYPES: Dict[Optional[str], Union[LazyDict, dict]] = {
2527
"chipperv1": {
@@ -140,7 +142,22 @@ def initialize(
140142
def predict(self, image) -> List[LayoutElement]:
141143
"""Do inference using the wrapped model."""
142144
tokens, decoder_cross_attentions = self.predict_tokens(image)
143-
elements = self.postprocess(image, tokens, decoder_cross_attentions)
145+
elements = self.format_table_elements(
146+
self.postprocess(image, tokens, decoder_cross_attentions),
147+
)
148+
return elements
149+
150+
@staticmethod
def format_table_elements(elements):
    """Make chipper elements match the element format of other layout models.

    For every element whose ``text`` contains markup:

    - the original markup is copied to the ``text_as_html`` attribute
    - ``text`` is replaced by the same content with the html tags stripped

    Elements whose ``text`` is ``None`` or contains no markup are left
    untouched, so plain-text elements do not receive a ``text_as_html`` value
    that is not html.

    NOTE(review): assumes no caller relies on ``text_as_html`` being set on
    elements without markup -- confirm against downstream consumers.

    Args:
        elements: iterable of layout elements exposing a ``text`` attribute.

    Returns:
        The same ``elements`` object; the elements are modified in place.
    """
    for element in elements:
        raw_text = element.text
        if raw_text is None:
            # Nothing to strip; feeding None to the html parser would raise.
            continue
        plain_text = strip_tags(raw_text)
        if plain_text != raw_text:
            # Stripping changed the text, so the original carried markup.
            element.text_as_html = raw_text
        element.text = plain_text
    return elements
145162

146163
def predict_tokens(

unstructured_inference/utils.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
import os
22
from collections.abc import Mapping
3+
from html.parser import HTMLParser
4+
from io import StringIO
35
from typing import TYPE_CHECKING, Any, Callable, Hashable, Iterable, Iterator, Union
46

57
import cv2
@@ -154,3 +156,29 @@ def pad_image_with_background_color(
154156
new = Image.new(image.mode, (width + pad * 2, height + pad * 2), background_color)
155157
new.paste(image, (pad, pad))
156158
return new
159+
160+
161+
class MLStripper(HTMLParser):
    """Simple markup-language stripper that collects only the text of a document.

    Feed markup with ``feed()``, call ``close()`` to flush any buffered input,
    then read the accumulated tag-free text with ``get_data()``.
    """

    def __init__(self):
        # HTMLParser.__init__ already calls reset(), and the ``strict`` flag was
        # removed from HTMLParser in Python 3.5, so neither is set here.
        super().__init__(convert_charrefs=True)

    def reset(self):
        """Reset the parser state and discard any text collected so far."""
        super().reset()
        self.text = StringIO()

    def handle_data(self, d):
        """Append a run of text (character references already decoded) to the buffer."""
        self.text.write(d)

    def get_data(self):
        """Return all text collected so far, without any tags."""
        return self.text.getvalue()


def strip_tags(html: str) -> str:
    """Return ``html`` with all tags removed and character references decoded.

    Args:
        html: the markup string to strip.

    Returns:
        The text content of ``html`` in document order.
    """
    stripper = MLStripper()
    stripper.feed(html)
    # With convert_charrefs enabled the parser holds back trailing text that
    # contains an unterminated "&..." reference until close() is called;
    # without this call such text would be silently dropped.
    stripper.close()
    return stripper.get_data()

0 commit comments

Comments
 (0)