fix: reformat chipper table element to match standard format (#279)

badGarnet · web-flow · commit e602972e05e9 · 2023-11-06T18:05:36.000Z
This PR addresses [CORE-2485](https://unstructured-ai.atlassian.net/browse/CORE-2485): - the table's HTML representation is now stored in the `text_as_html` attribute - the `text` attribute now contains only the text, without any HTML tags - adds a new utility function, `strip_tags`, that strips tags from HTML strings [CORE-2485]: https://unstructured-ai.atlassian.net/browse/CORE-2485?atlOrigin=eyJpIjoiNWRkNTljNzYxNjVmNDY3MDlhMDU5Y2ZhYzA5YTRkZjUiLCJwIjoiZ2l0aHViLWNvbS1KU1cifQ
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,9 +1,10 @@
-## 0.7.11-dev3
+## 0.7.11
 
 * chore: remove logger info for chipper since its private
 * fix: update broken slack invite link in chipper logger info
 * enhancement: Improve error message when # images extracted doesn't match # page layouts.
 * fix: use automatic mixed precision on GPU for Chipper
+* fix: chipper Table elements now match other layout models' Table element format: html representation is stored in `text_as_html` attribute and `text` attribute stores text without html tags
 
 ## 0.7.10
 
diff --git a/test_unstructured_inference/models/test_chippermodel.py b/test_unstructured_inference/models/test_chippermodel.py
@@ -243,4 +243,5 @@ def test_run_chipper_v2():
     img = Image.open("sample-docs/easy_table.jpg")
     elements = model(img)
     tables = [el for el in elements if el.type == "Table"]
-    assert tables
+    assert all(table.text_as_html.startswith("<table>") for table in tables)
+    assert all("<table>" not in table.text for table in tables)
diff --git a/test_unstructured_inference/test_utils.py b/test_unstructured_inference/test_utils.py
@@ -14,6 +14,7 @@
     LazyEvaluateInfo,
     annotate_layout_elements,
     pad_image_with_background_color,
+    strip_tags,
     write_image,
 )
 
@@ -147,3 +148,17 @@ def test_pad_image_with_background_color(mock_pil_image):
 def test_pad_image_with_invalid_input(mock_pil_image):
     with pytest.raises(ValueError, match="Can not pad an image with negative space!"):
         pad_image_with_background_color(mock_pil_image, -1)
+
+
+@pytest.mark.parametrize(
+    ("markup", "expected"),
+    [
+        # a matching open/close tag pair around plain text
+        ("<table>Table</table>", "Table"),
+        # character references (written here without the trailing ";") are decoded
+        ("<table>y&ltx, x&gtz</table>", "y<x, x>z"),
+        # an opening tag that carries an attribute and is never closed
+        ("<table format=foo>Table", "Table"),
+    ],
+)
+def test_strip_tags(markup, expected):
+    """strip_tags returns only the text content of the given markup"""
+    stripped = strip_tags(markup)
+    assert stripped == expected
diff --git a/unstructured_inference/__version__.py b/unstructured_inference/__version__.py
@@ -1 +1 @@
-__version__ = "0.7.11-dev3"  # pragma: no cover
+__version__ = "0.7.11"  # pragma: no cover
diff --git a/unstructured_inference/models/chipper.py b/unstructured_inference/models/chipper.py
@@ -18,8 +18,10 @@
 from unstructured_inference.inference.elements import Rectangle
 from unstructured_inference.inference.layoutelement import LayoutElement
 from unstructured_inference.logger import logger
-from unstructured_inference.models.unstructuredmodel import UnstructuredElementExtractionModel
-from unstructured_inference.utils import LazyDict
+from unstructured_inference.models.unstructuredmodel import (
+    UnstructuredElementExtractionModel,
+)
+from unstructured_inference.utils import LazyDict, strip_tags
 
 MODEL_TYPES: Dict[Optional[str], Union[LazyDict, dict]] = {
     "chipperv1": {
@@ -140,7 +142,22 @@ def initialize(
     def predict(self, image) -> List[LayoutElement]:
         """Do inference using the wrapped model."""
         tokens, decoder_cross_attentions = self.predict_tokens(image)
-        elements = self.postprocess(image, tokens, decoder_cross_attentions)
+        # The postprocess output is passed through format_table_elements, which moves
+        # html markup into `text_as_html` and leaves tag-free text in `text`.
+        elements = self.format_table_elements(
+            self.postprocess(image, tokens, decoder_cross_attentions),
+        )
+        return elements
+
+    @staticmethod
+    def format_table_elements(
+        elements: List[LayoutElement],
+    ) -> List[LayoutElement]:
+        """Make chipper table elements match the format used by other layout models.
+
+        For every element whose ``text`` is set, html tags are stripped from ``text``
+        (character references are decoded as part of the stripping). When stripping
+        changed the text, i.e. the original string held markup, that original string
+        is stored in ``text_as_html``. Elements whose text holds no markup, or that
+        have no text at all, do not get ``text_as_html`` assigned, so plain-text
+        elements are not reported as html.
+
+        The elements are modified in place and the same ``elements`` object is
+        returned.
+        """
+        for element in elements:
+            if element.text is None:
+                # nothing to strip; strip_tags only accepts strings
+                continue
+            stripped = strip_tags(element.text)
+            if stripped != element.text:
+                # the text contained markup: keep the html form next to the plain text
+                element.text_as_html = element.text
+            element.text = stripped
+
         return elements
 
     def predict_tokens(
diff --git a/unstructured_inference/utils.py b/unstructured_inference/utils.py
@@ -1,5 +1,7 @@
 import os
 from collections.abc import Mapping
+from html.parser import HTMLParser
+from io import StringIO
 from typing import TYPE_CHECKING, Any, Callable, Hashable, Iterable, Iterator, Union
 
 import cv2
@@ -154,3 +156,29 @@ def pad_image_with_background_color(
     new = Image.new(image.mode, (width + pad * 2, height + pad * 2), background_color)
     new.paste(image, (pad, pad))
     return new
+
+
+class MLStripper(HTMLParser):
+    """Simple markup language stripper that collects only the text between tags.
+
+    Feed markup with ``feed``, call ``close`` to flush anything the parser still
+    buffers, then read the accumulated text with ``get_data``.
+    """
+
+    def __init__(self) -> None:
+        # HTMLParser.__init__ already calls reset(), so no explicit reset is needed.
+        # convert_charrefs=True makes the parser decode character references
+        # (e.g. ``&lt;``) before the text reaches handle_data.
+        super().__init__(convert_charrefs=True)
+        self.text = StringIO()
+
+    def handle_data(self, d: str) -> None:
+        """Collect a chunk of text data reported by the parser."""
+        self.text.write(d)
+
+    def get_data(self) -> str:
+        """Return all text collected so far."""
+        return self.text.getvalue()
+
+
+def strip_tags(html: str) -> str:
+    """Strip html tags from the input string and return the remaining text.
+
+    Character references (e.g. ``&lt;``) are decoded in the returned text.
+    """
+    s = MLStripper()
+    s.feed(html)
+    # feed() may hold back trailing text that could still be an unfinished character
+    # reference (e.g. "AT&T"); close() forces the parser to process it.
+    s.close()
+    return s.get_data()

Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-__version__ = "0.7.11-dev3" # pragma: no cover`
	`1`	`+__version__ = "0.7.11" # pragma: no cover`