Chore: allow table model to accept optional OCR data (#256)

yuming-long · web-flow · commit e8caa4eee018 · 2023-10-17T12:56:04.000-04:00
## Summary

Change `run_prediction` in the table model to accept optional OCR tokens. As part of the table OCR refactor, callers can now pass the tokens in directly as an alternative to having the model obtain them through `get_tokens`.

## TODO

Please see [CORE-2259](https://unstructured-ai.atlassian.net/browse/CORE-2259) to update `ocr_token` from a dict to a data class.

[CORE-2259]: https://unstructured-ai.atlassian.net/browse/CORE-2259?atlOrigin=eyJpIjoiNWRkNTljNzYxNjVmNDY3MDlhMDU5Y2ZhYzA5YTRkZjUiLCJwIjoiZ2l0aHViLWNvbS1KU1cifQ
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,7 @@
+## 0.7.9
+
+* Allow table model to accept optional OCR tokens
+
 ## 0.7.8
 
 * Fix: include onnx as base dependency.
diff --git a/test_unstructured_inference/models/test_tables.py b/test_unstructured_inference/models/test_tables.py
@@ -361,6 +361,21 @@ def test_table_prediction_tesseract(table_transformer, example_image):
     ) in prediction
 
 
+def test_table_prediction_tesseract_with_ocr_tokens(table_transformer, example_image):
+    ocr_tokens = [
+        {
+            # bounding box should match table structure
+            "bbox": [70.0, 245.0, 127.0, 266.0],
+            "block_num": 0,
+            "line_num": 0,
+            "span_num": 0,
+            "text": "Blind",
+        },
+    ]
+    prediction = table_transformer.predict(example_image, ocr_tokens=ocr_tokens)
+    assert prediction == "<table><tr><td>Blind</td></tr></table>"
+
+
 @pytest.mark.skipif(skip_outside_ci, reason="Skipping paddle test run outside of CI")
 def test_table_prediction_paddle(monkeypatch, example_image):
     monkeypatch.setenv("TABLE_OCR", "paddle")
diff --git a/unstructured_inference/__version__.py b/unstructured_inference/__version__.py
@@ -1 +1 @@
-__version__ = "0.7.8"  # pragma: no cover
+__version__ = "0.7.9"  # pragma: no cover
diff --git a/unstructured_inference/models/tables.py b/unstructured_inference/models/tables.py
@@ -5,7 +5,7 @@
 import xml.etree.ElementTree as ET
 from collections import defaultdict
 from pathlib import Path
-from typing import List, Optional, Union
+from typing import Dict, List, Optional, Union
 
 import cv2
 import numpy as np
@@ -33,10 +33,24 @@ class UnstructuredTableTransformerModel(UnstructuredModel):
     def __init__(self):
         pass
 
-    def predict(self, x: Image):
-        """Predict table structure deferring to run_prediction"""
+    def predict(self, x: Image, ocr_tokens: Optional[List[Dict]] = None):
+        """Predict table structure deferring to run_prediction with ocr tokens
+
+        Note:
+        `ocr_tokens` is a list of dictionaries representing OCR tokens,
+        where each dictionary has the following format:
+        {
+            "bbox": [int, int, int, int],  # Bounding box coordinates of the token
+            "block_num": int,  # Block number
+            "line_num": int,   # Line number
+            "span_num": int,   # Span number
+            "text": str,  # Text content of the token
+        }
+        The bounding box coordinates should match the table structure.
+        FIXME: refactor token data into a dataclass so we have clear expectations of the fields
+        """
         super().predict(x)
-        return self.run_prediction(x)
+        return self.run_prediction(x, ocr_tokens=ocr_tokens)
 
     def initialize(
         self,
@@ -161,12 +175,18 @@ def run_prediction(
         self,
         x: Image,
         pad_for_structure_detection: int = inference_config.TABLE_IMAGE_BACKGROUND_PAD,
+        ocr_tokens: Optional[List[Dict]] = None,
     ):
         """Predict table structure"""
         outputs_structure = self.get_structure(x, pad_for_structure_detection)
-        tokens = self.get_tokens(x=x)
+        if ocr_tokens is None:
+            logger.warning(
+                "Table OCR from get_tokens method will be deprecated. "
+                "In the future the OCR tokens are expected to be passed in.",
+            )
+            ocr_tokens = self.get_tokens(x=x)
 
-        html = recognize(outputs_structure, x, tokens=tokens, out_html=True)["html"]
+        html = recognize(outputs_structure, x, tokens=ocr_tokens, out_html=True)["html"]
         prediction = html[0] if html else ""
         return prediction
 

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
|  |  | `@@ -1 +1 @@` |
| `1` |  | `-__version__ = "0.7.8" # pragma: no cover` |
|  | `1` | `+__version__ = "0.7.9" # pragma: no cover` |