diff --git a/marker/config/parser.py b/marker/config/parser.py
index 8e6bd8de0..1af9c2dc2 100644
--- a/marker/config/parser.py
+++ b/marker/config/parser.py
@@ -60,6 +60,12 @@ def common_options(fn):
default=False,
help="Disable image extraction.",
)(fn)
+ fn = click.option(
+ "--mathpix",
+ is_flag=True,
+ default=False,
+ help="Use Mathpix for equation processing.",
+ )(fn)
# these are options that need a list transformation, i.e splitting/parsing a string
fn = click.option(
@@ -106,6 +112,8 @@ def generate_config_dict(self) -> Dict[str, any]:
config["pdftext_workers"] = 1
case "disable_image_extraction":
config["extract_images"] = False
+ case "mathpix":
+ config["use_mathpix"] = v
case _:
if k in crawler.attr_set:
config[k] = v
diff --git a/marker/converters/__init__.py b/marker/converters/__init__.py
index 8357a4991..be56fbfe2 100644
--- a/marker/converters/__init__.py
+++ b/marker/converters/__init__.py
@@ -44,7 +44,7 @@ def initialize_processors(self, processor_cls_lst: List[Type[BaseProcessor]]) ->
processors = []
for processor_cls in processor_cls_lst:
processors.append(self.resolve_dependencies(processor_cls))
-
+
simple_llm_processors = [p for p in processors if issubclass(type(p), BaseLLMSimpleBlockProcessor)]
other_processors = [p for p in processors if not issubclass(type(p), BaseLLMSimpleBlockProcessor)]
diff --git a/marker/converters/ocr.py b/marker/converters/ocr.py
index f7e562553..38ee71dd1 100644
--- a/marker/converters/ocr.py
+++ b/marker/converters/ocr.py
@@ -8,6 +8,8 @@
from marker.processors.equation import EquationProcessor
from marker.providers.registry import provider_from_filepath
from marker.renderers.ocr_json import OCRJSONRenderer
+from marker.providers.mathpix import MathpixProvider
+from marker.settings import settings
class OCRConverter(PdfConverter):
@@ -21,6 +23,12 @@ def __init__(self, *args, **kwargs):
self.config["format_lines"] = True
self.renderer = OCRJSONRenderer
+
+ # Initialize Mathpix provider
+ self.mathpix_provider = MathpixProvider(
+ app_id=settings.MATHPIX_APP_ID,
+ app_key=settings.MATHPIX_APP_KEY
+ )
def build_document(self, filepath: str):
provider_cls = provider_from_filepath(filepath)
@@ -32,6 +40,7 @@ def build_document(self, filepath: str):
provider = provider_cls(filepath, self.config)
document = document_builder(provider, layout_builder, line_builder, ocr_builder)
+ # Initialize processors
for processor in self.processor_list:
processor(document)
diff --git a/marker/processors/equation.py b/marker/processors/equation.py
index 36124645c..83f71fb86 100644
--- a/marker/processors/equation.py
+++ b/marker/processors/equation.py
@@ -4,21 +4,19 @@
from bs4 import BeautifulSoup
from ftfy import fix_text, TextFixerConfig
-from surya.recognition import RecognitionPredictor, OCRResult
-
from marker.processors import BaseProcessor
from marker.schema import BlockTypes
from marker.schema.document import Document
from marker.settings import settings
+from marker.providers.mathpix import MathpixProvider
+from surya.recognition import RecognitionPredictor, OCRResult
MATH_TAG_PATTERN = re.compile(r"")
-
class EquationProcessor(BaseProcessor):
"""
A processor for recognizing equations in the document.
"""
-
block_types: Annotated[
Tuple[BlockTypes],
"The block types to process.",
@@ -36,12 +34,25 @@ class EquationProcessor(BaseProcessor):
bool,
"Whether to disable the tqdm progress bar.",
] = False
+ use_mathpix: Annotated[
+ bool,
+ "Whether to use Mathpix for equation processing.",
+ ] = False
def __init__(self, recognition_model: RecognitionPredictor, config=None):
super().__init__(config)
-
self.recognition_model = recognition_model
+        if self.use_mathpix:
+            # Fail at construction time rather than on the first Mathpix request.
+            if not settings.MATHPIX_APP_ID or not settings.MATHPIX_APP_KEY:
+                raise ValueError("Mathpix API credentials not configured")
+            self.mathpix_provider = MathpixProvider(
+                app_id=settings.MATHPIX_APP_ID, app_key=settings.MATHPIX_APP_KEY
+            )
+            # Add TextInlineMath to block types when Mathpix is enabled
+            self.block_types = (BlockTypes.Equation, BlockTypes.TextInlineMath)
+
def get_batch_size(self):
# Set to 1/4th of OCR batch size due to sequence length with tiling
if self.equation_batch_size is not None:
@@ -80,6 +91,35 @@ def __call__(self, document: Document):
if total_equation_blocks == 0:
return
+ if self.use_mathpix:
+ self._process_with_mathpix(images, equation_boxes, equation_block_ids, document)
+ else:
+ self._process_with_recognition(images, equation_boxes, equation_block_ids, document)
+
+    def _process_with_mathpix(self, images, equation_boxes, equation_block_ids, document):
+        """
+        Send each equation crop to Mathpix and store the returned LaTeX on its block.
+
+        `images`, `equation_boxes` and `equation_block_ids` are parallel per-page
+        sequences. A failed crop is reported and skipped so the rest still convert.
+        """
+        for page_image, page_boxes, page_block_ids in zip(images, equation_boxes, equation_block_ids):
+            for box, block_id in zip(page_boxes, page_block_ids):
+                x1, y1, x2, y2 = (int(coord) for coord in box)
+                equation_image = page_image.crop((x1, y1, x2, y2))
+                try:
+                    result = self.mathpix_provider.process_equation(equation_image)
+                    latex = result.get('latex_styled', '')
+                    if latex:
+                        block = document.get_block(block_id)
+                        # Interpolate the LaTeX: an empty f-string would discard it.
+                        # NOTE(review): <math display="block"> wrapper assumed - confirm vs fix_latex.
+                        block.html = self.fix_latex(f'<math display="block">{latex}</math>')
+                except Exception as e:
+                    print(f"Error processing equation {block_id}: {str(e)}")
+                    continue
+
+ def _process_with_recognition(self, images, equation_boxes, equation_block_ids, document):
predictions = self.get_latex_batched(images, equation_boxes)
for page_predictions, page_equation_block_ids in zip(
predictions, equation_block_ids
@@ -138,3 +178,5 @@ def get_latex_batched(
]
return equation_predictions
+
+
diff --git a/marker/providers/mathpix.py b/marker/providers/mathpix.py
new file mode 100644
index 000000000..daa35bcb2
--- /dev/null
+++ b/marker/providers/mathpix.py
@@ -0,0 +1,58 @@
+from typing import Optional, Dict, Any
+import requests
+from PIL import Image
+import io
+import base64
+
+class MathpixProvider:
+    """Minimal client for the Mathpix `v3/text` endpoint, used to OCR equation images."""
+
+    def __init__(self, app_id: str, app_key: str, timeout: float = 30.0):
+        """
+        Args:
+            app_id: Mathpix application id, sent as the `app_id` header.
+            app_key: Mathpix application key, sent as the `app_key` header.
+            timeout: Seconds to wait for the API before `requests` gives up.
+        """
+        self.app_id = app_id
+        self.app_key = app_key
+        self.timeout = timeout
+        self.api_url = "https://api.mathpix.com/v3/text"
+
+    def _encode_image(self, image: Image.Image) -> str:
+        """Convert PIL Image to base64 string"""
+        buffered = io.BytesIO()
+        image.save(buffered, format="PNG")
+        return base64.b64encode(buffered.getvalue()).decode()
+
+    def process_equation(self, image: Image.Image, options: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
+        """
+        Process an equation image using Mathpix API
+
+        Args:
+            image: PIL Image containing the equation
+            options: Additional options for Mathpix API
+
+        Returns:
+            Dict containing the processed equation data
+
+        Raises:
+            requests.RequestException: on timeout, connection failure or HTTP error status.
+        """
+        headers = {
+            "app_id": self.app_id,
+            "app_key": self.app_key,
+            "Content-Type": "application/json",
+        }
+        data = {
+            "src": f"data:image/png;base64,{self._encode_image(image)}",
+            "formats": ["text", "latex_styled"],
+            "data_options": {"include_asciimath": True, "include_latex": True},
+            # Caller-supplied options are spread last so they override the defaults above.
+            **(options or {}),
+        }
+
+        # Without a timeout `requests` waits indefinitely on a stalled connection.
+        response = requests.post(self.api_url, headers=headers, json=data, timeout=self.timeout)
+        response.raise_for_status()
+        return response.json()
\ No newline at end of file
diff --git a/marker/settings.py b/marker/settings.py
index 5660ada90..d5d606150 100644
--- a/marker/settings.py
+++ b/marker/settings.py
@@ -1,11 +1,13 @@
-from typing import Optional
+from typing import Optional, Literal
-from dotenv import find_dotenv
+from dotenv import find_dotenv, load_dotenv
from pydantic import computed_field
from pydantic_settings import BaseSettings
import torch
import os
+# Load environment variables from .env file
+load_dotenv(find_dotenv(".env"))
class Settings(BaseSettings):
# Paths
@@ -30,6 +32,10 @@ class Settings(BaseSettings):
None # Note: MPS device does not work for text detection, and will default to CPU
)
+ # Equation processing settings
+ MATHPIX_APP_ID: str = os.getenv("MATHPIX_APP_ID", "")
+ MATHPIX_APP_KEY: str = os.getenv("MATHPIX_APP_KEY", "")
+
@computed_field
@property
def TORCH_DEVICE_MODEL(self) -> str:
diff --git a/poetry.lock b/poetry.lock
index 90b66ece0..29216101b 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 2.1.1 and should not be changed by hand.
+# This file is automatically @generated by Poetry 2.1.3 and should not be changed by hand.
[[package]]
name = "aiohappyeyeballs"
@@ -3366,10 +3366,10 @@ files = [
[package.dependencies]
numpy = [
- {version = ">=1.21.4", markers = "python_version >= \"3.10\" and platform_system == \"Darwin\""},
- {version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version >= \"3.10\""},
- {version = ">=1.23.5", markers = "python_version >= \"3.11\""},
{version = ">=1.26.0", markers = "python_version >= \"3.12\""},
+ {version = ">=1.23.5", markers = "python_version == \"3.11\""},
+ {version = ">=1.21.4", markers = "python_version == \"3.10\" and platform_system == \"Darwin\""},
+ {version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version == \"3.10\""},
]
[[package]]
@@ -3466,9 +3466,9 @@ files = [
[package.dependencies]
numpy = [
- {version = ">=1.22.4", markers = "python_version < \"3.11\""},
- {version = ">=1.23.2", markers = "python_version == \"3.11\""},
{version = ">=1.26.0", markers = "python_version >= \"3.12\""},
+ {version = ">=1.23.2", markers = "python_version == \"3.11\""},
+ {version = ">=1.22.4", markers = "python_version < \"3.11\""},
]
python-dateutil = ">=2.8.2"
pytz = ">=2020.1"
@@ -3532,18 +3532,20 @@ name = "pdftext"
version = "0.6.2"
description = "Extract structured text from pdfs quickly"
optional = false
-python-versions = "<4.0,>=3.10"
+python-versions = "^3.10"
groups = ["main"]
-files = [
- {file = "pdftext-0.6.2-py3-none-any.whl", hash = "sha256:905d11e62d548e307933c25865a69c8e993947bb5b40b1535b0a2aa8f07a71d4"},
- {file = "pdftext-0.6.2.tar.gz", hash = "sha256:ff5b92462ac03ae63a23429384ae123d45c162dcda30e7bf2c5c92a6b208c9de"},
-]
+files = []
+develop = true
[package.dependencies]
-click = ">=8.1.8,<9.0.0"
-pydantic = ">=2.7.1,<3.0.0"
-pydantic-settings = ">=2.2.1,<3.0.0"
-pypdfium2 = "4.30.0"
+click = "^8.1.8"
+pydantic = "^2.7.1"
+pydantic-settings = "^2.2.1"
+pypdfium2 = "=4.30.1"
+
+[package.source]
+type = "directory"
+url = "../pdftext"
[[package]]
name = "pexpect"
@@ -4264,25 +4266,25 @@ windows-terminal = ["colorama (>=0.4.6)"]
[[package]]
name = "pypdfium2"
-version = "4.30.0"
+version = "4.30.1"
description = "Python bindings to PDFium"
optional = false
python-versions = ">=3.6"
groups = ["main"]
files = [
- {file = "pypdfium2-4.30.0-py3-none-macosx_10_13_x86_64.whl", hash = "sha256:b33ceded0b6ff5b2b93bc1fe0ad4b71aa6b7e7bd5875f1ca0cdfb6ba6ac01aab"},
- {file = "pypdfium2-4.30.0-py3-none-macosx_11_0_arm64.whl", hash = "sha256:4e55689f4b06e2d2406203e771f78789bd4f190731b5d57383d05cf611d829de"},
- {file = "pypdfium2-4.30.0-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4e6e50f5ce7f65a40a33d7c9edc39f23140c57e37144c2d6d9e9262a2a854854"},
- {file = "pypdfium2-4.30.0-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:3d0dd3ecaffd0b6dbda3da663220e705cb563918249bda26058c6036752ba3a2"},
- {file = "pypdfium2-4.30.0-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:cc3bf29b0db8c76cdfaac1ec1cde8edf211a7de7390fbf8934ad2aa9b4d6dfad"},
- {file = "pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f1f78d2189e0ddf9ac2b7a9b9bd4f0c66f54d1389ff6c17e9fd9dc034d06eb3f"},
- {file = "pypdfium2-4.30.0-py3-none-musllinux_1_1_aarch64.whl", hash = "sha256:5eda3641a2da7a7a0b2f4dbd71d706401a656fea521b6b6faa0675b15d31a163"},
- {file = "pypdfium2-4.30.0-py3-none-musllinux_1_1_i686.whl", hash = "sha256:0dfa61421b5eb68e1188b0b2231e7ba35735aef2d867d86e48ee6cab6975195e"},
- {file = "pypdfium2-4.30.0-py3-none-musllinux_1_1_x86_64.whl", hash = "sha256:f33bd79e7a09d5f7acca3b0b69ff6c8a488869a7fab48fdf400fec6e20b9c8be"},
- {file = "pypdfium2-4.30.0-py3-none-win32.whl", hash = "sha256:ee2410f15d576d976c2ab2558c93d392a25fb9f6635e8dd0a8a3a5241b275e0e"},
- {file = "pypdfium2-4.30.0-py3-none-win_amd64.whl", hash = "sha256:90dbb2ac07be53219f56be09961eb95cf2473f834d01a42d901d13ccfad64b4c"},
- {file = "pypdfium2-4.30.0-py3-none-win_arm64.whl", hash = "sha256:119b2969a6d6b1e8d55e99caaf05290294f2d0fe49c12a3f17102d01c441bd29"},
- {file = "pypdfium2-4.30.0.tar.gz", hash = "sha256:48b5b7e5566665bc1015b9d69c1ebabe21f6aee468b509531c3c8318eeee2e16"},
+ {file = "pypdfium2-4.30.1-py3-none-macosx_10_13_x86_64.whl", hash = "sha256:e07c47633732cc18d890bb7e965ad28a9c5a932e548acb928596f86be2e5ae37"},
+ {file = "pypdfium2-4.30.1-py3-none-macosx_11_0_arm64.whl", hash = "sha256:5ea2d44e96d361123b67b00f527017aa9c847c871b5714e013c01c3eb36a79fe"},
+ {file = "pypdfium2-4.30.1-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1de7a3a36803171b3f66911131046d65a732f9e7834438191cb58235e6163c4e"},
+ {file = "pypdfium2-4.30.1-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:b8a4231efb13170354f568c722d6540b8d5b476b08825586d48ef70c40d16e03"},
+ {file = "pypdfium2-4.30.1-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6f434a4934e8244aa95343ffcf24e9ad9f120dbb4785f631bb40a88c39292493"},
+ {file = "pypdfium2-4.30.1-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f454032a0bc7681900170f67d8711b3942824531e765f91c2f5ce7937f999794"},
+ {file = "pypdfium2-4.30.1-py3-none-musllinux_1_1_aarch64.whl", hash = "sha256:bbf9130a72370ee9d602e39949b902db669a2a1c24746a91e5586eb829055d9f"},
+ {file = "pypdfium2-4.30.1-py3-none-musllinux_1_1_i686.whl", hash = "sha256:5cb52884b1583b96e94fd78542c63bb42e06df5e8f9e52f8f31f5ad5a1e53367"},
+ {file = "pypdfium2-4.30.1-py3-none-musllinux_1_1_x86_64.whl", hash = "sha256:1a9e372bd4867ff223cc8c338e33fe11055dad12f22885950fc27646cc8d9122"},
+ {file = "pypdfium2-4.30.1-py3-none-win32.whl", hash = "sha256:421f1cf205e213e07c1f2934905779547f4f4a2ff2f59dde29da3d511d3fc806"},
+ {file = "pypdfium2-4.30.1-py3-none-win_amd64.whl", hash = "sha256:598a7f20264ab5113853cba6d86c4566e4356cad037d7d1f849c8c9021007e05"},
+ {file = "pypdfium2-4.30.1-py3-none-win_arm64.whl", hash = "sha256:c2b6d63f6d425d9416c08d2511822b54b8e3ac38e639fc41164b1d75584b3a8c"},
+ {file = "pypdfium2-4.30.1.tar.gz", hash = "sha256:5f5c7c6d03598e107d974f66b220a49436aceb191da34cda5f692be098a814ce"},
]
[[package]]
@@ -5390,30 +5392,32 @@ snowflake = ["snowflake-connector-python (>=3.3.0) ; python_version < \"3.12\"",
[[package]]
name = "surya-ocr"
-version = "0.14.2"
+version = "0.14.5"
description = "OCR, layout, reading order, and table recognition in 90+ languages"
optional = false
-python-versions = "<4.0,>=3.10"
+python-versions = "^3.10"
groups = ["main"]
-files = [
- {file = "surya_ocr-0.14.2-py3-none-any.whl", hash = "sha256:0c402705c860f8bf98fc2bf2a3b49d7f0e16fba587aed6d3f01bb53bb776d283"},
- {file = "surya_ocr-0.14.2.tar.gz", hash = "sha256:852af681073167beba9a638658c70b81318f1a8f3d558db68dead1b2c391e862"},
-]
+files = []
+develop = true
[package.dependencies]
-click = ">=8.1.8,<9.0.0"
-einops = ">=0.8.1,<0.9.0"
-filetype = ">=1.2.0,<2.0.0"
-opencv-python-headless = ">=4.11.0.86,<5.0.0.0"
-pillow = ">=10.2.0,<11.0.0"
-platformdirs = ">=4.3.6,<5.0.0"
-pre-commit = ">=4.2.0,<5.0.0"
-pydantic = ">=2.5.3,<3.0.0"
-pydantic-settings = ">=2.1.0,<3.0.0"
-pypdfium2 = "4.30.0"
-python-dotenv = ">=1.0.0,<2.0.0"
-torch = ">=2.7.0,<3.0.0"
-transformers = ">=4.51.2,<5.0.0"
+click = "^8.1.8"
+einops = "^0.8.1"
+filetype = "^1.2.0"
+opencv-python-headless = "^4.11.0.86"
+pillow = "^10.2.0"
+platformdirs = "^4.3.6"
+pre-commit = "^4.2.0"
+pydantic = "^2.5.3"
+pydantic-settings = "^2.1.0"
+pypdfium2 = "=4.30.1"
+python-dotenv = "^1.0.0"
+torch = "^2.7.0"
+transformers = "^4.51.2"
+
+[package.source]
+type = "directory"
+url = "../surya"
[[package]]
name = "sympy"
@@ -6505,4 +6509,4 @@ full = ["ebooklib", "mammoth", "openpyxl", "python-pptx", "weasyprint"]
[metadata]
lock-version = "2.1"
python-versions = "^3.10"
-content-hash = "c18debb8d18aec4081c31ff32f9dc2bde6f4c0189f1d7647bb6061f685e0e319"
+content-hash = "484459202f1148269601972c07e461c46f51f424919485090b707b350ce7fd74"
diff --git a/pyproject.toml b/pyproject.toml
index 2056da009..a5f82aea4 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -26,9 +26,9 @@ torch = "^2.7.0"
tqdm = "^4.66.1"
ftfy = "^6.1.1"
rapidfuzz = "^3.8.1"
-surya-ocr = "^0.14.2"
+surya-ocr = {path = "../surya", develop = true}
regex = "^2024.4.28"
-pdftext = "~0.6.2"
+pdftext = {path = "../pdftext", develop = true}
markdownify = "^0.13.1"
click = "^8.2.0"
markdown2 = "^2.5.2"
@@ -75,4 +75,4 @@ marker_server = "marker.scripts.server:server_cli"
[build-system]
requires = ["poetry-core"]
-build-backend = "poetry.core.masonry.api"
\ No newline at end of file
+build-backend = "poetry.core.masonry.api"
diff --git a/test_clipping.py b/test_clipping.py
new file mode 100644
index 000000000..7fc9d65a4
--- /dev/null
+++ b/test_clipping.py
@@ -0,0 +1,110 @@
+import os
+import argparse
+import pypdfium2 as pdfium
+from PIL import Image, ImageDraw
+import pypdfium2.raw as pdfium_c
+import ctypes
+
+def get_text_from_raw_text_obj(raw_text_obj, page):
+    """Return the text held by one raw PDFium text object on `page` ("" when none can be read)."""
+    text_page = pdfium_c.FPDFText_LoadPage(page.raw)
+    if not text_page:
+        return ""
+    try:
+        needed = pdfium_c.FPDFTextObj_GetText(raw_text_obj, text_page, None, 0)
+        if needed > 0:
+            utf16_units = (ctypes.c_ushort * needed)()
+            pdfium_c.FPDFTextObj_GetText(raw_text_obj, text_page, utf16_units, needed)
+            return bytearray(utf16_units).decode('utf-16-le').rstrip('\x00')
+        return ""
+    finally:
+        pdfium_c.FPDFText_ClosePage(text_page)
+
+def draw_text_objects(page, clip_region, output_path):
+ """Search the page's text objects for the text '30075' and draw object bounds (red) and clip-path outlines (blue) on a blank page-sized image."""
+ objects = list(page.get_objects())
+ scale = 2.0
+ bitmap = page.render(scale=scale)
+ pil_image = bitmap.to_pil()
+ page_width, page_height = pil_image.size  # the render is only used for its pixel size
+ boxes_image = Image.new('RGB', (page_width, page_height), 'white')
+ boxes_draw = ImageDraw.Draw(boxes_image)
+ found = False  # clip_region is accepted but never read in this function
+ for i, obj in enumerate(objects):
+ if obj.type == 1: # 1 = text object
+ text = get_text_from_raw_text_obj(obj.raw, page)
+ if text and text.strip() == "30075":
+ found = True
+ print(f"Found text '30075' in object {i+1}")
+ # Get and draw object bounding box (red)
+ left = ctypes.c_float()
+ bottom = ctypes.c_float()
+ right = ctypes.c_float()
+ top = ctypes.c_float()
+ success = pdfium_c.FPDFPageObj_GetBounds(obj.raw, ctypes.byref(left), ctypes.byref(bottom), ctypes.byref(right), ctypes.byref(top))
+ if success:
+ print(f"Object bounding box: left={left.value}, bottom={bottom.value}, right={right.value}, top={top.value}")
+ pil_top = page_height - (top.value * scale)  # y is flipped: scaled PDF y is subtracted from the image height
+ pil_bottom = page_height - (bottom.value * scale)
+ pil_left = left.value * scale
+ pil_right = right.value * scale
+ boxes_draw.rectangle([pil_left, pil_top, pil_right, pil_bottom], outline='red', width=3)
+ else:
+ print("Could not get object bounding box.")
+ # Try to get and draw actual clipping path (blue)
+ try:
+ clip_path = pdfium_c.FPDFPageObj_GetClipPath(obj.raw)
+ if clip_path:
+ if all(hasattr(pdfium_c, fn) for fn in [
+ "FPDFClipPath_CountPaths", "FPDFClipPath_CountPathSegments", "FPDFClipPath_GetPathSegment", "FPDFPathSegment_GetPoint", "FPDFPathSegment_GetType", "FPDFPathSegment_GetClose"]):
+ num_paths = pdfium_c.FPDFClipPath_CountPaths(clip_path)
+ for path_idx in range(num_paths):
+ num_segs = pdfium_c.FPDFClipPath_CountPathSegments(clip_path, path_idx)
+ points = []
+ for seg_idx in range(num_segs):
+ seg = pdfium_c.FPDFClipPath_GetPathSegment(clip_path, path_idx, seg_idx)
+ x = ctypes.c_float()
+ y = ctypes.c_float()
+ pdfium_c.FPDFPathSegment_GetPoint(seg, ctypes.byref(x), ctypes.byref(y))
+ pil_x = x.value * scale
+ pil_y = page_height - (y.value * scale)
+ points.append((pil_x, pil_y))
+ # Log the raw PDF coordinates and the PIL coordinates
+ print(f"Object {i+1}, path {path_idx}, seg {seg_idx}: PDF ({x.value}, {y.value}) -> PIL ({pil_x}, {pil_y})")
+ # Check if path is closed (only the last segment's close flag is consulted)
+ closed = False
+ if num_segs > 0:
+ last_seg = pdfium_c.FPDFClipPath_GetPathSegment(clip_path, path_idx, num_segs-1)
+ closed = bool(pdfium_c.FPDFPathSegment_GetClose(last_seg))
+ if len(points) > 1:
+ if closed:
+ boxes_draw.polygon(points, outline='blue')
+ else:
+ boxes_draw.line(points, fill='blue', width=3)
+ else:
+ print("Clipping path exists, but path segment functions are not available in this pypdfium2 version.")
+ else:
+ print("No clipping path for this object.")
+ except Exception as e:
+ print(f"No clipping path or error: {e}")
+ if not found:
+ print("No text object with text '30075' found.")
+
+ boxes_output_path = output_path.replace('.png', '_boxes.png')  # image is saved under this derived name
+ boxes_image.save(boxes_output_path)
+ print(f"Boxes-only visualization saved to {boxes_output_path}")
+
+def main():
+    """Parse the CLI arguments, open the requested page and draw its text-object boxes under test_output/."""
+    cli = argparse.ArgumentParser(description='Visualize PDF text objects with clipping')
+    cli.add_argument('pdf_path', help='Path to the PDF file')
+    cli.add_argument('--page', type=int, default=0, help='Page number (0-based)')
+    opts = cli.parse_args()
+    out_dir = "test_output"
+    os.makedirs(out_dir, exist_ok=True)
+    document = pdfium.PdfDocument(opts.pdf_path)
+    png_path = os.path.join(out_dir, f"page_{opts.page}.png")
+    draw_text_objects(document[opts.page], None, png_path)
+
+if __name__ == '__main__':
+    main()
diff --git a/visualize_matching_boxes.py b/visualize_matching_boxes.py
new file mode 100644
index 000000000..2062fb96d
--- /dev/null
+++ b/visualize_matching_boxes.py
@@ -0,0 +1,207 @@
+import argparse
+import pypdfium2 as pdfium
+import pypdfium2.raw as pdfium_c
+import ctypes
+from PIL import Image, ImageDraw
+
+def get_text_from_raw_text_obj(raw_text_obj, page):
+    """Return the text held by one raw PDFium text object on `page` ("" when none can be read)."""
+    text_page = pdfium_c.FPDFText_LoadPage(page.raw)
+    if not text_page:
+        return ""
+    try:
+        if (needed := pdfium_c.FPDFTextObj_GetText(raw_text_obj, text_page, None, 0)) <= 0:
+            return ""
+        utf16_units = (ctypes.c_ushort * needed)()
+        pdfium_c.FPDFTextObj_GetText(raw_text_obj, text_page, utf16_units, needed)
+        return bytearray(utf16_units).decode('utf-16-le').rstrip('\x00')
+    finally:
+        pdfium_c.FPDFText_ClosePage(text_page)
+
+def boxes_intersect(box1, box2):
+    """Return True when two boxes overlap, using the fz_glyph_entirely_outside_box test.
+
+    Boxes that only touch along an edge are reported as not intersecting.
+
+    Args:
+        box1, box2: Tuples of (x0, y0, x1, y1) where (x0,y0) is bottom-left, (x1,y1) is top-right
+
+    Returns:
+        True if boxes intersect, False if they are entirely separate
+    """
+    apart = (
+        box1[2] <= box2[0] or box1[3] <= box2[1]
+        or box1[0] >= box2[2] or box1[1] >= box2[3]
+    )
+    return not apart
+
+def draw_box(box, boxes_draw, scale, page_height, color='gray', width=1):
+    """Outline one PDF-space box on the image; y is flipped against `page_height`.
+
+    Args:
+        box: Tuple of (x0, y0, x1, y1) coordinates in PDF space
+        boxes_draw: ImageDraw object to draw on
+        scale: Scale factor for coordinate conversion
+        page_height: Height of the page in pixels
+        color: Color of the outline
+        width: Width of the outline
+    """
+    left, right = box[0] * scale, box[2] * scale
+    upper = page_height - (box[3] * scale)
+    lower = page_height - (box[1] * scale)
+    corners = [left, upper, right, lower]
+    boxes_draw.rectangle(corners, outline=color, width=width)
+
+def draw_matching_boxes(page, output_path):
+ """Render the page, outline the crop box, text-object bounds and clip-path bounds on it, save it to output_path and print counts."""
+ objects = list(page.get_objects())
+ scale = 2.0
+ bitmap = page.render(scale=scale)
+ pil_image = bitmap.to_pil()
+ page_width, page_height = pil_image.size  # page_width is not used below
+ boxes_draw = ImageDraw.Draw(pil_image)
+
+ # Check for required clip path functions once
+ required_clip_fns = [
+ "FPDFClipPath_CountPaths", "FPDFClipPath_CountPathSegments",
+ "FPDFClipPath_GetPathSegment", "FPDFPathSegment_GetPoint",
+ "FPDFPathSegment_GetType", "FPDFPathSegment_GetClose"
+ ]
+ has_clip_path_api = all(hasattr(pdfium_c, fn) for fn in required_clip_fns)
+ if not has_clip_path_api:
+ raise RuntimeError("Required PDFium clip path API functions are missing in pdfium_c. Please check your PDFium installation.")
+
+ # Draw the page crop box in green
+ try:
+ left = ctypes.c_float()
+ bottom = ctypes.c_float()
+ right = ctypes.c_float()
+ top = ctypes.c_float()
+ success = pdfium_c.FPDFPage_GetCropBox(page.raw, ctypes.byref(left), ctypes.byref(bottom), ctypes.byref(right), ctypes.byref(top))
+ if success:
+ crop_box = (left.value, bottom.value, right.value, top.value)
+ print(f"Page crop box: ({crop_box[0]:.2f}, {crop_box[1]:.2f}, {crop_box[2]:.2f}, {crop_box[3]:.2f})")
+ draw_box(crop_box, boxes_draw, scale, page_height, color='green', width=2)
+ else:
+ print("Could not get page crop box")
+ except Exception as e:
+ print(f"Error getting page crop box: {e}")
+
+ # Initialize counters for statistics
+ total_objects = len(objects)
+ text_objects = 0
+ visible_objects = 0
+ clipped_objects = 0
+
+ # Load textpage once for efficiency
+ textpage = pdfium_c.FPDFText_LoadPage(page.raw)
+ if not textpage:
+ print("Warning: Could not load textpage for text checking")
+
+ for i, obj in enumerate(objects):
+ # Check if object is a text object and has text content
+ obj_type = pdfium_c.FPDFPageObj_GetType(obj.raw)
+ if obj_type != 1: # FPDF_PAGEOBJ_TEXT = 1
+ continue
+
+ # Check if text object has any text content (without extracting it)
+ if textpage:
+ buflen = pdfium_c.FPDFTextObj_GetText(obj.raw, textpage, None, 0)
+ if buflen <= 0: # No text content
+ continue
+
+ text_objects += 1
+
+ # Get object bounding box
+ left = ctypes.c_float()
+ bottom = ctypes.c_float()
+ right = ctypes.c_float()
+ top = ctypes.c_float()
+ success = pdfium_c.FPDFPageObj_GetBounds(obj.raw, ctypes.byref(left), ctypes.byref(bottom), ctypes.byref(right), ctypes.byref(top))
+ if not success:
+ print(f"Object {i+1}: Could not get object bounding box.")
+ continue
+ obj_box = (left.value, bottom.value, right.value, top.value)
+ #print(f"Object {i+1}: Bounding box: ({obj_box[0]:.2f}, {obj_box[1]:.2f}, {obj_box[2]:.2f}, {obj_box[3]:.2f})")
+ # Try to get clipping path
+ show_box = True
+ try:
+ clip_path = pdfium_c.FPDFPageObj_GetClipPath(obj.raw)
+ if clip_path:
+ # Collect all points from all paths to calculate a bounding rectangle
+ all_points = []
+ num_paths = pdfium_c.FPDFClipPath_CountPaths(clip_path)
+ for path_idx in range(num_paths):
+ num_segs = pdfium_c.FPDFClipPath_CountPathSegments(clip_path, path_idx)
+ for seg_idx in range(num_segs):
+ seg = pdfium_c.FPDFClipPath_GetPathSegment(clip_path, path_idx, seg_idx)
+ x = ctypes.c_float()
+ y = ctypes.c_float()
+ pdfium_c.FPDFPathSegment_GetPoint(seg, ctypes.byref(x), ctypes.byref(y))
+ all_points.append((x.value, y.value))
+
+ if all_points:
+ # Calculate the minimal bounding rectangle that fits the clip path
+ xs = [pt[0] for pt in all_points]
+ ys = [pt[1] for pt in all_points]
+ clip_box = (min(xs), min(ys), max(xs), max(ys))
+ #print(f"Object {i+1}: Clip path bounding box: ({clip_box[0]:.2f}, {clip_box[1]:.2f}, {clip_box[2]:.2f}, {clip_box[3]:.2f}), # of paths: {num_paths}, # of segments: {num_segs}")
+
+ # Hide the object's bounding box when it lies entirely outside the clip box
+ if not boxes_intersect(obj_box, clip_box):
+ show_box = False
+ clipped_objects += 1
+ else:
+ # Draw the clip path bounding box in blue
+ draw_box(clip_box, boxes_draw, scale, page_height, color='blue')
+
+ # If boxes intersect, draw in red and extract text
+ draw_box(obj_box, boxes_draw, scale, page_height, color='red', width=3)
+ show_box = True  # already True here (set before the try)
+ visible_objects += 1
+
+ # Extract and print text for red boxes
+ if textpage:
+ buflen = pdfium_c.FPDFTextObj_GetText(obj.raw, textpage, None, 0)
+ if buflen > 0:
+ buf = (ctypes.c_ushort * buflen)()
+ pdfium_c.FPDFTextObj_GetText(obj.raw, textpage, buf, buflen)
+ byte_buf = bytearray(buf)
+ text_content = byte_buf.decode('utf-16-le').rstrip('\x00')
+ utf16_bytes = text_content.encode('utf-16-le')
+ hex_bytes = ' '.join(f'{b:02x}' for b in utf16_bytes)
+ print(f"Object {i+1} text (red box): '{text_content}', utf16 bytes: {hex_bytes}")
+ else:
+ print(f"Object {i+1}: No clipping path.")
+ except Exception as e:
+ print(f"Object {i+1}: Error getting clip path: {e}")
+ if show_box:  # NOTE(review): still True after the red-box path above, which already bumped visible_objects - confirm this is not a double count
+ draw_box(obj_box, boxes_draw, scale, page_height)
+ visible_objects += 1
+
+ pil_image.save(output_path)
+ print(f"Matching boxes visualization saved to {output_path}")
+
+ # Close textpage to avoid memory leaks (NOTE(review): no try/finally is visible, so an exception escaping the loop would skip this - confirm)
+ if textpage:
+ pdfium_c.FPDFText_ClosePage(textpage)
+
+ # Print statistics
+ print(f"\nStatistics:")
+ print(f"Total objects: {total_objects}")
+ print(f"Text objects: {text_objects}")
+ print(f"Visible objects: {visible_objects}")
+ print(f"Clipped objects: {clipped_objects}")
+
+def main():
+    """Parse the CLI arguments, open the requested page and write its matching-boxes visualization."""
+    cli = argparse.ArgumentParser(description='Visualize matching bounding boxes and clip paths for all objects')
+    cli.add_argument('pdf_path', help='Path to the PDF file')
+    cli.add_argument('--page', type=int, default=0, help='Page number (0-based)')
+    cli.add_argument('--output', type=str, default='matching_boxes.png', help='Output image path')
+    opts = cli.parse_args()
+    document = pdfium.PdfDocument(opts.pdf_path)
+    draw_matching_boxes(document[opts.page], opts.output)
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file