diff --git a/marker/config/parser.py b/marker/config/parser.py
index 8e6bd8de0..1af9c2dc2 100644
--- a/marker/config/parser.py
+++ b/marker/config/parser.py
@@ -60,6 +60,12 @@ def common_options(fn):
             default=False,
             help="Disable image extraction.",
         )(fn)
+        fn = click.option(
+            "--mathpix",
+            is_flag=True,
+            default=False,
+            help="Use Mathpix for equation processing.",
+        )(fn)
 
         # these are options that need a list transformation, i.e splitting/parsing a string
         fn = click.option(
@@ -106,6 +112,8 @@ def generate_config_dict(self) -> Dict[str, any]:
                     config["pdftext_workers"] = 1
                 case "disable_image_extraction":
                     config["extract_images"] = False
+                case "mathpix":
+                    config["use_mathpix"] = v
                 case _:
                     if k in crawler.attr_set:
                         config[k] = v
diff --git a/marker/converters/__init__.py b/marker/converters/__init__.py
index 8357a4991..be56fbfe2 100644
--- a/marker/converters/__init__.py
+++ b/marker/converters/__init__.py
@@ -44,7 +44,7 @@ def initialize_processors(self, processor_cls_lst: List[Type[BaseProcessor]]) ->
         processors = []
         for processor_cls in processor_cls_lst:
             processors.append(self.resolve_dependencies(processor_cls))
-
+            
         simple_llm_processors = [p for p in processors if issubclass(type(p), BaseLLMSimpleBlockProcessor)]
         other_processors = [p for p in processors if not issubclass(type(p), BaseLLMSimpleBlockProcessor)]
 
diff --git a/marker/converters/ocr.py b/marker/converters/ocr.py
index f7e562553..38ee71dd1 100644
--- a/marker/converters/ocr.py
+++ b/marker/converters/ocr.py
@@ -8,6 +8,8 @@
 from marker.processors.equation import EquationProcessor
 from marker.providers.registry import provider_from_filepath
 from marker.renderers.ocr_json import OCRJSONRenderer
+from marker.providers.mathpix import MathpixProvider
+from marker.settings import settings
 
 
 class OCRConverter(PdfConverter):
@@ -21,6 +23,12 @@ def __init__(self, *args, **kwargs):
 
         self.config["format_lines"] = True
         self.renderer = OCRJSONRenderer
+        
+        # Initialize Mathpix provider
+        self.mathpix_provider = MathpixProvider(
+            app_id=settings.MATHPIX_APP_ID,
+            app_key=settings.MATHPIX_APP_KEY
+        )
 
     def build_document(self, filepath: str):
         provider_cls = provider_from_filepath(filepath)
@@ -32,6 +40,7 @@ def build_document(self, filepath: str):
         provider = provider_cls(filepath, self.config)
         document = document_builder(provider, layout_builder, line_builder, ocr_builder)
 
+        # Initialize processors
         for processor in self.processor_list:
             processor(document)
 
diff --git a/marker/processors/equation.py b/marker/processors/equation.py
index 36124645c..83f71fb86 100644
--- a/marker/processors/equation.py
+++ b/marker/processors/equation.py
@@ -4,21 +4,19 @@
 from bs4 import BeautifulSoup
 
 from ftfy import fix_text, TextFixerConfig
-from surya.recognition import RecognitionPredictor, OCRResult
-
 from marker.processors import BaseProcessor
 from marker.schema import BlockTypes
 from marker.schema.document import Document
 from marker.settings import settings
+from marker.providers.mathpix import MathpixProvider
+from surya.recognition import RecognitionPredictor, OCRResult
 
 MATH_TAG_PATTERN = re.compile(r"<math[^>]*>(.*?)</math>")
 
-
 class EquationProcessor(BaseProcessor):
     """
     A processor for recognizing equations in the document.
     """
-
     block_types: Annotated[
         Tuple[BlockTypes],
         "The block types to process.",
@@ -36,12 +34,25 @@ class EquationProcessor(BaseProcessor):
         bool,
         "Whether to disable the tqdm progress bar.",
     ] = False
+    use_mathpix: Annotated[
+        bool,
+        "Whether to use Mathpix for equation processing.",
+    ] = False
 
     def __init__(self, recognition_model: RecognitionPredictor, config=None):
         super().__init__(config)
-
         self.recognition_model = recognition_model
 
+        # Plain truthiness, matching the `if self.use_mathpix:` dispatch in __call__.
+        if self.use_mathpix:
+            if not settings.MATHPIX_APP_ID or not settings.MATHPIX_APP_KEY:
+                raise ValueError("Mathpix API credentials not configured")
+            self.mathpix_provider = MathpixProvider(
+                app_id=settings.MATHPIX_APP_ID, app_key=settings.MATHPIX_APP_KEY
+            )
+            # Add TextInlineMath to block types when Mathpix is enabled
+            self.block_types = (BlockTypes.Equation, BlockTypes.TextInlineMath)
+
     def get_batch_size(self):
         # Set to 1/4th of OCR batch size due to sequence length with tiling
         if self.equation_batch_size is not None:
@@ -80,6 +91,35 @@ def __call__(self, document: Document):
         if total_equation_blocks == 0:
             return
 
+        if self.use_mathpix:
+            self._process_with_mathpix(images, equation_boxes, equation_block_ids, document)
+        else:
+            self._process_with_recognition(images, equation_boxes, equation_block_ids, document)
+
+    def _process_with_mathpix(self, images, equation_boxes, equation_block_ids, document):
+        """Replace each equation block's HTML with LaTeX recognized by Mathpix.
+
+        The per-page images, boxes and block ids are walked in lockstep and
+        every box is sent to Mathpix as its own cropped image. Best effort: an
+        error while handling a block is printed and that block is skipped.
+        """
+        for page_image, page_boxes, page_block_ids in zip(
+            images, equation_boxes, equation_block_ids
+        ):
+            for box, block_id in zip(page_boxes, page_block_ids):
+                # Box coordinates are truncated to whole pixels before cropping.
+                x1, y1, x2, y2 = (int(coord) for coord in box)
+                equation_image = page_image.crop((x1, y1, x2, y2))
+                try:
+                    result = self.mathpix_provider.process_equation(equation_image)
+                    latex = result.get('latex_styled', '')
+                    if latex:
+                        block = document.get_block(block_id)
+                        block.html = self.fix_latex(f'<math display="block">{latex}</math>')
+                except Exception as e:
+                    print(f"Error processing equation {block_id}: {e}")
+
+    def _process_with_recognition(self, images, equation_boxes, equation_block_ids, document):
         predictions = self.get_latex_batched(images, equation_boxes)
         for page_predictions, page_equation_block_ids in zip(
             predictions, equation_block_ids
@@ -138,3 +178,5 @@ def get_latex_batched(
         ]
 
         return equation_predictions
+
+
diff --git a/marker/providers/mathpix.py b/marker/providers/mathpix.py
new file mode 100644
index 000000000..daa35bcb2
--- /dev/null
+++ b/marker/providers/mathpix.py
@@ -0,0 +1,58 @@
+from typing import Optional, Dict, Any
+import requests
+from PIL import Image
+import io
+import base64
+
+class MathpixProvider:
+    """Thin client for the Mathpix ``v3/text`` image-recognition endpoint."""
+
+    def __init__(self, app_id: str, app_key: str, timeout: float = 30.0):
+        """Store the API credentials and the per-request timeout in seconds."""
+        self.app_id = app_id
+        self.app_key = app_key
+        self.api_url = "https://api.mathpix.com/v3/text"
+        # Without a timeout, requests waits forever on a stalled connection.
+        self.timeout = timeout
+
+    def _encode_image(self, image: Image.Image) -> str:
+        """Convert PIL Image to base64 string"""
+        buffered = io.BytesIO()
+        image.save(buffered, format="PNG")
+        return base64.b64encode(buffered.getvalue()).decode()
+
+    def process_equation(self, image: Image.Image, options: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
+        """
+        Process an equation image using Mathpix API
+
+        Args:
+            image: PIL Image containing the equation
+            options: Extra top-level request fields; they override the defaults
+                built here when keys collide.
+
+        Returns:
+            Dict decoded from the JSON response body
+
+        Raises:
+            requests.HTTPError: on a 4xx/5xx response status
+            requests.Timeout: when no response arrives within ``self.timeout``
+        """
+        headers = {
+            "app_id": self.app_id,
+            "app_key": self.app_key,
+            "Content-Type": "application/json",
+        }
+        data = {
+            "src": f"data:image/png;base64,{self._encode_image(image)}",
+            "formats": ["text", "latex_styled"],
+            "data_options": {
+                "include_asciimath": True,
+                "include_latex": True,
+            },
+            **(options or {}),
+        }
+        response = requests.post(
+            self.api_url, headers=headers, json=data, timeout=self.timeout
+        )
+        response.raise_for_status()
+        return response.json()
\ No newline at end of file
diff --git a/marker/settings.py b/marker/settings.py
index 5660ada90..d5d606150 100644
--- a/marker/settings.py
+++ b/marker/settings.py
@@ -1,11 +1,13 @@
-from typing import Optional
+from typing import Optional, Literal
 
-from dotenv import find_dotenv
+from dotenv import find_dotenv, load_dotenv
 from pydantic import computed_field
 from pydantic_settings import BaseSettings
 import torch
 import os
 
+# Load environment variables from .env file
+load_dotenv(find_dotenv(".env"))
 
 class Settings(BaseSettings):
     # Paths
@@ -30,6 +32,10 @@ class Settings(BaseSettings):
         None  # Note: MPS device does not work for text detection, and will default to CPU
     )
 
+    # Equation processing settings
+    MATHPIX_APP_ID: str = os.getenv("MATHPIX_APP_ID", "")
+    MATHPIX_APP_KEY: str = os.getenv("MATHPIX_APP_KEY", "")
+
     @computed_field
     @property
     def TORCH_DEVICE_MODEL(self) -> str:
diff --git a/poetry.lock b/poetry.lock
index 90b66ece0..29216101b 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 2.1.1 and should not be changed by hand.
+# This file is automatically @generated by Poetry 2.1.3 and should not be changed by hand.
 
 [[package]]
 name = "aiohappyeyeballs"
@@ -3366,10 +3366,10 @@ files = [
 
 [package.dependencies]
 numpy = [
-    {version = ">=1.21.4", markers = "python_version >= \"3.10\" and platform_system == \"Darwin\""},
-    {version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version >= \"3.10\""},
-    {version = ">=1.23.5", markers = "python_version >= \"3.11\""},
     {version = ">=1.26.0", markers = "python_version >= \"3.12\""},
+    {version = ">=1.23.5", markers = "python_version == \"3.11\""},
+    {version = ">=1.21.4", markers = "python_version == \"3.10\" and platform_system == \"Darwin\""},
+    {version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version == \"3.10\""},
 ]
 
 [[package]]
@@ -3466,9 +3466,9 @@ files = [
 
 [package.dependencies]
 numpy = [
-    {version = ">=1.22.4", markers = "python_version < \"3.11\""},
-    {version = ">=1.23.2", markers = "python_version == \"3.11\""},
     {version = ">=1.26.0", markers = "python_version >= \"3.12\""},
+    {version = ">=1.23.2", markers = "python_version == \"3.11\""},
+    {version = ">=1.22.4", markers = "python_version < \"3.11\""},
 ]
 python-dateutil = ">=2.8.2"
 pytz = ">=2020.1"
@@ -3532,18 +3532,20 @@ name = "pdftext"
 version = "0.6.2"
 description = "Extract structured text from pdfs quickly"
 optional = false
-python-versions = "<4.0,>=3.10"
+python-versions = "^3.10"
 groups = ["main"]
-files = [
-    {file = "pdftext-0.6.2-py3-none-any.whl", hash = "sha256:905d11e62d548e307933c25865a69c8e993947bb5b40b1535b0a2aa8f07a71d4"},
-    {file = "pdftext-0.6.2.tar.gz", hash = "sha256:ff5b92462ac03ae63a23429384ae123d45c162dcda30e7bf2c5c92a6b208c9de"},
-]
+files = []
+develop = true
 
 [package.dependencies]
-click = ">=8.1.8,<9.0.0"
-pydantic = ">=2.7.1,<3.0.0"
-pydantic-settings = ">=2.2.1,<3.0.0"
-pypdfium2 = "4.30.0"
+click = "^8.1.8"
+pydantic = "^2.7.1"
+pydantic-settings = "^2.2.1"
+pypdfium2 = "=4.30.1"
+
+[package.source]
+type = "directory"
+url = "../pdftext"
 
 [[package]]
 name = "pexpect"
@@ -4264,25 +4266,25 @@ windows-terminal = ["colorama (>=0.4.6)"]
 
 [[package]]
 name = "pypdfium2"
-version = "4.30.0"
+version = "4.30.1"
 description = "Python bindings to PDFium"
 optional = false
 python-versions = ">=3.6"
 groups = ["main"]
 files = [
-    {file = "pypdfium2-4.30.0-py3-none-macosx_10_13_x86_64.whl", hash = "sha256:b33ceded0b6ff5b2b93bc1fe0ad4b71aa6b7e7bd5875f1ca0cdfb6ba6ac01aab"},
-    {file = "pypdfium2-4.30.0-py3-none-macosx_11_0_arm64.whl", hash = "sha256:4e55689f4b06e2d2406203e771f78789bd4f190731b5d57383d05cf611d829de"},
-    {file = "pypdfium2-4.30.0-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4e6e50f5ce7f65a40a33d7c9edc39f23140c57e37144c2d6d9e9262a2a854854"},
-    {file = "pypdfium2-4.30.0-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:3d0dd3ecaffd0b6dbda3da663220e705cb563918249bda26058c6036752ba3a2"},
-    {file = "pypdfium2-4.30.0-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:cc3bf29b0db8c76cdfaac1ec1cde8edf211a7de7390fbf8934ad2aa9b4d6dfad"},
-    {file = "pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f1f78d2189e0ddf9ac2b7a9b9bd4f0c66f54d1389ff6c17e9fd9dc034d06eb3f"},
-    {file = "pypdfium2-4.30.0-py3-none-musllinux_1_1_aarch64.whl", hash = "sha256:5eda3641a2da7a7a0b2f4dbd71d706401a656fea521b6b6faa0675b15d31a163"},
-    {file = "pypdfium2-4.30.0-py3-none-musllinux_1_1_i686.whl", hash = "sha256:0dfa61421b5eb68e1188b0b2231e7ba35735aef2d867d86e48ee6cab6975195e"},
-    {file = "pypdfium2-4.30.0-py3-none-musllinux_1_1_x86_64.whl", hash = "sha256:f33bd79e7a09d5f7acca3b0b69ff6c8a488869a7fab48fdf400fec6e20b9c8be"},
-    {file = "pypdfium2-4.30.0-py3-none-win32.whl", hash = "sha256:ee2410f15d576d976c2ab2558c93d392a25fb9f6635e8dd0a8a3a5241b275e0e"},
-    {file = "pypdfium2-4.30.0-py3-none-win_amd64.whl", hash = "sha256:90dbb2ac07be53219f56be09961eb95cf2473f834d01a42d901d13ccfad64b4c"},
-    {file = "pypdfium2-4.30.0-py3-none-win_arm64.whl", hash = "sha256:119b2969a6d6b1e8d55e99caaf05290294f2d0fe49c12a3f17102d01c441bd29"},
-    {file = "pypdfium2-4.30.0.tar.gz", hash = "sha256:48b5b7e5566665bc1015b9d69c1ebabe21f6aee468b509531c3c8318eeee2e16"},
+    {file = "pypdfium2-4.30.1-py3-none-macosx_10_13_x86_64.whl", hash = "sha256:e07c47633732cc18d890bb7e965ad28a9c5a932e548acb928596f86be2e5ae37"},
+    {file = "pypdfium2-4.30.1-py3-none-macosx_11_0_arm64.whl", hash = "sha256:5ea2d44e96d361123b67b00f527017aa9c847c871b5714e013c01c3eb36a79fe"},
+    {file = "pypdfium2-4.30.1-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1de7a3a36803171b3f66911131046d65a732f9e7834438191cb58235e6163c4e"},
+    {file = "pypdfium2-4.30.1-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:b8a4231efb13170354f568c722d6540b8d5b476b08825586d48ef70c40d16e03"},
+    {file = "pypdfium2-4.30.1-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6f434a4934e8244aa95343ffcf24e9ad9f120dbb4785f631bb40a88c39292493"},
+    {file = "pypdfium2-4.30.1-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f454032a0bc7681900170f67d8711b3942824531e765f91c2f5ce7937f999794"},
+    {file = "pypdfium2-4.30.1-py3-none-musllinux_1_1_aarch64.whl", hash = "sha256:bbf9130a72370ee9d602e39949b902db669a2a1c24746a91e5586eb829055d9f"},
+    {file = "pypdfium2-4.30.1-py3-none-musllinux_1_1_i686.whl", hash = "sha256:5cb52884b1583b96e94fd78542c63bb42e06df5e8f9e52f8f31f5ad5a1e53367"},
+    {file = "pypdfium2-4.30.1-py3-none-musllinux_1_1_x86_64.whl", hash = "sha256:1a9e372bd4867ff223cc8c338e33fe11055dad12f22885950fc27646cc8d9122"},
+    {file = "pypdfium2-4.30.1-py3-none-win32.whl", hash = "sha256:421f1cf205e213e07c1f2934905779547f4f4a2ff2f59dde29da3d511d3fc806"},
+    {file = "pypdfium2-4.30.1-py3-none-win_amd64.whl", hash = "sha256:598a7f20264ab5113853cba6d86c4566e4356cad037d7d1f849c8c9021007e05"},
+    {file = "pypdfium2-4.30.1-py3-none-win_arm64.whl", hash = "sha256:c2b6d63f6d425d9416c08d2511822b54b8e3ac38e639fc41164b1d75584b3a8c"},
+    {file = "pypdfium2-4.30.1.tar.gz", hash = "sha256:5f5c7c6d03598e107d974f66b220a49436aceb191da34cda5f692be098a814ce"},
 ]
 
 [[package]]
@@ -5390,30 +5392,32 @@ snowflake = ["snowflake-connector-python (>=3.3.0) ; python_version < \"3.12\"",
 
 [[package]]
 name = "surya-ocr"
-version = "0.14.2"
+version = "0.14.5"
 description = "OCR, layout, reading order, and table recognition in 90+ languages"
 optional = false
-python-versions = "<4.0,>=3.10"
+python-versions = "^3.10"
 groups = ["main"]
-files = [
-    {file = "surya_ocr-0.14.2-py3-none-any.whl", hash = "sha256:0c402705c860f8bf98fc2bf2a3b49d7f0e16fba587aed6d3f01bb53bb776d283"},
-    {file = "surya_ocr-0.14.2.tar.gz", hash = "sha256:852af681073167beba9a638658c70b81318f1a8f3d558db68dead1b2c391e862"},
-]
+files = []
+develop = true
 
 [package.dependencies]
-click = ">=8.1.8,<9.0.0"
-einops = ">=0.8.1,<0.9.0"
-filetype = ">=1.2.0,<2.0.0"
-opencv-python-headless = ">=4.11.0.86,<5.0.0.0"
-pillow = ">=10.2.0,<11.0.0"
-platformdirs = ">=4.3.6,<5.0.0"
-pre-commit = ">=4.2.0,<5.0.0"
-pydantic = ">=2.5.3,<3.0.0"
-pydantic-settings = ">=2.1.0,<3.0.0"
-pypdfium2 = "4.30.0"
-python-dotenv = ">=1.0.0,<2.0.0"
-torch = ">=2.7.0,<3.0.0"
-transformers = ">=4.51.2,<5.0.0"
+click = "^8.1.8"
+einops = "^0.8.1"
+filetype = "^1.2.0"
+opencv-python-headless = "^4.11.0.86"
+pillow = "^10.2.0"
+platformdirs = "^4.3.6"
+pre-commit = "^4.2.0"
+pydantic = "^2.5.3"
+pydantic-settings = "^2.1.0"
+pypdfium2 = "=4.30.1"
+python-dotenv = "^1.0.0"
+torch = "^2.7.0"
+transformers = "^4.51.2"
+
+[package.source]
+type = "directory"
+url = "../surya"
 
 [[package]]
 name = "sympy"
@@ -6505,4 +6509,4 @@ full = ["ebooklib", "mammoth", "openpyxl", "python-pptx", "weasyprint"]
 [metadata]
 lock-version = "2.1"
 python-versions = "^3.10"
-content-hash = "c18debb8d18aec4081c31ff32f9dc2bde6f4c0189f1d7647bb6061f685e0e319"
+content-hash = "484459202f1148269601972c07e461c46f51f424919485090b707b350ce7fd74"
diff --git a/pyproject.toml b/pyproject.toml
index 2056da009..a5f82aea4 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -26,9 +26,9 @@ torch = "^2.7.0"
 tqdm = "^4.66.1"
 ftfy = "^6.1.1"
 rapidfuzz = "^3.8.1"
-surya-ocr = "^0.14.2"
+surya-ocr = {path = "../surya", develop = true}
 regex = "^2024.4.28"
-pdftext = "~0.6.2"
+pdftext = {path = "../pdftext", develop = true}
 markdownify = "^0.13.1"
 click = "^8.2.0"
 markdown2 = "^2.5.2"
@@ -75,4 +75,4 @@ marker_server = "marker.scripts.server:server_cli"
 
 [build-system]
 requires = ["poetry-core"]
-build-backend = "poetry.core.masonry.api"
\ No newline at end of file
+build-backend = "poetry.core.masonry.api"
diff --git a/test_clipping.py b/test_clipping.py
new file mode 100644
index 000000000..7fc9d65a4
--- /dev/null
+++ b/test_clipping.py
@@ -0,0 +1,110 @@
+import os
+import argparse
+import pypdfium2 as pdfium
+from PIL import Image, ImageDraw
+import pypdfium2.raw as pdfium_c
+import ctypes
+
+def get_text_from_raw_text_obj(raw_text_obj, page):
+    """Return the text of a raw PDFium text object on ``page``, or "" if none."""
+    text_page = pdfium_c.FPDFText_LoadPage(page.raw)
+    if not text_page:
+        return ""
+    try:
+        size = pdfium_c.FPDFTextObj_GetText(raw_text_obj, text_page, None, 0)  # sizing call, no buffer
+        if size <= 0:
+            return ""
+        buffer = (ctypes.c_ushort * size)()  # NOTE(review): size may be in bytes, so this could over-allocate
+        pdfium_c.FPDFTextObj_GetText(raw_text_obj, text_page, buffer, size)
+        return bytes(buffer).decode('utf-16-le').rstrip('\x00')
+    finally:
+        pdfium_c.FPDFText_ClosePage(text_page)
+
+def draw_text_objects(page, clip_region, output_path, target_text="30075"):
+    """Draw bounds (red) and clip path (blue) of text objects equal to ``target_text``.
+
+    Boxes go on a blank white canvas saved as ``output_path`` with ``.png``
+    replaced by ``_boxes.png``. ``clip_region`` is accepted but never used.
+    """
+    objects = list(page.get_objects())
+    scale = 2.0
+    bitmap = page.render(scale=scale)
+    pil_image = bitmap.to_pil()
+    page_width, page_height = pil_image.size
+    boxes_image = Image.new('RGB', (page_width, page_height), 'white')
+    boxes_draw = ImageDraw.Draw(boxes_image)
+    found = False
+    for i, obj in enumerate(objects):
+        if obj.type == 1:  # 1 = text object
+            text = get_text_from_raw_text_obj(obj.raw, page)
+            if text and text.strip() == target_text:
+                found = True
+                print(f"Found text '{target_text}' in object {i+1}")
+                left = ctypes.c_float()
+                bottom = ctypes.c_float()
+                right = ctypes.c_float()
+                top = ctypes.c_float()
+                success = pdfium_c.FPDFPageObj_GetBounds(obj.raw, ctypes.byref(left), ctypes.byref(bottom), ctypes.byref(right), ctypes.byref(top))
+                if success:
+                    print(f"Object bounding box: left={left.value}, bottom={bottom.value}, right={right.value}, top={top.value}")
+                    # Flip y: PDF bounds are bottom-up, image rows are top-down.
+                    pil_top = page_height - (top.value * scale)
+                    pil_bottom = page_height - (bottom.value * scale)
+                    pil_left = left.value * scale
+                    pil_right = right.value * scale
+                    boxes_draw.rectangle([pil_left, pil_top, pil_right, pil_bottom], outline='red', width=3)
+                else:
+                    print("Could not get object bounding box.")
+                try:
+                    clip_path = pdfium_c.FPDFPageObj_GetClipPath(obj.raw)
+                    if clip_path:
+                        if all(hasattr(pdfium_c, fn) for fn in [
+                            "FPDFClipPath_CountPaths", "FPDFClipPath_CountPathSegments", "FPDFClipPath_GetPathSegment", "FPDFPathSegment_GetPoint", "FPDFPathSegment_GetType", "FPDFPathSegment_GetClose"]):
+                            num_paths = pdfium_c.FPDFClipPath_CountPaths(clip_path)
+                            for path_idx in range(num_paths):
+                                num_segs = pdfium_c.FPDFClipPath_CountPathSegments(clip_path, path_idx)
+                                points = []
+                                for seg_idx in range(num_segs):
+                                    seg = pdfium_c.FPDFClipPath_GetPathSegment(clip_path, path_idx, seg_idx)
+                                    x = ctypes.c_float()
+                                    y = ctypes.c_float()
+                                    pdfium_c.FPDFPathSegment_GetPoint(seg, ctypes.byref(x), ctypes.byref(y))
+                                    pil_x = x.value * scale
+                                    pil_y = page_height - (y.value * scale)
+                                    points.append((pil_x, pil_y))
+                                    print(f"Object {i+1}, path {path_idx}, seg {seg_idx}: PDF ({x.value}, {y.value}) -> PIL ({pil_x}, {pil_y})")
+                                closed = False
+                                if num_segs > 0:
+                                    last_seg = pdfium_c.FPDFClipPath_GetPathSegment(clip_path, path_idx, num_segs-1)
+                                    closed = bool(pdfium_c.FPDFPathSegment_GetClose(last_seg))
+                                if len(points) > 1:
+                                    if closed:
+                                        boxes_draw.polygon(points, outline='blue')
+                                    else:
+                                        boxes_draw.line(points, fill='blue', width=3)
+                        else:
+                            print("Clipping path exists, but path segment functions are not available in this pypdfium2 version.")
+                    else:
+                        print("No clipping path for this object.")
+                except Exception as e:
+                    print(f"No clipping path or error: {e}")
+    if not found:
+        print(f"No text object with text '{target_text}' found.")
+    boxes_output_path = output_path.replace('.png', '_boxes.png')
+    boxes_image.save(boxes_output_path)
+    print(f"Boxes-only visualization saved to {boxes_output_path}")
+
+def main():
+    """Parse the command line and visualize one page into ``test_output/``."""
+    cli = argparse.ArgumentParser(description='Visualize PDF text objects with clipping')
+    cli.add_argument('pdf_path', help='Path to the PDF file')
+    cli.add_argument('--page', type=int, default=0, help='Page number (0-based)')
+    opts = cli.parse_args()
+    out_dir = "test_output"
+    os.makedirs(out_dir, exist_ok=True)
+    document = pdfium.PdfDocument(opts.pdf_path)
+    target = os.path.join(out_dir, f"page_{opts.page}.png")
+    draw_text_objects(document[opts.page], None, target)
+
+if __name__ == '__main__':
+    main()
diff --git a/visualize_matching_boxes.py b/visualize_matching_boxes.py
new file mode 100644
index 000000000..2062fb96d
--- /dev/null
+++ b/visualize_matching_boxes.py
@@ -0,0 +1,207 @@
+import argparse
+import pypdfium2 as pdfium
+import pypdfium2.raw as pdfium_c
+import ctypes
+from PIL import Image, ImageDraw
+
+def get_text_from_raw_text_obj(raw_text_obj, page):
+    """Extract the text of one raw PDFium text object; "" when unavailable."""
+    if not (textpage := pdfium_c.FPDFText_LoadPage(page.raw)):
+        return ""
+    try:
+        needed = pdfium_c.FPDFTextObj_GetText(raw_text_obj, textpage, None, 0)
+        if needed <= 0:
+            return ""
+        storage = (ctypes.c_ushort * needed)()
+        pdfium_c.FPDFTextObj_GetText(raw_text_obj, textpage, storage, needed)
+        return bytes(storage).decode('utf-16-le').rstrip('\x00')
+    finally:
+        pdfium_c.FPDFText_ClosePage(textpage)
+
+def boxes_intersect(box1, box2):
+    """Report whether two axis-aligned boxes overlap.
+
+    Touching edges or corners do not count (test from fz_glyph_entirely_outside_box).
+
+    Args:
+        box1, box2: Tuples of (x0, y0, x1, y1) where (x0,y0) is bottom-left, (x1,y1) is top-right
+
+    Returns:
+        True if boxes intersect, False if they are entirely separate
+    """
+    separated = (
+        box1[2] <= box2[0] or box1[3] <= box2[1]
+        or box1[0] >= box2[2] or box1[1] >= box2[3]
+    )
+    return not separated
+
+def draw_box(box, boxes_draw, scale, page_height, color='gray', width=1):
+    """Outline a PDF-space box on the rendered page image.
+
+    Args:
+        box: Tuple of (x0, y0, x1, y1) coordinates in PDF space
+        boxes_draw: ImageDraw object to draw on
+        scale: Scale factor for coordinate conversion
+        page_height: Height of the page in pixels
+        color: Color of the outline
+        width: Width of the outline
+    """
+    x0, y0, x1, y1 = box[0], box[1], box[2], box[3]
+    # Image rows count down from the top, so the y values are flipped.
+    upper_left = (x0 * scale, page_height - (y1 * scale))
+    lower_right = (x1 * scale, page_height - (y0 * scale))
+    boxes_draw.rectangle([*upper_left, *lower_right], outline=color, width=width)
+
+def draw_matching_boxes(page, output_path):
+    """Render ``page`` and outline its text objects according to their clip paths.
+
+    Colors: green = page crop box, blue = bounding rectangle of an object's
+    clip path, red = object whose bounds intersect that rectangle, gray =
+    object without a usable clip path. Objects whose bounds lie entirely
+    outside their clip rectangle are not drawn and are counted as clipped.
+    The annotated image is saved to ``output_path``; statistics are printed.
+
+    Raises:
+        RuntimeError: if the PDFium bindings lack the clip path functions.
+    """
+    objects = list(page.get_objects())
+    scale = 2.0
+    bitmap = page.render(scale=scale)
+    pil_image = bitmap.to_pil()
+    page_height = pil_image.size[1]
+    boxes_draw = ImageDraw.Draw(pil_image)
+
+    # Check for required clip path functions once
+    required_clip_fns = [
+        "FPDFClipPath_CountPaths", "FPDFClipPath_CountPathSegments",
+        "FPDFClipPath_GetPathSegment", "FPDFPathSegment_GetPoint",
+        "FPDFPathSegment_GetType", "FPDFPathSegment_GetClose"
+    ]
+    has_clip_path_api = all(hasattr(pdfium_c, fn) for fn in required_clip_fns)
+    if not has_clip_path_api:
+        raise RuntimeError("Required PDFium clip path API functions are missing in pdfium_c. Please check your PDFium installation.")
+
+    # Draw the page crop box in green
+    try:
+        left = ctypes.c_float()
+        bottom = ctypes.c_float()
+        right = ctypes.c_float()
+        top = ctypes.c_float()
+        success = pdfium_c.FPDFPage_GetCropBox(page.raw, ctypes.byref(left), ctypes.byref(bottom), ctypes.byref(right), ctypes.byref(top))
+        if success:
+            crop_box = (left.value, bottom.value, right.value, top.value)
+            print(f"Page crop box: ({crop_box[0]:.2f}, {crop_box[1]:.2f}, {crop_box[2]:.2f}, {crop_box[3]:.2f})")
+            draw_box(crop_box, boxes_draw, scale, page_height, color='green', width=2)
+        else:
+            print("Could not get page crop box")
+    except Exception as e:
+        print(f"Error getting page crop box: {e}")
+
+    # Initialize counters for statistics
+    total_objects = len(objects)
+    text_objects = 0
+    visible_objects = 0
+    clipped_objects = 0
+
+    # Load textpage once for efficiency
+    textpage = pdfium_c.FPDFText_LoadPage(page.raw)
+    if not textpage:
+        print("Warning: Could not load textpage for text checking")
+
+    for i, obj in enumerate(objects):
+        # Check if object is a text object and has text content
+        obj_type = pdfium_c.FPDFPageObj_GetType(obj.raw)
+        if obj_type != 1:  # FPDF_PAGEOBJ_TEXT = 1
+            continue
+        # Check if text object has any text content (without extracting it)
+        if textpage:
+            buflen = pdfium_c.FPDFTextObj_GetText(obj.raw, textpage, None, 0)
+            if buflen <= 0:  # No text content
+                continue
+        text_objects += 1
+
+        # Get object bounding box
+        left = ctypes.c_float()
+        bottom = ctypes.c_float()
+        right = ctypes.c_float()
+        top = ctypes.c_float()
+        success = pdfium_c.FPDFPageObj_GetBounds(obj.raw, ctypes.byref(left), ctypes.byref(bottom), ctypes.byref(right), ctypes.byref(top))
+        if not success:
+            print(f"Object {i+1}: Could not get object bounding box.")
+            continue
+        obj_box = (left.value, bottom.value, right.value, top.value)
+        # Default outline: gray, used when no usable clip path is found.
+        show_box, box_color, box_width = True, 'gray', 1
+        try:
+            clip_path = pdfium_c.FPDFPageObj_GetClipPath(obj.raw)
+            if clip_path:
+                # Collect all points from all paths to calculate a bounding rectangle
+                all_points = []
+                num_paths = pdfium_c.FPDFClipPath_CountPaths(clip_path)
+                for path_idx in range(num_paths):
+                    num_segs = pdfium_c.FPDFClipPath_CountPathSegments(clip_path, path_idx)
+                    for seg_idx in range(num_segs):
+                        seg = pdfium_c.FPDFClipPath_GetPathSegment(clip_path, path_idx, seg_idx)
+                        x = ctypes.c_float()
+                        y = ctypes.c_float()
+                        pdfium_c.FPDFPathSegment_GetPoint(seg, ctypes.byref(x), ctypes.byref(y))
+                        all_points.append((x.value, y.value))
+                if all_points:
+                    # Calculate the minimal bounding rectangle that fits the clip path
+                    xs = [pt[0] for pt in all_points]
+                    ys = [pt[1] for pt in all_points]
+                    clip_box = (min(xs), min(ys), max(xs), max(ys))
+                    if not boxes_intersect(obj_box, clip_box):
+                        # Entirely outside its clip rectangle: hide the object
+                        show_box = False
+                        clipped_objects += 1
+                    else:
+                        # Draw the clip path bounding box in blue
+                        draw_box(clip_box, boxes_draw, scale, page_height, color='blue')
+                        # Outlined in red and counted exactly once, after this try block.
+                        box_color, box_width = 'red', 3
+                        # Extract and print text for red boxes
+                        if textpage:
+                            buflen = pdfium_c.FPDFTextObj_GetText(obj.raw, textpage, None, 0)
+                            if buflen > 0:
+                                buf = (ctypes.c_ushort * buflen)()
+                                pdfium_c.FPDFTextObj_GetText(obj.raw, textpage, buf, buflen)
+                                byte_buf = bytearray(buf)
+                                text_content = byte_buf.decode('utf-16-le').rstrip('\x00')
+                                utf16_bytes = text_content.encode('utf-16-le')
+                                hex_bytes = ' '.join(f'{b:02x}' for b in utf16_bytes)
+                                print(f"Object {i+1} text (red box): '{text_content}', utf16 bytes: {hex_bytes}")
+            else:
+                print(f"Object {i+1}: No clipping path.")
+        except Exception as e:
+            print(f"Object {i+1}: Error getting clip path: {e}")
+        if show_box:
+            draw_box(obj_box, boxes_draw, scale, page_height, color=box_color, width=box_width)
+            visible_objects += 1
+
+    pil_image.save(output_path)
+    print(f"Matching boxes visualization saved to {output_path}")
+
+    # Close textpage to avoid memory leaks
+    if textpage:
+        pdfium_c.FPDFText_ClosePage(textpage)
+
+    # Print statistics
+    print(f"\nStatistics:")
+    print(f"Total objects: {total_objects}")
+    print(f"Text objects: {text_objects}")
+    print(f"Visible objects: {visible_objects}")
+    print(f"Clipped objects: {clipped_objects}")
+
+def main():
+    """Command-line entry point: annotate one page of a PDF and save the image."""
+    cli = argparse.ArgumentParser(description='Visualize matching bounding boxes and clip paths for all objects')
+    cli.add_argument('pdf_path', help='Path to the PDF file')
+    cli.add_argument('--page', type=int, default=0, help='Page number (0-based)')
+    cli.add_argument('--output', type=str, default='matching_boxes.png', help='Output image path')
+    opts = cli.parse_args()
+    document = pdfium.PdfDocument(opts.pdf_path)
+    draw_matching_boxes(document[opts.page], opts.output)
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file