netglade · Elisu · May 27, 2025 · May 29, 2025 · May 29, 2025 · May 29, 2025
diff --git a/marker/config/parser.py b/marker/config/parser.py
@@ -60,6 +60,12 @@ def common_options(fn):
             default=False,
             help="Disable image extraction.",
         )(fn)
+        fn = click.option(
+            "--mathpix",
+            is_flag=True,
+            default=False,
+            help="Use Mathpix for equation processing.",
+        )(fn)
 
         # these are options that need a list transformation, i.e splitting/parsing a string
         fn = click.option(
@@ -106,6 +112,8 @@ def generate_config_dict(self) -> Dict[str, any]:
                     config["pdftext_workers"] = 1
                 case "disable_image_extraction":
                     config["extract_images"] = False
+                case "mathpix":
+                    config["use_mathpix"] = v
                 case _:
                     if k in crawler.attr_set:
                         config[k] = v

diff --git a/marker/converters/__init__.py b/marker/converters/__init__.py
@@ -44,7 +44,7 @@ def initialize_processors(self, processor_cls_lst: List[Type[BaseProcessor]]) ->
         processors = []
         for processor_cls in processor_cls_lst:
             processors.append(self.resolve_dependencies(processor_cls))
-
+            
         simple_llm_processors = [p for p in processors if issubclass(type(p), BaseLLMSimpleBlockProcessor)]
         other_processors = [p for p in processors if not issubclass(type(p), BaseLLMSimpleBlockProcessor)]
 

diff --git a/marker/converters/ocr.py b/marker/converters/ocr.py
@@ -8,6 +8,8 @@
 from marker.processors.equation import EquationProcessor
 from marker.providers.registry import provider_from_filepath
 from marker.renderers.ocr_json import OCRJSONRenderer
+from marker.providers.mathpix import MathpixProvider
+from marker.settings import settings
 
 
 class OCRConverter(PdfConverter):
@@ -21,6 +23,12 @@ def __init__(self, *args, **kwargs):
 
         self.config["format_lines"] = True
         self.renderer = OCRJSONRenderer
+
+        # Initialize Mathpix provider
+        self.mathpix_provider = MathpixProvider(
+            app_id=settings.MATHPIX_APP_ID,
+            app_key=settings.MATHPIX_APP_KEY
+        )
 
     def build_document(self, filepath: str):
         provider_cls = provider_from_filepath(filepath)
@@ -32,6 +40,7 @@ def build_document(self, filepath: str):
         provider = provider_cls(filepath, self.config)
         document = document_builder(provider, layout_builder, line_builder, ocr_builder)
 
+        # Initialize processors
         for processor in self.processor_list:
             processor(document)
 

diff --git a/marker/processors/equation.py b/marker/processors/equation.py
@@ -4,21 +4,19 @@
 from bs4 import BeautifulSoup
 
 from ftfy import fix_text, TextFixerConfig
-from surya.recognition import RecognitionPredictor, OCRResult
-
 from marker.processors import BaseProcessor
 from marker.schema import BlockTypes
 from marker.schema.document import Document
 from marker.settings import settings
+from marker.providers.mathpix import MathpixProvider
+from surya.recognition import RecognitionPredictor, OCRResult
 
 MATH_TAG_PATTERN = re.compile(r"<math[^>]*>(.*?)</math>")
 
-
 class EquationProcessor(BaseProcessor):
     """
     A processor for recognizing equations in the document.
     """
-
     block_types: Annotated[
         Tuple[BlockTypes],
         "The block types to process.",
@@ -36,12 +34,25 @@ class EquationProcessor(BaseProcessor):
         bool,
         "Whether to disable the tqdm progress bar.",
     ] = False
+    use_mathpix: Annotated[
+        bool,
+        "Whether to use Mathpix for equation processing.",
+    ] = False
 
     def __init__(self, recognition_model: RecognitionPredictor, config=None):
+        """Store the recognition model; set up Mathpix when ``use_mathpix`` is set."""
         super().__init__(config)
-
         self.recognition_model = recognition_model
 
+        # Plain truthiness, matching the `if self.use_mathpix:` dispatch in __call__.
+        if self.use_mathpix:
+            app_id, app_key = settings.MATHPIX_APP_ID, settings.MATHPIX_APP_KEY
+            if not app_id or not app_key:
+                raise ValueError("Mathpix API credentials not configured")
+            self.mathpix_provider = MathpixProvider(app_id=app_id, app_key=app_key)
+            # Mathpix mode also handles inline math, so widen the processed block types.
+            self.block_types = (BlockTypes.Equation, BlockTypes.TextInlineMath)
+
     def get_batch_size(self):
         # Set to 1/4th of OCR batch size due to sequence length with tiling
         if self.equation_batch_size is not None:
@@ -80,6 +91,35 @@ def __call__(self, document: Document):
         if total_equation_blocks == 0:
             return
 
+        if self.use_mathpix:
+            self._process_with_mathpix(images, equation_boxes, equation_block_ids, document)
+        else:
+            self._process_with_recognition(images, equation_boxes, equation_block_ids, document)
+
+    def _process_with_mathpix(self, images, equation_boxes, equation_block_ids, document):
+        """Fill each equation block's html with LaTeX recognised by Mathpix.
+
+        The three list arguments are walked in parallel, one entry per page;
+        every box is cropped from its page image and sent as its own request.
+        Failures are printed and skipped, leaving that block untouched.
+        """
+        pages = zip(images, equation_boxes, equation_block_ids)
+        for page_image, page_boxes, page_block_ids in pages:
+            for box, block_id in zip(page_boxes, page_block_ids):
+                left, top, right, bottom = (int(coord) for coord in box)
+                crop = page_image.crop((left, top, right, bottom))
+                try:
+                    response = self.mathpix_provider.process_equation(crop)
+                    latex = response.get('latex_styled', '')
+                    if not latex:
+                        continue
+                    # NOTE(review): always display="block", even for inline math - confirm.
+                    target = document.get_block(block_id)
+                    target.html = self.fix_latex(f'<math display="block">{latex}</math>')
+                except Exception as e:
+                    print(f"Error processing equation {block_id}: {str(e)}")
+
+    def _process_with_recognition(self, images, equation_boxes, equation_block_ids, document):
         predictions = self.get_latex_batched(images, equation_boxes)
         for page_predictions, page_equation_block_ids in zip(
             predictions, equation_block_ids
@@ -138,3 +178,5 @@ def get_latex_batched(
         ]
 
         return equation_predictions
+
+
diff --git a/marker/providers/mathpix.py b/marker/providers/mathpix.py
@@ -0,0 +1,58 @@
+from typing import Optional, Dict, Any
+import requests
+from PIL import Image
+import io
+import base64
+
+class MathpixProvider:
+    """Thin client for the Mathpix ``v3/text`` OCR endpoint."""
+
+    def __init__(self, app_id: str, app_key: str, timeout: float = 30.0):
+        self.app_id = app_id
+        self.app_key = app_key
+        self.api_url = "https://api.mathpix.com/v3/text"
+        # Seconds to wait for the API; requests never times out by default.
+        self.timeout = timeout
+
+    def _encode_image(self, image: Image.Image) -> str:
+        """Return the image serialized as PNG and base64-encoded."""
+        buffered = io.BytesIO()
+        image.save(buffered, format="PNG")
+        return base64.b64encode(buffered.getvalue()).decode()
+
+    def process_equation(self, image: Image.Image, options: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
+        """
+        Send an equation image to the Mathpix API and return the parsed JSON reply.
+
+        Args:
+            image: PIL Image containing the equation
+            options: Extra top-level request fields; they override the defaults
+
+        Returns:
+            Dict decoded from the JSON response body
+
+        Raises:
+            requests.HTTPError: if the API answers with a 4xx/5xx status
+            requests.Timeout: if the server does not respond within ``self.timeout`` seconds
+        """
+        headers = {
+            "app_id": self.app_id,
+            "app_key": self.app_key,
+            "Content-Type": "application/json",
+        }
+        image_data = self._encode_image(image)
+        data = {
+            "src": f"data:image/png;base64,{image_data}",
+            "formats": ["text", "latex_styled"],
+            "data_options": {
+                "include_asciimath": True,
+                "include_latex": True,
+            },
+            # Caller-supplied options come last so they win on key clashes.
+            **(options or {}),
+        }
+
+        # Bound the wait so a stalled connection cannot hang the conversion.
+        response = requests.post(self.api_url, headers=headers, json=data, timeout=self.timeout)
+        response.raise_for_status()
+        return response.json()
diff --git a/marker/settings.py b/marker/settings.py
@@ -1,11 +1,13 @@
-from typing import Optional
+from typing import Optional, Literal
 
-from dotenv import find_dotenv
+from dotenv import find_dotenv, load_dotenv
 from pydantic import computed_field
 from pydantic_settings import BaseSettings
 import torch
 import os
 
+# Load environment variables from .env file
+load_dotenv(find_dotenv(".env"))
 
 class Settings(BaseSettings):
     # Paths
@@ -30,6 +32,10 @@ class Settings(BaseSettings):
         None  # Note: MPS device does not work for text detection, and will default to CPU
     )
 
+    # Equation processing settings
+    MATHPIX_APP_ID: str = os.getenv("MATHPIX_APP_ID", "")
+    MATHPIX_APP_KEY: str = os.getenv("MATHPIX_APP_KEY", "")
+
     @computed_field
     @property
     def TORCH_DEVICE_MODEL(self) -> str: