Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions marker/config/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,12 @@ def common_options(fn):
default=False,
help="Disable image extraction.",
)(fn)
fn = click.option(
"--mathpix",
is_flag=True,
default=False,
help="Use Mathpix for equation processing.",
)(fn)

# these are options that need a list transformation, i.e. splitting/parsing a string
fn = click.option(
Expand Down Expand Up @@ -106,6 +112,8 @@ def generate_config_dict(self) -> Dict[str, any]:
config["pdftext_workers"] = 1
case "disable_image_extraction":
config["extract_images"] = False
case "mathpix":
config["use_mathpix"] = v
case _:
if k in crawler.attr_set:
config[k] = v
Expand Down
2 changes: 1 addition & 1 deletion marker/converters/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ def initialize_processors(self, processor_cls_lst: List[Type[BaseProcessor]]) ->
processors = []
for processor_cls in processor_cls_lst:
processors.append(self.resolve_dependencies(processor_cls))

simple_llm_processors = [p for p in processors if issubclass(type(p), BaseLLMSimpleBlockProcessor)]
other_processors = [p for p in processors if not issubclass(type(p), BaseLLMSimpleBlockProcessor)]

Expand Down
9 changes: 9 additions & 0 deletions marker/converters/ocr.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
from marker.processors.equation import EquationProcessor
from marker.providers.registry import provider_from_filepath
from marker.renderers.ocr_json import OCRJSONRenderer
from marker.providers.mathpix import MathpixProvider
from marker.settings import settings


class OCRConverter(PdfConverter):
Expand All @@ -21,6 +23,12 @@ def __init__(self, *args, **kwargs):

self.config["format_lines"] = True
self.renderer = OCRJSONRenderer

# Initialize Mathpix provider
self.mathpix_provider = MathpixProvider(
app_id=settings.MATHPIX_APP_ID,
app_key=settings.MATHPIX_APP_KEY
)

def build_document(self, filepath: str):
provider_cls = provider_from_filepath(filepath)
Expand All @@ -32,6 +40,7 @@ def build_document(self, filepath: str):
provider = provider_cls(filepath, self.config)
document = document_builder(provider, layout_builder, line_builder, ocr_builder)

# Run each processor over the document
for processor in self.processor_list:
processor(document)

Expand Down
52 changes: 47 additions & 5 deletions marker/processors/equation.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,21 +4,19 @@
from bs4 import BeautifulSoup

from ftfy import fix_text, TextFixerConfig
from surya.recognition import RecognitionPredictor, OCRResult

from marker.processors import BaseProcessor
from marker.schema import BlockTypes
from marker.schema.document import Document
from marker.settings import settings
from marker.providers.mathpix import MathpixProvider
from surya.recognition import RecognitionPredictor, OCRResult

MATH_TAG_PATTERN = re.compile(r"<math[^>]*>(.*?)</math>")


class EquationProcessor(BaseProcessor):
"""
A processor for recognizing equations in the document.
"""

block_types: Annotated[
Tuple[BlockTypes],
"The block types to process.",
Expand All @@ -36,12 +34,25 @@ class EquationProcessor(BaseProcessor):
bool,
"Whether to disable the tqdm progress bar.",
] = False
use_mathpix: Annotated[
bool,
"Whether to use Mathpix for equation processing.",
] = False

def __init__(self, recognition_model: RecognitionPredictor, config=None):
    """
    Store the recognition model and, when Mathpix is enabled, build its client.

    Args:
        recognition_model: Predictor kept on the instance as
            ``self.recognition_model``.
        config: Optional configuration forwarded to ``BaseProcessor.__init__``
            (presumably where ``use_mathpix`` gets overridden -- confirm
            against ``BaseProcessor``).

    Raises:
        ValueError: If ``use_mathpix`` is set but ``settings.MATHPIX_APP_ID``
            or ``settings.MATHPIX_APP_KEY`` is empty.
    """
    super().__init__(config)

    self.recognition_model = recognition_model

    # Plain truthiness, the same test __call__ uses to pick the Mathpix path.
    # With ``== True`` a truthy non-bool value would skip this setup yet still
    # be dispatched to Mathpix, failing on the missing ``mathpix_provider``.
    if self.use_mathpix:
        if not settings.MATHPIX_APP_ID or not settings.MATHPIX_APP_KEY:
            raise ValueError("Mathpix API credentials not configured")
        self.mathpix_provider = MathpixProvider(
            app_id=settings.MATHPIX_APP_ID,
            app_key=settings.MATHPIX_APP_KEY,
        )
        # Add TextInlineMath to block types when Mathpix is enabled
        self.block_types = (BlockTypes.Equation, BlockTypes.TextInlineMath)

def get_batch_size(self):
# Set to 1/4th of OCR batch size due to sequence length with tiling
if self.equation_batch_size is not None:
Expand Down Expand Up @@ -80,6 +91,35 @@ def __call__(self, document: Document):
if total_equation_blocks == 0:
return

if self.use_mathpix:
self._process_with_mathpix(images, equation_boxes, equation_block_ids, document)
else:
self._process_with_recognition(images, equation_boxes, equation_block_ids, document)

def _process_with_mathpix(self, images, equation_boxes, equation_block_ids, document):
    """
    Send each equation crop to Mathpix and write the returned LaTeX to its block.

    Args:
        images: One image per page; each must support ``crop((x1, y1, x2, y2))``.
        equation_boxes: Per-page sequences of 4-coordinate boxes.
        equation_block_ids: Per-page block ids, parallel to ``equation_boxes``.
        document: Document whose blocks are fetched with ``get_block``.

    Processing is best effort per equation: any exception raised while
    querying Mathpix or updating the block is logged and the loop moves on,
    leaving that block's ``html`` untouched.
    """
    # Local import keeps this method self-contained with respect to the
    # module's import block.
    import logging

    logger = logging.getLogger(__name__)

    for page_image, page_boxes, page_block_ids in zip(
        images, equation_boxes, equation_block_ids
    ):
        for box, block_id in zip(page_boxes, page_block_ids):
            # Crop the equation from the page; coordinates are truncated to ints.
            x1, y1, x2, y2 = (int(coord) for coord in box)
            equation_image = page_image.crop((x1, y1, x2, y2))

            try:
                result = self.mathpix_provider.process_equation(equation_image)

                # An empty or missing ``latex_styled`` leaves the block as-is.
                latex = result.get("latex_styled", "")
                if latex:
                    # NOTE(review): ``display="block"` is hard-coded, but
                    # __init__ also enables BlockTypes.TextInlineMath when
                    # Mathpix is on -- confirm inline blocks should be
                    # rewritten as display math.
                    block = document.get_block(block_id)
                    block.html = self.fix_latex(
                        f'<math display="block">{latex}</math>'
                    )
            except Exception as exc:
                # Deliberately broad: one failed equation must not abort the rest.
                logger.warning("Error processing equation %s: %s", block_id, exc)

def _process_with_recognition(self, images, equation_boxes, equation_block_ids, document):
predictions = self.get_latex_batched(images, equation_boxes)
for page_predictions, page_equation_block_ids in zip(
predictions, equation_block_ids
Expand Down Expand Up @@ -138,3 +178,5 @@ def get_latex_batched(
]

return equation_predictions


58 changes: 58 additions & 0 deletions marker/providers/mathpix.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
from typing import Optional, Dict, Any
import requests
from PIL import Image
import io
import base64

class MathpixProvider:
    """
    Minimal client that posts equation images to the Mathpix ``v3/text`` endpoint.
    """

    def __init__(self, app_id: str, app_key: str, timeout: Optional[float] = 30.0):
        """
        Args:
            app_id: Value sent in the ``app_id`` request header.
            app_key: Value sent in the ``app_key`` request header.
            timeout: Seconds passed to ``requests.post`` as ``timeout``.
                ``None`` disables the timeout (the previous behavior).
        """
        self.app_id = app_id
        self.app_key = app_key
        self.api_url = "https://api.mathpix.com/v3/text"
        self.timeout = timeout

    def _encode_image(self, image: Image.Image) -> str:
        """Serialize a PIL image as PNG and return it as a base64 string."""
        buffered = io.BytesIO()
        image.save(buffered, format="PNG")
        # base64 output is pure ASCII, so name the codec explicitly.
        return base64.b64encode(buffered.getvalue()).decode("ascii")

    def process_equation(self, image: Image.Image, options: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
        """
        Process an equation image using the Mathpix API.

        Args:
            image: PIL Image containing the equation.
            options: Extra top-level request fields; they are merged last,
                so they override the defaults built here.

        Returns:
            The decoded JSON body of the response.

        Raises:
            requests.HTTPError: If the response status is an error
                (via ``raise_for_status``).
            requests.Timeout: If no response arrives within ``self.timeout``.
        """
        headers = {
            "app_id": self.app_id,
            "app_key": self.app_key,
            "Content-Type": "application/json",
        }

        # The image is sent inline as a base64 data URI.
        image_data = self._encode_image(image)

        data = {
            "src": f"data:image/png;base64,{image_data}",
            "formats": ["text", "latex_styled"],
            "data_options": {
                "include_asciimath": True,
                "include_latex": True,
            },
            **(options or {}),
        }

        # Without a timeout a stalled connection would block the caller forever.
        response = requests.post(
            self.api_url,
            headers=headers,
            json=data,
            timeout=self.timeout,
        )
        response.raise_for_status()

        return response.json()
10 changes: 8 additions & 2 deletions marker/settings.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
from typing import Optional
from typing import Optional, Literal

from dotenv import find_dotenv
from dotenv import find_dotenv, load_dotenv
from pydantic import computed_field
from pydantic_settings import BaseSettings
import torch
import os

# Load environment variables from .env file
load_dotenv(find_dotenv(".env"))

class Settings(BaseSettings):
# Paths
Expand All @@ -30,6 +32,10 @@ class Settings(BaseSettings):
None # Note: MPS device does not work for text detection, and will default to CPU
)

# Equation processing settings
MATHPIX_APP_ID: str = os.getenv("MATHPIX_APP_ID", "")
MATHPIX_APP_KEY: str = os.getenv("MATHPIX_APP_KEY", "")

@computed_field
@property
def TORCH_DEVICE_MODEL(self) -> str:
Expand Down
Loading