diff --git a/marker/config/parser.py b/marker/config/parser.py index 8e6bd8de0..1af9c2dc2 100644 --- a/marker/config/parser.py +++ b/marker/config/parser.py @@ -60,6 +60,12 @@ def common_options(fn): default=False, help="Disable image extraction.", )(fn) + fn = click.option( + "--mathpix", + is_flag=True, + default=False, + help="Use Mathpix for equation processing.", + )(fn) # these are options that need a list transformation, i.e splitting/parsing a string fn = click.option( @@ -106,6 +112,8 @@ def generate_config_dict(self) -> Dict[str, any]: config["pdftext_workers"] = 1 case "disable_image_extraction": config["extract_images"] = False + case "mathpix": + config["use_mathpix"] = v case _: if k in crawler.attr_set: config[k] = v diff --git a/marker/converters/__init__.py b/marker/converters/__init__.py index 8357a4991..be56fbfe2 100644 --- a/marker/converters/__init__.py +++ b/marker/converters/__init__.py @@ -44,7 +44,7 @@ def initialize_processors(self, processor_cls_lst: List[Type[BaseProcessor]]) -> processors = [] for processor_cls in processor_cls_lst: processors.append(self.resolve_dependencies(processor_cls)) - + simple_llm_processors = [p for p in processors if issubclass(type(p), BaseLLMSimpleBlockProcessor)] other_processors = [p for p in processors if not issubclass(type(p), BaseLLMSimpleBlockProcessor)] diff --git a/marker/converters/ocr.py b/marker/converters/ocr.py index f7e562553..38ee71dd1 100644 --- a/marker/converters/ocr.py +++ b/marker/converters/ocr.py @@ -8,6 +8,8 @@ from marker.processors.equation import EquationProcessor from marker.providers.registry import provider_from_filepath from marker.renderers.ocr_json import OCRJSONRenderer +from marker.providers.mathpix import MathpixProvider +from marker.settings import settings class OCRConverter(PdfConverter): @@ -21,6 +23,12 @@ def __init__(self, *args, **kwargs): self.config["format_lines"] = True self.renderer = OCRJSONRenderer + + # Initialize Mathpix provider + self.mathpix_provider = 
MathpixProvider( + app_id=settings.MATHPIX_APP_ID, + app_key=settings.MATHPIX_APP_KEY + ) def build_document(self, filepath: str): provider_cls = provider_from_filepath(filepath) @@ -32,6 +40,7 @@ def build_document(self, filepath: str): provider = provider_cls(filepath, self.config) document = document_builder(provider, layout_builder, line_builder, ocr_builder) + # Initialize processors for processor in self.processor_list: processor(document) diff --git a/marker/processors/equation.py b/marker/processors/equation.py index 36124645c..83f71fb86 100644 --- a/marker/processors/equation.py +++ b/marker/processors/equation.py @@ -4,21 +4,19 @@ from bs4 import BeautifulSoup from ftfy import fix_text, TextFixerConfig -from surya.recognition import RecognitionPredictor, OCRResult - from marker.processors import BaseProcessor from marker.schema import BlockTypes from marker.schema.document import Document from marker.settings import settings +from marker.providers.mathpix import MathpixProvider +from surya.recognition import RecognitionPredictor, OCRResult MATH_TAG_PATTERN = re.compile(r"<math[^>]*>(.*?)</math>") - class EquationProcessor(BaseProcessor): """ A processor for recognizing equations in the document. 
""" - block_types: Annotated[ Tuple[BlockTypes], "The block types to process.", @@ -36,12 +34,25 @@ class EquationProcessor(BaseProcessor): bool, "Whether to disable the tqdm progress bar.", ] = False + use_mathpix: Annotated[ + bool, + "Whether to use Mathpix for equation processing.", + ] = False def __init__(self, recognition_model: RecognitionPredictor, config=None): super().__init__(config) - self.recognition_model = recognition_model + if self.use_mathpix == True: + if not settings.MATHPIX_APP_ID or not settings.MATHPIX_APP_KEY: + raise ValueError("Mathpix API credentials not configured") + self.mathpix_provider = MathpixProvider( + app_id=settings.MATHPIX_APP_ID, + app_key=settings.MATHPIX_APP_KEY + ) + # Add TextInlineMath to block types when Mathpix is enabled + self.block_types = (BlockTypes.Equation, BlockTypes.TextInlineMath) + def get_batch_size(self): # Set to 1/4th of OCR batch size due to sequence length with tiling if self.equation_batch_size is not None: @@ -80,6 +91,35 @@ def __call__(self, document: Document): if total_equation_blocks == 0: return + if self.use_mathpix: + self._process_with_mathpix(images, equation_boxes, equation_block_ids, document) + else: + self._process_with_recognition(images, equation_boxes, equation_block_ids, document) + + def _process_with_mathpix(self, images, equation_boxes, equation_block_ids, document): + for page_idx, (page_image, page_boxes, page_block_ids) in enumerate( + zip(images, equation_boxes, equation_block_ids) + ): + for box_idx, (box, block_id) in enumerate(zip(page_boxes, page_block_ids)): + # Crop the equation from the page + x1, y1, x2, y2 = [int(coord) for coord in box] + equation_image = page_image.crop((x1, y1, x2, y2)) + + # Process with Mathpix + try: + result = self.mathpix_provider.process_equation(equation_image) + + # Extract LaTeX from the result + latex = result.get('latex_styled', '') + if latex: + # Wrap in math tags + block = document.get_block(block_id) + block.html = 
self.fix_latex(f'<math>{latex}</math>') + except Exception as e: + print(f"Error processing equation {block_id}: {str(e)}") + continue + + def _process_with_recognition(self, images, equation_boxes, equation_block_ids, document): predictions = self.get_latex_batched(images, equation_boxes) for page_predictions, page_equation_block_ids in zip( predictions, equation_block_ids @@ -138,3 +178,5 @@ def get_latex_batched( ] return equation_predictions + + diff --git a/marker/providers/mathpix.py b/marker/providers/mathpix.py new file mode 100644 index 000000000..daa35bcb2 --- /dev/null +++ b/marker/providers/mathpix.py @@ -0,0 +1,58 @@ +from typing import Optional, Dict, Any +import requests +from PIL import Image +import io +import base64 + +class MathpixProvider: + def __init__(self, app_id: str, app_key: str): + self.app_id = app_id + self.app_key = app_key + self.api_url = "https://api.mathpix.com/v3/text" + + def _encode_image(self, image: Image.Image) -> str: + """Convert PIL Image to base64 string""" + buffered = io.BytesIO() + image.save(buffered, format="PNG") + return base64.b64encode(buffered.getvalue()).decode() + + def process_equation(self, image: Image.Image, options: Optional[Dict[str, Any]] = None) -> Dict[str, Any]: + """ + Process an equation image using Mathpix API + + Args: + image: PIL Image containing the equation + options: Additional options for Mathpix API + + Returns: + Dict containing the processed equation data + """ + if options is None: + options = {} + + # Prepare the request + headers = { + "app_id": self.app_id, + "app_key": self.app_key, + "Content-Type": "application/json" + } + + # Convert image to base64 + image_data = self._encode_image(image) + + # Prepare request body + data = { + "src": f"data:image/png;base64,{image_data}", + "formats": ["text", "latex_styled"], + "data_options": { + "include_asciimath": True, + "include_latex": True + }, + **options + } + + # Make API request + response = requests.post(self.api_url, headers=headers, 
json=data) + response.raise_for_status() + + return response.json() \ No newline at end of file diff --git a/marker/settings.py b/marker/settings.py index 5660ada90..d5d606150 100644 --- a/marker/settings.py +++ b/marker/settings.py @@ -1,11 +1,13 @@ -from typing import Optional +from typing import Optional, Literal -from dotenv import find_dotenv +from dotenv import find_dotenv, load_dotenv from pydantic import computed_field from pydantic_settings import BaseSettings import torch import os +# Load environment variables from .env file +load_dotenv(find_dotenv(".env")) class Settings(BaseSettings): # Paths @@ -30,6 +32,10 @@ class Settings(BaseSettings): None # Note: MPS device does not work for text detection, and will default to CPU ) + # Equation processing settings + MATHPIX_APP_ID: str = os.getenv("MATHPIX_APP_ID", "") + MATHPIX_APP_KEY: str = os.getenv("MATHPIX_APP_KEY", "") + @computed_field @property def TORCH_DEVICE_MODEL(self) -> str: diff --git a/poetry.lock b/poetry.lock index 90b66ece0..29216101b 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 2.1.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 2.1.3 and should not be changed by hand. 
[[package]] name = "aiohappyeyeballs" @@ -3366,10 +3366,10 @@ files = [ [package.dependencies] numpy = [ - {version = ">=1.21.4", markers = "python_version >= \"3.10\" and platform_system == \"Darwin\""}, - {version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version >= \"3.10\""}, - {version = ">=1.23.5", markers = "python_version >= \"3.11\""}, {version = ">=1.26.0", markers = "python_version >= \"3.12\""}, + {version = ">=1.23.5", markers = "python_version == \"3.11\""}, + {version = ">=1.21.4", markers = "python_version == \"3.10\" and platform_system == \"Darwin\""}, + {version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version == \"3.10\""}, ] [[package]] @@ -3466,9 +3466,9 @@ files = [ [package.dependencies] numpy = [ - {version = ">=1.22.4", markers = "python_version < \"3.11\""}, - {version = ">=1.23.2", markers = "python_version == \"3.11\""}, {version = ">=1.26.0", markers = "python_version >= \"3.12\""}, + {version = ">=1.23.2", markers = "python_version == \"3.11\""}, + {version = ">=1.22.4", markers = "python_version < \"3.11\""}, ] python-dateutil = ">=2.8.2" pytz = ">=2020.1" @@ -3532,18 +3532,20 @@ name = "pdftext" version = "0.6.2" description = "Extract structured text from pdfs quickly" optional = false -python-versions = "<4.0,>=3.10" +python-versions = "^3.10" groups = ["main"] -files = [ - {file = "pdftext-0.6.2-py3-none-any.whl", hash = "sha256:905d11e62d548e307933c25865a69c8e993947bb5b40b1535b0a2aa8f07a71d4"}, - {file = "pdftext-0.6.2.tar.gz", hash = "sha256:ff5b92462ac03ae63a23429384ae123d45c162dcda30e7bf2c5c92a6b208c9de"}, -] +files = [] +develop = true [package.dependencies] -click = ">=8.1.8,<9.0.0" -pydantic = ">=2.7.1,<3.0.0" -pydantic-settings = ">=2.2.1,<3.0.0" -pypdfium2 = "4.30.0" +click = "^8.1.8" +pydantic = "^2.7.1" +pydantic-settings = "^2.2.1" +pypdfium2 = "=4.30.1" + +[package.source] +type = "directory" +url = "../pdftext" [[package]] name = "pexpect" @@ -4264,25 +4266,25 @@ 
windows-terminal = ["colorama (>=0.4.6)"] [[package]] name = "pypdfium2" -version = "4.30.0" +version = "4.30.1" description = "Python bindings to PDFium" optional = false python-versions = ">=3.6" groups = ["main"] files = [ - {file = "pypdfium2-4.30.0-py3-none-macosx_10_13_x86_64.whl", hash = "sha256:b33ceded0b6ff5b2b93bc1fe0ad4b71aa6b7e7bd5875f1ca0cdfb6ba6ac01aab"}, - {file = "pypdfium2-4.30.0-py3-none-macosx_11_0_arm64.whl", hash = "sha256:4e55689f4b06e2d2406203e771f78789bd4f190731b5d57383d05cf611d829de"}, - {file = "pypdfium2-4.30.0-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4e6e50f5ce7f65a40a33d7c9edc39f23140c57e37144c2d6d9e9262a2a854854"}, - {file = "pypdfium2-4.30.0-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:3d0dd3ecaffd0b6dbda3da663220e705cb563918249bda26058c6036752ba3a2"}, - {file = "pypdfium2-4.30.0-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:cc3bf29b0db8c76cdfaac1ec1cde8edf211a7de7390fbf8934ad2aa9b4d6dfad"}, - {file = "pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f1f78d2189e0ddf9ac2b7a9b9bd4f0c66f54d1389ff6c17e9fd9dc034d06eb3f"}, - {file = "pypdfium2-4.30.0-py3-none-musllinux_1_1_aarch64.whl", hash = "sha256:5eda3641a2da7a7a0b2f4dbd71d706401a656fea521b6b6faa0675b15d31a163"}, - {file = "pypdfium2-4.30.0-py3-none-musllinux_1_1_i686.whl", hash = "sha256:0dfa61421b5eb68e1188b0b2231e7ba35735aef2d867d86e48ee6cab6975195e"}, - {file = "pypdfium2-4.30.0-py3-none-musllinux_1_1_x86_64.whl", hash = "sha256:f33bd79e7a09d5f7acca3b0b69ff6c8a488869a7fab48fdf400fec6e20b9c8be"}, - {file = "pypdfium2-4.30.0-py3-none-win32.whl", hash = "sha256:ee2410f15d576d976c2ab2558c93d392a25fb9f6635e8dd0a8a3a5241b275e0e"}, - {file = "pypdfium2-4.30.0-py3-none-win_amd64.whl", hash = "sha256:90dbb2ac07be53219f56be09961eb95cf2473f834d01a42d901d13ccfad64b4c"}, - {file = "pypdfium2-4.30.0-py3-none-win_arm64.whl", hash = 
"sha256:119b2969a6d6b1e8d55e99caaf05290294f2d0fe49c12a3f17102d01c441bd29"}, - {file = "pypdfium2-4.30.0.tar.gz", hash = "sha256:48b5b7e5566665bc1015b9d69c1ebabe21f6aee468b509531c3c8318eeee2e16"}, + {file = "pypdfium2-4.30.1-py3-none-macosx_10_13_x86_64.whl", hash = "sha256:e07c47633732cc18d890bb7e965ad28a9c5a932e548acb928596f86be2e5ae37"}, + {file = "pypdfium2-4.30.1-py3-none-macosx_11_0_arm64.whl", hash = "sha256:5ea2d44e96d361123b67b00f527017aa9c847c871b5714e013c01c3eb36a79fe"}, + {file = "pypdfium2-4.30.1-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1de7a3a36803171b3f66911131046d65a732f9e7834438191cb58235e6163c4e"}, + {file = "pypdfium2-4.30.1-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:b8a4231efb13170354f568c722d6540b8d5b476b08825586d48ef70c40d16e03"}, + {file = "pypdfium2-4.30.1-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6f434a4934e8244aa95343ffcf24e9ad9f120dbb4785f631bb40a88c39292493"}, + {file = "pypdfium2-4.30.1-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f454032a0bc7681900170f67d8711b3942824531e765f91c2f5ce7937f999794"}, + {file = "pypdfium2-4.30.1-py3-none-musllinux_1_1_aarch64.whl", hash = "sha256:bbf9130a72370ee9d602e39949b902db669a2a1c24746a91e5586eb829055d9f"}, + {file = "pypdfium2-4.30.1-py3-none-musllinux_1_1_i686.whl", hash = "sha256:5cb52884b1583b96e94fd78542c63bb42e06df5e8f9e52f8f31f5ad5a1e53367"}, + {file = "pypdfium2-4.30.1-py3-none-musllinux_1_1_x86_64.whl", hash = "sha256:1a9e372bd4867ff223cc8c338e33fe11055dad12f22885950fc27646cc8d9122"}, + {file = "pypdfium2-4.30.1-py3-none-win32.whl", hash = "sha256:421f1cf205e213e07c1f2934905779547f4f4a2ff2f59dde29da3d511d3fc806"}, + {file = "pypdfium2-4.30.1-py3-none-win_amd64.whl", hash = "sha256:598a7f20264ab5113853cba6d86c4566e4356cad037d7d1f849c8c9021007e05"}, + {file = "pypdfium2-4.30.1-py3-none-win_arm64.whl", hash = 
"sha256:c2b6d63f6d425d9416c08d2511822b54b8e3ac38e639fc41164b1d75584b3a8c"}, + {file = "pypdfium2-4.30.1.tar.gz", hash = "sha256:5f5c7c6d03598e107d974f66b220a49436aceb191da34cda5f692be098a814ce"}, ] [[package]] @@ -5390,30 +5392,32 @@ snowflake = ["snowflake-connector-python (>=3.3.0) ; python_version < \"3.12\"", [[package]] name = "surya-ocr" -version = "0.14.2" +version = "0.14.5" description = "OCR, layout, reading order, and table recognition in 90+ languages" optional = false -python-versions = "<4.0,>=3.10" +python-versions = "^3.10" groups = ["main"] -files = [ - {file = "surya_ocr-0.14.2-py3-none-any.whl", hash = "sha256:0c402705c860f8bf98fc2bf2a3b49d7f0e16fba587aed6d3f01bb53bb776d283"}, - {file = "surya_ocr-0.14.2.tar.gz", hash = "sha256:852af681073167beba9a638658c70b81318f1a8f3d558db68dead1b2c391e862"}, -] +files = [] +develop = true [package.dependencies] -click = ">=8.1.8,<9.0.0" -einops = ">=0.8.1,<0.9.0" -filetype = ">=1.2.0,<2.0.0" -opencv-python-headless = ">=4.11.0.86,<5.0.0.0" -pillow = ">=10.2.0,<11.0.0" -platformdirs = ">=4.3.6,<5.0.0" -pre-commit = ">=4.2.0,<5.0.0" -pydantic = ">=2.5.3,<3.0.0" -pydantic-settings = ">=2.1.0,<3.0.0" -pypdfium2 = "4.30.0" -python-dotenv = ">=1.0.0,<2.0.0" -torch = ">=2.7.0,<3.0.0" -transformers = ">=4.51.2,<5.0.0" +click = "^8.1.8" +einops = "^0.8.1" +filetype = "^1.2.0" +opencv-python-headless = "^4.11.0.86" +pillow = "^10.2.0" +platformdirs = "^4.3.6" +pre-commit = "^4.2.0" +pydantic = "^2.5.3" +pydantic-settings = "^2.1.0" +pypdfium2 = "=4.30.1" +python-dotenv = "^1.0.0" +torch = "^2.7.0" +transformers = "^4.51.2" + +[package.source] +type = "directory" +url = "../surya" [[package]] name = "sympy" @@ -6505,4 +6509,4 @@ full = ["ebooklib", "mammoth", "openpyxl", "python-pptx", "weasyprint"] [metadata] lock-version = "2.1" python-versions = "^3.10" -content-hash = "c18debb8d18aec4081c31ff32f9dc2bde6f4c0189f1d7647bb6061f685e0e319" +content-hash = "484459202f1148269601972c07e461c46f51f424919485090b707b350ce7fd74" 
diff --git a/pyproject.toml b/pyproject.toml index 2056da009..a5f82aea4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,9 +26,9 @@ torch = "^2.7.0" tqdm = "^4.66.1" ftfy = "^6.1.1" rapidfuzz = "^3.8.1" -surya-ocr = "^0.14.2" +surya-ocr = {path = "../surya", develop = true} regex = "^2024.4.28" -pdftext = "~0.6.2" +pdftext = {path = "../pdftext", develop = true} markdownify = "^0.13.1" click = "^8.2.0" markdown2 = "^2.5.2" @@ -75,4 +75,4 @@ marker_server = "marker.scripts.server:server_cli" [build-system] requires = ["poetry-core"] -build-backend = "poetry.core.masonry.api" \ No newline at end of file +build-backend = "poetry.core.masonry.api" diff --git a/test_clipping.py b/test_clipping.py new file mode 100644 index 000000000..7fc9d65a4 --- /dev/null +++ b/test_clipping.py @@ -0,0 +1,110 @@ +import os +import argparse +import pypdfium2 as pdfium +from PIL import Image, ImageDraw +import pypdfium2.raw as pdfium_c +import ctypes + +def get_text_from_raw_text_obj(raw_text_obj, page): + textpage = pdfium_c.FPDFText_LoadPage(page.raw) + if not textpage: + return "" + try: + buflen = pdfium_c.FPDFTextObj_GetText(raw_text_obj, textpage, None, 0) + if buflen <= 0: + return "" + buf = (ctypes.c_ushort * buflen)() + pdfium_c.FPDFTextObj_GetText(raw_text_obj, textpage, buf, buflen) + # Convert UTF-16LE buffer to Python string, strip trailing nulls + return bytearray(buf).decode('utf-16-le').rstrip('\x00') + finally: + pdfium_c.FPDFText_ClosePage(textpage) + +def draw_text_objects(page, clip_region, output_path): + objects = list(page.get_objects()) + scale = 2.0 + bitmap = page.render(scale=scale) + pil_image = bitmap.to_pil() + page_width, page_height = pil_image.size + boxes_image = Image.new('RGB', (page_width, page_height), 'white') + boxes_draw = ImageDraw.Draw(boxes_image) + + found = False + for i, obj in enumerate(objects): + if obj.type == 1: # 1 = text object + text = get_text_from_raw_text_obj(obj.raw, page) + if text and text.strip() == "30075": + found = 
True + print(f"Found text '30075' in object {i+1}") + # Get and draw object bounding box (red) + left = ctypes.c_float() + bottom = ctypes.c_float() + right = ctypes.c_float() + top = ctypes.c_float() + success = pdfium_c.FPDFPageObj_GetBounds(obj.raw, ctypes.byref(left), ctypes.byref(bottom), ctypes.byref(right), ctypes.byref(top)) + if success: + print(f"Object bounding box: left={left.value}, bottom={bottom.value}, right={right.value}, top={top.value}") + pil_top = page_height - (top.value * scale) + pil_bottom = page_height - (bottom.value * scale) + pil_left = left.value * scale + pil_right = right.value * scale + boxes_draw.rectangle([pil_left, pil_top, pil_right, pil_bottom], outline='red', width=3) + else: + print("Could not get object bounding box.") + # Try to get and draw actual clipping path (blue) + try: + clip_path = pdfium_c.FPDFPageObj_GetClipPath(obj.raw) + if clip_path: + if all(hasattr(pdfium_c, fn) for fn in [ + "FPDFClipPath_CountPaths", "FPDFClipPath_CountPathSegments", "FPDFClipPath_GetPathSegment", "FPDFPathSegment_GetPoint", "FPDFPathSegment_GetType", "FPDFPathSegment_GetClose"]): + num_paths = pdfium_c.FPDFClipPath_CountPaths(clip_path) + for path_idx in range(num_paths): + num_segs = pdfium_c.FPDFClipPath_CountPathSegments(clip_path, path_idx) + points = [] + for seg_idx in range(num_segs): + seg = pdfium_c.FPDFClipPath_GetPathSegment(clip_path, path_idx, seg_idx) + x = ctypes.c_float() + y = ctypes.c_float() + pdfium_c.FPDFPathSegment_GetPoint(seg, ctypes.byref(x), ctypes.byref(y)) + pil_x = x.value * scale + pil_y = page_height - (y.value * scale) + points.append((pil_x, pil_y)) + # Log the raw PDF coordinates and the PIL coordinates + print(f"Object {i+1}, path {path_idx}, seg {seg_idx}: PDF ({x.value}, {y.value}) -> PIL ({pil_x}, {pil_y})") + # Check if path is closed + closed = False + if num_segs > 0: + last_seg = pdfium_c.FPDFClipPath_GetPathSegment(clip_path, path_idx, num_segs-1) + closed = 
bool(pdfium_c.FPDFPathSegment_GetClose(last_seg)) + if len(points) > 1: + if closed: + boxes_draw.polygon(points, outline='blue') + else: + boxes_draw.line(points, fill='blue', width=3) + else: + print("Clipping path exists, but path segment functions are not available in this pypdfium2 version.") + else: + print("No clipping path for this object.") + except Exception as e: + print(f"No clipping path or error: {e}") + if not found: + print("No text object with text '30075' found.") + + boxes_output_path = output_path.replace('.png', '_boxes.png') + boxes_image.save(boxes_output_path) + print(f"Boxes-only visualization saved to {boxes_output_path}") + +def main(): + parser = argparse.ArgumentParser(description='Visualize PDF text objects with clipping') + parser.add_argument('pdf_path', help='Path to the PDF file') + parser.add_argument('--page', type=int, default=0, help='Page number (0-based)') + args = parser.parse_args() + output_dir = "test_output" + os.makedirs(output_dir, exist_ok=True) + doc = pdfium.PdfDocument(args.pdf_path) + page = doc[args.page] + output_path = os.path.join(output_dir, f"page_{args.page}.png") + draw_text_objects(page, None, output_path) + +if __name__ == '__main__': + main() diff --git a/visualize_matching_boxes.py b/visualize_matching_boxes.py new file mode 100644 index 000000000..2062fb96d --- /dev/null +++ b/visualize_matching_boxes.py @@ -0,0 +1,207 @@ +import argparse +import pypdfium2 as pdfium +import pypdfium2.raw as pdfium_c +import ctypes +from PIL import Image, ImageDraw + +def get_text_from_raw_text_obj(raw_text_obj, page): + textpage = pdfium_c.FPDFText_LoadPage(page.raw) + if not textpage: + return "" + try: + buflen = pdfium_c.FPDFTextObj_GetText(raw_text_obj, textpage, None, 0) + if buflen <= 0: + return "" + buf = (ctypes.c_ushort * buflen)() + pdfium_c.FPDFTextObj_GetText(raw_text_obj, textpage, buf, buflen) + return bytearray(buf).decode('utf-16-le').rstrip('\x00') + finally: + pdfium_c.FPDFText_ClosePage(textpage) + 
+def boxes_intersect(box1, box2): + """Check if two boxes intersect using the algorithm from fz_glyph_entirely_outside_box. + + Args: + box1, box2: Tuples of (x0, y0, x1, y1) where (x0,y0) is bottom-left, (x1,y1) is top-right + + Returns: + True if boxes intersect, False if they are entirely separate + """ + # If box1 is entirely outside box2, they don't intersect + if (box1[2] <= box2[0] or # box1.x1 <= box2.x0 (box1 right edge <= box2 left edge) + box1[3] <= box2[1] or # box1.y1 <= box2.y0 (box1 top edge <= box2 bottom edge) + box1[0] >= box2[2] or # box1.x0 >= box2.x1 (box1 left edge >= box2 right edge) + box1[1] >= box2[3]): # box1.y0 >= box2.y1 (box1 bottom edge >= box2 top edge) + return False + return True + +def draw_box(box, boxes_draw, scale, page_height, color='gray', width=1): + """Draw a bounding box on the image. + + Args: + box: Tuple of (x0, y0, x1, y1) coordinates in PDF space + boxes_draw: ImageDraw object to draw on + scale: Scale factor for coordinate conversion + page_height: Height of the page in pixels + color: Color of the outline + width: Width of the outline + """ + pil_top = page_height - (box[3] * scale) + pil_bottom = page_height - (box[1] * scale) + pil_left = box[0] * scale + pil_right = box[2] * scale + boxes_draw.rectangle([pil_left, pil_top, pil_right, pil_bottom], outline=color, width=width) + +def draw_matching_boxes(page, output_path): + objects = list(page.get_objects()) + scale = 2.0 + bitmap = page.render(scale=scale) + pil_image = bitmap.to_pil() + page_width, page_height = pil_image.size + boxes_draw = ImageDraw.Draw(pil_image) + + # Check for required clip path functions once + required_clip_fns = [ + "FPDFClipPath_CountPaths", "FPDFClipPath_CountPathSegments", + "FPDFClipPath_GetPathSegment", "FPDFPathSegment_GetPoint", + "FPDFPathSegment_GetType", "FPDFPathSegment_GetClose" + ] + has_clip_path_api = all(hasattr(pdfium_c, fn) for fn in required_clip_fns) + if not has_clip_path_api: + raise RuntimeError("Required PDFium 
clip path API functions are missing in pdfium_c. Please check your PDFium installation.") + + # Draw the page crop box in green + try: + left = ctypes.c_float() + bottom = ctypes.c_float() + right = ctypes.c_float() + top = ctypes.c_float() + success = pdfium_c.FPDFPage_GetCropBox(page.raw, ctypes.byref(left), ctypes.byref(bottom), ctypes.byref(right), ctypes.byref(top)) + if success: + crop_box = (left.value, bottom.value, right.value, top.value) + print(f"Page crop box: ({crop_box[0]:.2f}, {crop_box[1]:.2f}, {crop_box[2]:.2f}, {crop_box[3]:.2f})") + draw_box(crop_box, boxes_draw, scale, page_height, color='green', width=2) + else: + print("Could not get page crop box") + except Exception as e: + print(f"Error getting page crop box: {e}") + + # Initialize counters for statistics + total_objects = len(objects) + text_objects = 0 + visible_objects = 0 + clipped_objects = 0 + + # Load textpage once for efficiency + textpage = pdfium_c.FPDFText_LoadPage(page.raw) + if not textpage: + print("Warning: Could not load textpage for text checking") + + for i, obj in enumerate(objects): + # Check if object is a text object and has text content + obj_type = pdfium_c.FPDFPageObj_GetType(obj.raw) + if obj_type != 1: # FPDF_PAGEOBJ_TEXT = 1 + continue + + # Check if text object has any text content (without extracting it) + if textpage: + buflen = pdfium_c.FPDFTextObj_GetText(obj.raw, textpage, None, 0) + if buflen <= 0: # No text content + continue + + text_objects += 1 + + # Get object bounding box + left = ctypes.c_float() + bottom = ctypes.c_float() + right = ctypes.c_float() + top = ctypes.c_float() + success = pdfium_c.FPDFPageObj_GetBounds(obj.raw, ctypes.byref(left), ctypes.byref(bottom), ctypes.byref(right), ctypes.byref(top)) + if not success: + print(f"Object {i+1}: Could not get object bounding box.") + continue + obj_box = (left.value, bottom.value, right.value, top.value) + #print(f"Object {i+1}: Bounding box: ({obj_box[0]:.2f}, {obj_box[1]:.2f}, {obj_box[2]:.2f}, 
{obj_box[3]:.2f})") + # Try to get clipping path + show_box = True + try: + clip_path = pdfium_c.FPDFPageObj_GetClipPath(obj.raw) + if clip_path: + # Collect all points from all paths to calculate a bounding rectangle + all_points = [] + num_paths = pdfium_c.FPDFClipPath_CountPaths(clip_path) + for path_idx in range(num_paths): + num_segs = pdfium_c.FPDFClipPath_CountPathSegments(clip_path, path_idx) + for seg_idx in range(num_segs): + seg = pdfium_c.FPDFClipPath_GetPathSegment(clip_path, path_idx, seg_idx) + x = ctypes.c_float() + y = ctypes.c_float() + pdfium_c.FPDFPathSegment_GetPoint(seg, ctypes.byref(x), ctypes.byref(y)) + all_points.append((x.value, y.value)) + + if all_points: + # Calculate the minimal bounding rectangle that fits the clip path + xs = [pt[0] for pt in all_points] + ys = [pt[1] for pt in all_points] + clip_box = (min(xs), min(ys), max(xs), max(ys)) + #print(f"Object {i+1}: Clip path bounding box: ({clip_box[0]:.2f}, {clip_box[1]:.2f}, {clip_box[2]:.2f}, {clip_box[3]:.2f}), # of paths: {num_paths}, # of segments: {num_segs}") + + + # Only show the object's bounding box if it doesn't match the clip box + if not boxes_intersect(obj_box, clip_box): + show_box = False + clipped_objects += 1 + else: + # Draw the clip path bounding box in blue + draw_box(clip_box, boxes_draw, scale, page_height, color='blue') + + # If boxes intersect, draw in red and extract text + draw_box(obj_box, boxes_draw, scale, page_height, color='red', width=3) + show_box = True + visible_objects += 1 + + # Extract and print text for red boxes + if textpage: + buflen = pdfium_c.FPDFTextObj_GetText(obj.raw, textpage, None, 0) + if buflen > 0: + buf = (ctypes.c_ushort * buflen)() + pdfium_c.FPDFTextObj_GetText(obj.raw, textpage, buf, buflen) + byte_buf = bytearray(buf) + text_content = byte_buf.decode('utf-16-le').rstrip('\x00') + utf16_bytes = text_content.encode('utf-16-le') + hex_bytes = ' '.join(f'{b:02x}' for b in utf16_bytes) + print(f"Object {i+1} text (red box): 
'{text_content}', utf16 bytes: {hex_bytes}") + else: + print(f"Object {i+1}: No clipping path.") + except Exception as e: + print(f"Object {i+1}: Error getting clip path: {e}") + if show_box: + draw_box(obj_box, boxes_draw, scale, page_height) + visible_objects += 1 + + pil_image.save(output_path) + print(f"Matching boxes visualization saved to {output_path}") + + # Close textpage to avoid memory leaks + if textpage: + pdfium_c.FPDFText_ClosePage(textpage) + + # Print statistics + print(f"\nStatistics:") + print(f"Total objects: {total_objects}") + print(f"Text objects: {text_objects}") + print(f"Visible objects: {visible_objects}") + print(f"Clipped objects: {clipped_objects}") + +def main(): + parser = argparse.ArgumentParser(description='Visualize matching bounding boxes and clip paths for all objects') + parser.add_argument('pdf_path', help='Path to the PDF file') + parser.add_argument('--page', type=int, default=0, help='Page number (0-based)') + parser.add_argument('--output', type=str, default='matching_boxes.png', help='Output image path') + args = parser.parse_args() + doc = pdfium.PdfDocument(args.pdf_path) + page = doc[args.page] + draw_matching_boxes(page, args.output) + +if __name__ == '__main__': + main() \ No newline at end of file