
Commit 6f03371

fix: correct TORCH_DEVICE_MODEL usage and tune MPS batch sizes for Apple Silicon
- Ensure device detection is applied correctly across batch-size logic.
- Add USING_CUDA/USING_MPS helpers for clearer branching.
- MODEL_DTYPE: bfloat16 (CUDA), float16 (MPS), float32 (CPU).
- Increase MPS batch sizes for layout, OCR error, recognition, equations, and table recognition; modest bump for detection (CPU fallback under MPS).
- Normalize/remove duplicate getter definitions.
- Fix gpu.using_cuda() equality check; add gpu.using_mps().

Benchmarks on M1 Pro (5 PDFs):
- CPU, P=1: 30.77s total (~0.162 files/s)
- MPS, P=1: 31.57s total (~0.158 files/s)
- CPU, P=8: 30.25s total (~0.165 files/s)
- MPS, P=6: 60.04s total (~0.083 files/s)

Note: text detection remains CPU-only on MPS, so CPU is faster end-to-end today; this patch still improves correctness and MPS throughput where supported.
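For reference, a minimal standalone sketch of the selection logic described above (the helper functions are illustrative, not the patch's API; the defaults mirror the layout getter in this diff):

    import torch

    def torch_device_model() -> str:
        # CUDA first, then MPS (guarded for older torch builds), else CPU
        if torch.cuda.is_available():
            return "cuda"
        if hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
            return "mps"
        return "cpu"

    def model_dtype(device: str) -> torch.dtype:
        # bfloat16 on CUDA, float16 on MPS, float32 on CPU
        if device == "cuda":
            return torch.bfloat16
        if device == "mps":
            return torch.float16
        return torch.float32

    def layout_batch_size(device: str, override: int | None = None) -> int:
        # An explicit config value always wins; otherwise use per-device defaults (12 CUDA, 8 MPS, 6 CPU)
        if override is not None:
            return override
        return {"cuda": 12, "mps": 8}.get(device, 6)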
1 parent 22783b1 commit 6f03371

File tree

8 files changed, +102 -43 lines changed


.gitignore
Lines changed: 2 additions & 1 deletion

@@ -176,4 +176,5 @@ cython_debug/
 # option (not recommended) you can uncomment the following to ignore the entire idea folder.
 .idea/

-.vscode/
+.vscode/
+COMPASS_3_RefMan_Jul86 copy.pdf

marker/builders/layout.py
Lines changed: 4 additions & 1 deletion

@@ -62,6 +62,8 @@ def get_batch_size(self):
             return self.layout_batch_size
         elif settings.TORCH_DEVICE_MODEL == "cuda":
             return 12
+        elif settings.TORCH_DEVICE_MODEL == "mps":
+            return 8
         return 6

     def forced_layout(self, pages: List[PageGroup]) -> List[LayoutResult]:
@@ -132,7 +134,8 @@ def add_blocks_to_pages(
         self, pages: List[PageGroup], layout_results: List[LayoutResult]
     ):
         for page, layout_result in zip(pages, layout_results):
-            layout_page_size = PolygonBox.from_bbox(layout_result.image_bbox).size
+            layout_page_size = PolygonBox.from_bbox(
+                layout_result.image_bbox).size
             provider_page_size = page.polygon.size
             page.layout_sliced = (
                 layout_result.sliced
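As a usage note for the getter above: an explicit layout_batch_size passed through the builder config still takes precedence over the CUDA/MPS/CPU defaults. A hedged sketch (assuming the usual create_model_dict() loader and its "layout_model" key; treat the exact constructor shape as an assumption):

    from marker.models import create_model_dict
    from marker.builders.layout import LayoutBuilder

    models = create_model_dict()  # loads the surya predictors
    # An explicit value short-circuits the device-based defaults added in this commit
    builder = LayoutBuilder(models["layout_model"], config={"layout_batch_size": 4})
    assert builder.get_batch_size() == 4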

marker/builders/line.py
Lines changed: 22 additions & 8 deletions

@@ -103,13 +103,19 @@ def get_detection_batch_size(self):
             return self.detection_batch_size
         elif settings.TORCH_DEVICE_MODEL == "cuda":
             return 10
+        elif settings.TORCH_DEVICE_MODEL == "mps":
+
+            # Detection runs on CPU when device is MPS; bump slightly to amortize overhead
+            return 6
         return 4

     def get_ocr_error_batch_size(self):
         if self.ocr_error_batch_size is not None:
             return self.ocr_error_batch_size
         elif settings.TORCH_DEVICE_MODEL == "cuda":
             return 14
+        elif settings.TORCH_DEVICE_MODEL == "mps":
+            return 8
         return 4

     def get_detection_results(
@@ -176,9 +182,11 @@ def get_all_lines(self, document: Document, provider: PdfProvider):

         # Note: run_detection is longer than page_images, since it has a value for each page, not just good ones
         # Detection results and inline detection results are for every page (we use run_detection to make the list full length)
-        detection_results = self.get_detection_results(page_images, run_detection)
+        detection_results = self.get_detection_results(
+            page_images, run_detection)

-        assert len(detection_results) == len(layout_good) == len(document.pages)
+        assert len(detection_results) == len(
+            layout_good) == len(document.pages)
         for document_page, detection_result, provider_lines_good in zip(
             document.pages, detection_results, layout_good
         ):
@@ -208,10 +216,12 @@ def get_all_lines(self, document: Document, provider: PdfProvider):
                 boxes_to_ocr[document_page.page_id].extend(detection_boxes)

         # Dummy lines to merge into the document - Contains no spans, will be filled in later by OCRBuilder
-        ocr_lines = {document_page.page_id: [] for document_page in document.pages}
+        ocr_lines = {document_page.page_id: []
+                     for document_page in document.pages}
         for page_id, page_ocr_boxes in boxes_to_ocr.items():
             page_size = provider.get_page_bbox(page_id).size
-            image_size = document.get_page(page_id).get_image(highres=False).size
+            image_size = document.get_page(
+                page_id).get_image(highres=False).size
             for box_to_ocr in page_ocr_boxes:
                 line_polygon = PolygonBox(polygon=box_to_ocr.polygon).rescale(
                     image_size, page_size
@@ -264,7 +274,8 @@ def check_line_overlaps(
             if bbox[3] > page_bbox[3]:
                 return False

-        intersection_matrix = matrix_intersection_area(provider_bboxes, provider_bboxes)
+        intersection_matrix = matrix_intersection_area(
+            provider_bboxes, provider_bboxes)
         for i, line in enumerate(provider_lines):
             intersect_counts = np.sum(
                 intersection_matrix[i]
@@ -302,7 +313,8 @@ def check_layout_coverage(
         if len(provider_bboxes) == 0:
             return False

-        intersection_matrix = matrix_intersection_area(layout_bboxes, provider_bboxes)
+        intersection_matrix = matrix_intersection_area(
+            layout_bboxes, provider_bboxes)

         for idx, layout_block in enumerate(layout_blocks):
             total_blocks += 1
@@ -312,7 +324,8 @@ def check_layout_coverage(
                 covered_blocks += 1

             if (
-                layout_block.polygon.intersection_pct(document_page.polygon) > 0.8
+                layout_block.polygon.intersection_pct(
+                    document_page.polygon) > 0.8
                 and layout_block.block_type == BlockTypes.Text
             ):
                 large_text_blocks += 1
@@ -366,7 +379,8 @@ def filter_blank_lines(self, page: PageGroup, lines: List[ProviderOutput]):
             line_polygon_rescaled = deepcopy(line.line.polygon).rescale(
                 page_size, image_size
             )
-            line_bbox = line_polygon_rescaled.fit_to_bounds((0, 0, *image_size)).bbox
+            line_bbox = line_polygon_rescaled.fit_to_bounds(
+                (0, 0, *image_size)).bbox

             if not self.is_blank_slice(page_image.crop(line_bbox)):
                 good_lines.append(line)

marker/builders/ocr.py
Lines changed: 16 additions & 8 deletions

@@ -53,8 +53,10 @@ class OcrBuilder(BaseBuilder):
         "The OCR mode to use, see surya for details. Set to 'ocr_without_boxes' for potentially better performance, at the expense of formatting.",
     ] = TaskNames.ocr_with_boxes
     keep_chars: Annotated[bool, "Keep individual characters."] = False
-    disable_ocr_math: Annotated[bool, "Disable inline math recognition in OCR"] = False
-    drop_repeated_text: Annotated[bool, "Drop repeated text in OCR results."] = False
+    disable_ocr_math: Annotated[bool,
+                                "Disable inline math recognition in OCR"] = False
+    drop_repeated_text: Annotated[bool,
+                                  "Drop repeated text in OCR results."] = False

     def __init__(self, recognition_model: RecognitionPredictor, config=None):
         super().__init__(config)
@@ -83,7 +85,8 @@ def get_recognition_batch_size(self):
         elif settings.TORCH_DEVICE_MODEL == "cuda":
             return 64
         elif settings.TORCH_DEVICE_MODEL == "mps":
-            return 16
+            # MPS can usually handle more here (fp16); tune 24-32 if VRAM allows
+            return 24
         return 32

     def get_ocr_images_polygons_ids(
@@ -130,7 +133,8 @@ def get_ocr_images_polygons_ids(
                 page_highres_polys.append(line_bbox_rescaled)
                 page_line_ids.append(line.id)
                 # For OCRed pages, this text will be blank
-                page_line_original_texts.append(line.ocr_input_text(document))
+                page_line_original_texts.append(
+                    line.ocr_input_text(document))

             highres_images.append(page_highres_image)
             highres_polys.append(page_highres_polys)
@@ -182,7 +186,8 @@ def ocr_extraction(
             )

             line = document_page.get_block(line_id)
-            self.replace_line_spans(document, document_page, line, new_spans)
+            self.replace_line_spans(
+                document, document_page, line, new_spans)

     # TODO Fix polygons when we cut the span into multiple spans
     def link_and_break_span(self, span: Span, text: str, match_text, url: str):
@@ -208,7 +213,8 @@ def replace_line_spans(
         self, document: Document, page: PageGroup, line: Line, new_spans: List[Span]
     ):
         old_spans = line.contained_blocks(document, [BlockTypes.Span])
-        text_ref_matching = {span.text: span.url for span in old_spans if span.url}
+        text_ref_matching = {
+            span.text: span.url for span in old_spans if span.url}

         # Insert refs into new spans, since the OCR model does not (cannot) generate these
         final_new_spans = []
@@ -285,7 +291,8 @@ def spans_from_html_chars(
             if is_opening_tag and format not in formats:
                 formats.add(format)
                 if current_span:
-                    current_chars = self.assign_chars(current_span, current_chars)
+                    current_chars = self.assign_chars(
+                        current_span, current_chars)
                     spans.append(current_span)
                     current_span = None

@@ -317,7 +324,8 @@ def spans_from_html_chars(
                         f'<math display="inline">{current_span.text}</math>'
                     )

-                current_chars = self.assign_chars(current_span, current_chars)
+                current_chars = self.assign_chars(
+                    current_span, current_chars)
                 spans.append(current_span)
                 current_span = None
                 continue

marker/processors/equation.py
Lines changed: 3 additions & 2 deletions

@@ -36,7 +36,8 @@ class EquationProcessor(BaseProcessor):
         bool,
         "Whether to disable the tqdm progress bar.",
     ] = False
-    drop_repeated_text: Annotated[bool, "Drop repeated text in OCR results."] = False
+    drop_repeated_text: Annotated[bool,
+                                  "Drop repeated text in OCR results."] = False

     def __init__(self, recognition_model: RecognitionPredictor, config=None):
         super().__init__(config)
@@ -50,7 +51,7 @@ def get_batch_size(self):
         elif settings.TORCH_DEVICE_MODEL == "cuda":
             return 16
         elif settings.TORCH_DEVICE_MODEL == "mps":
-            return 6
+            return 8
         return 6

     def __call__(self, document: Document):

marker/processors/table.py
Lines changed: 27 additions & 13 deletions

@@ -28,7 +28,8 @@ class TableProcessor(BaseProcessor):
     A processor for recognizing tables in the document.
     """

-    block_types = (BlockTypes.Table, BlockTypes.TableOfContents, BlockTypes.Form)
+    block_types = (BlockTypes.Table,
+                   BlockTypes.TableOfContents, BlockTypes.Form)
     detect_boxes: Annotated[
         bool,
         "Whether to detect boxes for the table recognition model.",
@@ -64,7 +65,8 @@ class TableProcessor(BaseProcessor):
         bool,
         "Whether to disable the tqdm progress bar.",
     ] = False
-    drop_repeated_text: Annotated[bool, "Drop repeated text in OCR results."] = False
+    drop_repeated_text: Annotated[bool,
+                                  "Drop repeated text in OCR results."] = False

     def __init__(
         self,
@@ -128,7 +130,8 @@ def __call__(self, document: Document):
         )
         self.assign_text_to_cells(tables, table_data)
         self.split_combined_rows(tables)  # Split up rows that were combined
-        self.combine_dollar_column(tables)  # Combine columns that are just dollar signs
+        # Combine columns that are just dollar signs
+        self.combine_dollar_column(tables)

         # Assign table cells to the table
         table_idx = 0
@@ -174,7 +177,8 @@ def __call__(self, document: Document):
                 )
                 for child, intersection in zip(child_contained_blocks, intersections):
                     # Adjust this to percentage of the child block that is enclosed by the table
-                    intersection_pct = intersection / max(child.polygon.area, 1)
+                    intersection_pct = intersection / \
+                        max(child.polygon.area, 1)
                     if intersection_pct > 0.95 and child.id in page.structure:
                         page.structure.remove(child.id)

@@ -186,7 +190,8 @@ def finalize_cell_text(self, cell: SuryaTableCell):
             if not text or text == ".":
                 continue
             text = re.sub(r"(\s\.){2,}", "", text)  # Replace . . .
-            text = re.sub(r"\.{2,}", "", text)  # Replace ..., like in table of contents
+            # Replace ..., like in table of contents
+            text = re.sub(r"\.{2,}", "", text)
             text = self.normalize_spaces(fix_text(text))
             fixed_text.append(text)
         return fixed_text
@@ -236,7 +241,8 @@ def combine_dollar_column(self, tables: List[TableResult]):
                         col < max_col,
                     ]
                 ):
-                    next_col_cells = [c for c in table.cells if c.col_id == col + 1]
+                    next_col_cells = [
+                        c for c in table.cells if c.col_id == col + 1]
                     next_col_rows = [c.row_id for c in next_col_cells]
                     col_rows = [c.row_id for c in col_cells]
                     if (
@@ -293,7 +299,8 @@ def split_combined_rows(self, tables: List[TableResult]):
                 # Cells in this row
                 # Deepcopy is because we do an in-place mutation later, and that can cause rows to shift to match rows in unique_rows
                 # making them be processed twice
-                row_cells = deepcopy([c for c in table.cells if c.row_id == row])
+                row_cells = deepcopy(
+                    [c for c in table.cells if c.row_id == row])
                 rowspans = [c.rowspan for c in row_cells]
                 line_lens = [
                     len(c.text_lines) if isinstance(c.text_lines, list) else 1
@@ -312,14 +319,16 @@ def split_combined_rows(self, tables: List[TableResult]):
                         len(rowspan_cells) == 0,
                         all([rowspan == 1 for rowspan in rowspans]),
                         all([line_len > 1 for line_len in line_lens]),
-                        all([line_len == line_lens[0] for line_len in line_lens]),
+                        all([line_len == line_lens[0]
+                             for line_len in line_lens]),
                     ]
                 )
                 line_lens_counter = Counter(line_lens)
                 counter_keys = sorted(list(line_lens_counter.keys()))
                 should_split_partial_row = all(
                     [
-                        len(row_cells) > 3,  # Only split if there are more than 3 cells
+                        # Only split if there are more than 3 cells
+                        len(row_cells) > 3,
                         len(rowspan_cells) == 0,
                         all([r == 1 for r in rowspans]),
                         len(line_lens_counter) == 2
@@ -420,8 +429,10 @@ def assign_text_to_cells(self, tables: List[TableResult], table_data: list):
             for k in cell_text:
                 # TODO: see if the text needs to be sorted (based on rotation)
                 text = cell_text[k]
-                assert all("text" in t for t in text), "All text lines must have text"
-                assert all("bbox" in t for t in text), "All text lines must have a bbox"
+                assert all(
+                    "text" in t for t in text), "All text lines must have text"
+                assert all(
+                    "bbox" in t for t in text), "All text lines must have a bbox"
                 table_cells[k].text_lines = text

     def assign_pdftext_lines(self, extract_blocks: list, filepath: str):
@@ -491,13 +502,16 @@ def get_detection_batch_size(self):
             return self.detection_batch_size
         elif settings.TORCH_DEVICE_MODEL == "cuda":
             return 10
+        elif settings.TORCH_DEVICE_MODEL == "mps":
+            # CPU fallback under MPS; modestly higher than plain CPU default
+            return 6
         return 4

     def get_table_rec_batch_size(self):
         if self.table_rec_batch_size is not None:
             return self.table_rec_batch_size
         elif settings.TORCH_DEVICE_MODEL == "mps":
-            return 6
+            return 8
         elif settings.TORCH_DEVICE_MODEL == "cuda":
             return 14
         return 6
@@ -506,7 +520,7 @@ def get_recognition_batch_size(self):
         if self.recognition_batch_size is not None:
             return self.recognition_batch_size
         elif settings.TORCH_DEVICE_MODEL == "mps":
-            return 32
+            return 24
         elif settings.TORCH_DEVICE_MODEL == "cuda":
             return 32
         return 32

marker/settings.py
Lines changed: 17 additions & 6 deletions

@@ -39,18 +39,29 @@ def TORCH_DEVICE_MODEL(self) -> str:
         if torch.cuda.is_available():
             return "cuda"

-        if torch.backends.mps.is_available():
-            return "mps"
-
-        return "cpu"
+        # Guard for older torch builds without .mps
+        if hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
+            return "mps"
+        return "cpu"

     @computed_field
     @property
     def MODEL_DTYPE(self) -> torch.dtype:
+        # Prefer bfloat16 on CUDA, float16 on MPS, float32 on CPU
         if self.TORCH_DEVICE_MODEL == "cuda":
             return torch.bfloat16
-        else:
-            return torch.float32
+        if self.TORCH_DEVICE_MODEL == "mps":
+            return torch.float16
+        return torch.float32
+
+    # Convenience helpers for cleaner branching elsewhere
+    @property
+    def USING_CUDA(self) -> bool:
+        return self.TORCH_DEVICE_MODEL == "cuda"
+
+    @property
+    def USING_MPS(self) -> bool:
+        return self.TORCH_DEVICE_MODEL == "mps"

     class Config:
         env_file = find_dotenv("local.env")
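With USING_CUDA and USING_MPS available, the batch-size getters can eventually drop their repeated string comparisons against TORCH_DEVICE_MODEL. A sketch of that follow-up (illustrative only; the getters in this commit still compare the device string directly, and the defaults below mirror LineBuilder's detection values):

    from marker.settings import settings

    def get_detection_batch_size(override: int | None = None) -> int:
        # Explicit config wins; otherwise 10 on CUDA, 6 under MPS (detection falls back to CPU), 4 on CPU
        if override is not None:
            return override
        if settings.USING_CUDA:
            return 10
        if settings.USING_MPS:
            return 6
        return 4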
