Skip to content

Multi-page document to single page document. #266

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 1 commit into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 8 additions & 2 deletions pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@
from pymupdf4llm.helpers.get_text_lines import get_raw_lines, is_white
from pymupdf4llm.helpers.multi_column import column_boxes
from pymupdf4llm.helpers.progress import ProgressBar
from pymupdf4llm.helpers.single_long_page import SingleLongPageDocument
from dataclasses import dataclass
from collections import defaultdict

Expand Down Expand Up @@ -321,6 +322,7 @@ def to_markdown(
extract_words=False,
show_progress=False,
use_glyphs=False,
parse_single_long_page=False,
) -> str:
"""Process the document and return the text of the selected pages.

Expand Down Expand Up @@ -1137,9 +1139,14 @@ def get_page_output(
if use_glyphs:
textflags |= mupdf.FZ_STEXT_USE_GID_FOR_UNKNOWN_UNICODE

if show_progress:
if show_progress & parse_single_long_page == False:
print(f"Processing {FILENAME}...")
pages = ProgressBar(pages)
if parse_single_long_page:
doc = SingleLongPageDocument(doc)
pages = range(doc.page_count)
if show_progress:
print(f"Processing {FILENAME} page 1 of 1 ...")
for pno in pages:
parms = get_page_output(
doc, pno, margins, textflags, FILENAME, IGNORE_IMAGES, IGNORE_GRAPHICS
Expand Down Expand Up @@ -1191,7 +1198,6 @@ def extract_images_on_page_simple(page, parms, image_size_limit):

return img_info


def filter_small_images(page, parms, image_size_limit):
img_info = []
for item in page.get_image_info():
Expand Down
53 changes: 53 additions & 0 deletions pymupdf4llm/pymupdf4llm/helpers/single_long_page.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
import pymupdf

class _SingleLongPage:
    """Tile every page of a PDF onto one large page of a new document.

    Source pages are laid out left to right, ``pages_per_row`` per row, on a
    grid whose cell size is taken from the first source page.  The resulting
    one-page document is stored in ``_doc``.
    """

    # Upper bound (exclusive) for either page dimension, in points.  Per the
    # original author's note, MuPDF limits pages to about 32,767 x 32,767.
    _MAX_PAGE_EXTENT = 32767

    _doc: pymupdf.Document

    def __init__(self, doc: pymupdf.Document, pages_per_row=1):
        """Create the combined single-page document from ``doc``.

        Args:
            doc: Source document whose pages are copied.  All pages are
                assumed to share the first page's width and height.
            pages_per_row: Number of source pages placed side by side in
                each row of the combined page (must be >= 1).

        Raises:
            ValueError: If ``pages_per_row`` is less than 1, if ``doc`` has
                no pages, or if the combined page would reach the maximum
                page size in either dimension.
        """
        if pages_per_row < 1:
            raise ValueError("pages_per_row must be at least 1")

        page_count = doc.page_count
        if page_count < 1:
            raise ValueError("Document has no pages")

        # The first page defines the grid cell size for the whole layout.
        first_rect = doc[0].rect
        page_width = first_rect.width
        page_height = first_rect.height

        # Ceiling division: a partially filled last row still needs a row.
        rows = (page_count + pages_per_row - 1) // pages_per_row

        total_width = page_width * min(pages_per_row, page_count)
        total_height = page_height * rows

        # Validate both dimensions before allocating the output document so
        # a failure does not leave an unused open document behind.
        if total_height >= self._MAX_PAGE_EXTENT:
            raise ValueError("Page height exceeds maximum of 32,767 points")
        if total_width >= self._MAX_PAGE_EXTENT:
            raise ValueError("Page width exceeds maximum of 32,767 points")

        self._doc = pymupdf.open()
        single_page = self._doc.new_page(width=total_width, height=total_height)

        # Copy each source page into its grid cell on the combined page.
        for i in range(page_count):
            row, col = divmod(i, pages_per_row)
            x = col * page_width
            y = row * page_height

            src_rect = doc[i].rect
            # Both far corners are offset by the cell origin; using the bare
            # page width for x1 would collapse the rectangle for col > 0.
            target = pymupdf.Rect(
                x, y, x + src_rect.width, y + src_rect.height
            )
            single_page.show_pdf_page(target, doc, i)

def SingleLongPageDocument(doc:pymupdf.Document, pages_per_row=1) -> pymupdf.Document:
    """Return a new one-page document containing all pages of ``doc``.

    Thin factory around ``_SingleLongPage``: the helper object is built from
    ``doc`` and ``pages_per_row`` and only the document it holds is returned.
    """
    return _SingleLongPage(doc, pages_per_row)._doc