You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
from magic_pdf.data.dataset import ImageDataset
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.config.enums import SupportedPdfParseMethod
from magic_pdf.utils.pdf_remove_watermark import PdfRemoveWatermark
# args
img_file_name = "/Users/admin/Desktop/shuomingshu_20.png"  # replace with the real image path
# Use splitext so that only the final extension is dropped, even when the
# file name itself contains additional dots.
name_without_suff = os.path.splitext(os.path.basename(img_file_name))[0]
Description of the bug | 错误描述
左右布局的PDF识别顺序错误
应该是左侧为完整的段落
How to reproduce the bug | 如何复现
import os
from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
from magic_pdf.data.dataset import ImageDataset
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.config.enums import SupportedPdfParseMethod
from magic_pdf.utils.pdf_remove_watermark import PdfRemoveWatermark
# Reproduction script: run the magic-pdf OCR pipeline on a single image and
# dump the model/layout/span visualisations plus markdown and content list.

# args
img_file_name = "/Users/admin/Desktop/shuomingshu_20.png"  # replace with the real image path
# Use splitext so that only the final extension is dropped, even when the
# file name itself contains additional dots.
name_without_suff = os.path.splitext(os.path.basename(img_file_name))[0]

# prepare env
local_image_dir, local_md_dir = "./output/images", "./output"
# Directory name of the image folder, passed to the dump_* calls below.
image_dir = os.path.basename(local_image_dir)
os.makedirs(local_image_dir, exist_ok=True)
image_writer = FileBasedDataWriter(local_image_dir)
md_writer = FileBasedDataWriter(local_md_dir)

# read bytes
reader1 = FileBasedDataReader("")
img_bytes = reader1.read(img_file_name)  # read the image content
# NOTE: the original script also called
# `PdfRemoveWatermark().text_pdf(pdf_file_name)` here, but `pdf_file_name` is
# never defined in this script (NameError) and the result was never used, so
# the call has been dropped.

# proc
# Create Dataset Instance
ds = ImageDataset(img_bytes)

# inference
infer_result = ds.apply(doc_analyze, ocr=True)

# pipeline
pipe_result = infer_result.pipe_ocr_mode(image_writer)

# draw model result on each page
infer_result.draw_model(os.path.join(local_md_dir, f"{name_without_suff}_model.pdf"))

# draw layout result on each page
pipe_result.draw_layout(os.path.join(local_md_dir, f"{name_without_suff}_layout.pdf"))

# draw spans result on each page
pipe_result.draw_span(os.path.join(local_md_dir, f"{name_without_suff}_spans.pdf"))

# dump markdown
pipe_result.dump_md(md_writer, f"{name_without_suff}.md", image_dir)

# dump content list
pipe_result.dump_content_list(md_writer, f"{name_without_suff}_content_list.json", image_dir)
Operating system | 操作系统
MacOS
Python version | Python 版本
3.10
Software version | 软件版本 (magic-pdf --version)
1.0.x
Device mode | 设备模式
cpu
The text was updated successfully, but these errors were encountered: