+        html_parts.append(f'<th>{header}</th>')
+        html_parts.append('</tr>')
+        html_parts.append(f'<td>{content}</td>')
+
+        html_parts.append('<caption>Molecular Data Table</caption>')
+
+        if not html:
+            html = "<p>Document conversion failed - no content extracted</p>"
" + + # Wrap in proper HTML structure for better PDF conversion + if not processed_html.startswith('{processed_html}" + + print(f"Final HTML for PDF conversion (length: {len(processed_html)}):", flush=True) + print(f"HTML content: {processed_html[:1000]}...", flush=True) + + # Use improved CSS with better heading styling and spacing + simple_css = ''' + @page { + size: A4; + margin: 2cm; + } + body { + font-family: serif; + font-size: 12pt; + line-height: 1.6; + } + h1 { + font-size: 20pt; + font-weight: bold; + margin: 1.5em 0 1em 0; + line-height: 1.3; + } + h2 { + font-size: 16pt; + font-weight: bold; + margin: 1.2em 0 0.8em 0; + line-height: 1.4; + padding: 0.2em 0; + } + h3 { + font-size: 14pt; + font-weight: bold; + margin: 1em 0 0.6em 0; + line-height: 1.3; + } + h4, h5, h6 { + font-size: 13pt; + font-weight: bold; + margin: 0.8em 0 0.4em 0; + line-height: 1.3; + } + p { + margin: 0.5em 0; + font-family: serif; + line-height: 1.6; + } + p:empty { + margin: 0.5em 0; + min-height: 1em; + } + strong { + font-weight: bold; + } + ''' + + try: + print("Starting HTML to PDF conversion...", flush=True) + html_doc = HTML(string=processed_html) + print("HTML document created successfully", flush=True) + + html_doc.write_pdf( + self.temp_pdf_path, + stylesheets=[CSS(string=simple_css)] + ) + print(f"PDF conversion completed: {self.temp_pdf_path}", flush=True) + + # Check if PDF file was created and has content + import os + if os.path.exists(self.temp_pdf_path): + pdf_size = os.path.getsize(self.temp_pdf_path) + print(f"Generated PDF size: {pdf_size} bytes", flush=True) + else: + print("ERROR: PDF file was not created!", flush=True) + + except Exception as e: + print(f"ERROR during HTML to PDF conversion: {e}", flush=True) + import traceback + traceback.print_exc() + + # Create a minimal fallback PDF + fallback_html = f"Original content length: {len(html)} characters
Error: {str(e)}
" + try: + HTML(string=fallback_html).write_pdf(self.temp_pdf_path, stylesheets=[CSS(string=simple_css)]) + print("Created fallback PDF", flush=True) + except Exception as fallback_error: + print(f"Even fallback PDF creation failed: {fallback_error}", flush=True) @staticmethod def _preprocess_base64_images(html_content): @@ -100,3 +242,31 @@ def convert_image(match): return "" # we ditch broken images as that breaks the PDF creation down the line return re.sub(pattern, convert_image, html_content) + + def _normalize_html(self, html): + """Normalize HTML to ensure consistent heading levels and preserve empty lines""" + import re + + # Convert all h2 tags to have consistent styling (force them to be treated equally) + # This helps prevent marker from incorrectly assigning different levels + html = re.sub(r'', html) + html = re.sub(r'
\s*
', '', html) + + # Handle cases where mammoth might create empty paragraphs with just whitespace + html = re.sub(r'
(\s*)
', r'', html) + + # Add proper spacing after headings + html = re.sub(r'', r'', html) + + # Ensure paragraphs have at least some content for proper rendering + html = re.sub(r'
\s*<\/p>', '
', html) + + # Add extra spacing for better readability + html = re.sub(r'', r'\n
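
Taken together, the document-provider changes reduce to a short pipeline: convert, sanitize, normalize, wrap, render. Below is a condensed sketch of that flow, assuming WeasyPrint's `HTML` and `CSS` classes as used above; `_docx_to_html` is a hypothetical stand-in for the mammoth conversion step (outside the hunks shown), and `self.simple_css` is assumed stored on the instance rather than local.

```python
# A minimal sketch of the DOCX -> HTML -> PDF flow above, not the exact method.
# _preprocess_base64_images and _normalize_html come from this diff;
# _docx_to_html and self.simple_css are illustrative assumptions.
from weasyprint import CSS, HTML

def convert(self, filepath: str) -> None:
    html = self._docx_to_html(filepath)  # mammoth conversion (hypothetical helper)
    if not html:
        html = "<p>Document conversion failed - no content extracted</p>"
    html = self._preprocess_base64_images(html)  # drop broken inline images
    processed_html = self._normalize_html(html)  # heading/paragraph cleanup
    # Wrap fragments so WeasyPrint sees a complete document
    if not processed_html.startswith('<html'):
        processed_html = f"<html><body>{processed_html}</body></html>"
    HTML(string=processed_html).write_pdf(
        self.temp_pdf_path, stylesheets=[CSS(string=self.simple_css)]
    )
```
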
+
+        print(f"Normalized HTML preview: {html[:800]}...", flush=True)
+
+        return html
diff --git a/marker/providers/pdf.py b/marker/providers/pdf.py
index bba058d9..a2643530 100644
--- a/marker/providers/pdf.py
+++ b/marker/providers/pdf.py
@@ -43,7 +43,7 @@ class PdfProvider(BaseProvider):
     flatten_pdf: Annotated[
         bool,
         "Whether to flatten the PDF structure.",
-    ] = True
+    ] = True  # default; may be overridden via config in __init__
     force_ocr: Annotated[
         bool,
         "Whether to force OCR on the whole document.",
@@ -82,6 +82,10 @@ def __init__(self, filepath: str, config=None):
 
         self.filepath = filepath
 
+        # If flatten_pdf is specified in the config, it overrides the default value
+        if config and 'flatten_pdf' in config:
+            self.flatten_pdf = config['flatten_pdf']
+
         with self.get_doc() as doc:
             self.page_count = len(doc)
             self.page_lines: ProviderPageLines = {i: [] for i in range(len(doc))}
@@ -110,8 +114,9 @@ def get_doc(self):
 
             # Must be called on the parent pdf, before retrieving pages to render correctly
             if self.flatten_pdf:
+                print("[PdfProvider] flatten_pdf is True, calling init_forms", flush=True)
                 doc.init_forms()
-
+
             yield doc
         finally:
             if doc:
@@ -397,6 +402,7 @@ def _render_image(
         page = pdf[idx]
         image = page.render(scale=dpi / 72, draw_annots=False).to_pil()
         image = image.convert("RGB")
+        print(f"[PdfProvider] rendered page {idx} image: {image.size}", flush=True)
         return image
 
     def get_images(self, idxs: List[int], dpi: int) -> List[Image.Image]:
diff --git a/marker/providers/powerpoint.py b/marker/providers/powerpoint.py
index 4c7e6987..8df8bf43 100644
--- a/marker/providers/powerpoint.py
+++ b/marker/providers/powerpoint.py
@@ -75,9 +75,14 @@ def convert_pptx_to_pdf(self, filepath):
 
             # Process shapes in the slide
             for shape in slide.shapes:
+                # Reading shape_type can raise on exotic shapes, so do it defensively once
+                try:
+                    shape_type = shape.shape_type
+                except Exception:
+                    print(traceback.format_exc())
+                    shape_type = None
+
                 # If shape is a group shape, we recursively handle all grouped shapes
-                if shape.shape_type == MSO_SHAPE_TYPE.GROUP:
+                if shape_type == MSO_SHAPE_TYPE.GROUP:
                     html_parts.append(self._handle_group(shape))
                     continue
 
                 # If shape is a table
                 if shape.has_table:
                     html_parts.append(self._handle_table(shape))
                     continue
 
                 # If shape is a picture
-                if shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
+                if shape_type == MSO_SHAPE_TYPE.PICTURE:
                     html_parts.append(self._handle_image(shape))
                     continue
 
                 # If shape has text
                 if hasattr(shape, "text") and shape.text is not None:
@@ -115,17 +125,27 @@ def _handle_group(self, group_shape) -> str:
 
         group_parts = []
         for shape in group_shape.shapes:
-            if shape.shape_type == MSO_SHAPE_TYPE.GROUP:
+            try:
+                shape_type = shape.shape_type
+            except Exception:
+                print(traceback.format_exc())
+                shape_type = None
+
+            if shape_type == MSO_SHAPE_TYPE.GROUP:
                 group_parts.append(self._handle_group(shape))
                 continue
 
             if shape.has_table:
                 group_parts.append(self._handle_table(shape))
                 continue
 
-            if shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
+            if shape_type == MSO_SHAPE_TYPE.PICTURE:
                 group_parts.append(self._handle_image(shape))
                 continue
 
             if hasattr(shape, "text"):
                 if shape.has_text_frame:
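
Both PowerPoint loops now guard the `shape_type` read, since python-pptx can raise when a shape's type is unrecognized. If the pattern spreads further, a small helper would keep it in one place; this is a suggested refactor, not part of the patch:

```python
import traceback

from pptx.enum.shapes import MSO_SHAPE_TYPE

def safe_shape_type(shape):
    """Read shape.shape_type defensively; return None when python-pptx raises."""
    try:
        return shape.shape_type
    except Exception:
        print(traceback.format_exc())
        return None

# Usage inside either loop:
# if safe_shape_type(shape) == MSO_SHAPE_TYPE.GROUP:
#     html_parts.append(self._handle_group(shape))
```
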
diff --git a/marker/providers/registry.py b/marker/providers/registry.py
index 4a8e969e..23aee12c 100644
--- a/marker/providers/registry.py
+++ b/marker/providers/registry.py
@@ -56,7 +56,21 @@ def provider_from_ext(filepath: str):
     return PdfProvider
 
 
-def provider_from_filepath(filepath: str):
+def provider_from_filepath(filepath: str, file_type: str = None):
+    # An explicit file_type short-circuits detection; None preserves the old behavior
+    if file_type in ('jpg', 'jpeg', 'png'):
+        return ImageProvider
+    elif file_type == 'pdf':
+        return PdfProvider
+    elif file_type in ('docx', 'doc'):
+        return DocumentProvider
+    elif file_type in ('pptx', 'ppt'):
+        return PowerPointProvider
+    elif file_type == 'epub':
+        return EpubProvider
+    elif file_type == 'html':
+        return HTMLProvider
+
+    # If file_type is not explicitly handled, fall back to content-based detection
     if filetype.image_match(filepath) is not None:
         return ImageProvider
     if file_match(filepath, load_matchers("pdf")) is not None:
diff --git a/marker/renderers/__init__.py b/marker/renderers/__init__.py
index 2a8cbe77..a6dccb1e 100644
--- a/marker/renderers/__init__.py
+++ b/marker/renderers/__init__.py
@@ -13,9 +13,17 @@
 from marker.settings import settings
 from marker.util import assign_config
 
+# Import OSS uploader; fall back gracefully if it is unavailable
+try:
+    from marker.oss_uploader import S3Client
+    s3_client = S3Client()
+    S3_AVAILABLE = True
+except Exception:
+    s3_client = None
+    S3_AVAILABLE = False
+
 
 class BaseRenderer:
-    image_blocks: Annotated[Tuple[BlockTypes, ...], "The block types to consider as images."] = (BlockTypes.Picture, BlockTypes.Figure)
+    image_blocks: Annotated[
+        Tuple[BlockTypes, ...],
+        "The block types to consider as images."
+    ] = (BlockTypes.Picture, BlockTypes.Figure, BlockTypes.Molecule, BlockTypes.MoleculeTable)
     extract_images: Annotated[bool, "Extract images from the document."] = True
     image_extraction_mode: Annotated[
         Literal["lowres", "highres"],
@@ -31,15 +39,83 @@ def __call__(self, document):
         raise NotImplementedError
 
     def extract_image(self, document: Document, image_id, to_base64=False):
+        print(f"🖼️ [DEBUG] BaseRenderer.extract_image() called for {image_id}")
+
         image_block = document.get_block(image_id)
+        if image_block is None:
+            print(f"❌ [DEBUG] Image block not found for {image_id}")
+            return None
+
+        print(f"✅ [DEBUG] Found image block: {type(image_block).__name__} (type: {image_block.block_type})")
+
         cropped = image_block.get_image(document, highres=self.image_extraction_mode == "highres")
+        if cropped is None:
+            print(f"❌ [DEBUG] Failed to get image from block {image_id}")
+            return None
+
+        print(f"✅ [DEBUG] Got cropped image: {cropped.size}")
 
         if to_base64:
             image_buffer = io.BytesIO()
             cropped.save(image_buffer, format=settings.OUTPUT_IMAGE_FORMAT)
             cropped = base64.b64encode(image_buffer.getvalue()).decode(settings.OUTPUT_ENCODING)
+            print(f"✅ [DEBUG] Converted to base64 (length: {len(cropped)})")
+
         return cropped
 
+    def upload_image_to_s3(self, image, image_name, image_type="image", metadata=None):
+        """
+        Upload image to S3 if available, otherwise return None
+
+        Args:
+            image: PIL Image object
+            image_name: Original image name
+            image_type: Type of image (image, molecule, etc.)
+            metadata: Additional metadata (currently not used in S3 implementation)
+
+        Returns:
+            S3 upload result dict with 'url' and 'key', or None
+        """
+        print(f"☁️ [DEBUG] upload_image_to_s3() called: {image_name} (type: {image_type})")
+
+        if image is None:
+            print("❌ [DEBUG] Image is None, cannot upload")
+            return None
+
+        print(f"🖼️ [DEBUG] Image size: {image.size}")
+
+        if S3_AVAILABLE and s3_client:
+            try:
+                print("✅ [DEBUG] S3 client available, starting upload...")
+
+                # Convert PIL Image to bytes
+                img_buffer = io.BytesIO()
+                image.save(img_buffer, format='JPEG', quality=85, optimize=True)
+                image_data = img_buffer.getvalue()
+
+                print(f"📦 [DEBUG] Image converted to bytes: {len(image_data)} bytes")
+
+                # Upload to S3
+                result = s3_client.s3_upload_from_file(image_name, image_data)
+
+                if result and 'url' in result:
+                    print(f"✅ [DEBUG] Image uploaded successfully: {result['url']}")
+                    return result
+                else:
+                    print(f"❌ [DEBUG] Failed to upload image: {image_name}, result: {result}")
+                    return None
+
+            except Exception as e:
+                print(f"❌ [DEBUG] Failed to upload image to S3: {e}")
+                import traceback
+                traceback.print_exc()
+                return None
+        else:
+            print(f"❌ [DEBUG] S3 not available (S3_AVAILABLE: {S3_AVAILABLE}, s3_client: {s3_client is not None})")
+
+        return None
+
     @staticmethod
     def merge_consecutive_math(html, tag="math"):
         if not html:
@@ -53,36 +129,42 @@ def merge_consecutive_math(html, tag="math"):
 
     @staticmethod
     def merge_consecutive_tags(html, tag):
-        if not html:
-            return html
-
-        def replace_whitespace(match):
-            whitespace = match.group(1)
-            if len(whitespace) == 0:
-                return ""
-            else:
-                return " "
-
-        pattern = fr'</{tag}>(\s*)<{tag}>'
-
-        while True:
-            new_merged = re.sub(pattern, replace_whitespace, html)
-            if new_merged == html:
-                break
-            html = new_merged
-
+        if not html:
+            return html
+        # Single pass: drop the closing/opening pair, keep any whitespace between them
+        pattern = fr'</{tag}>(\s*)<{tag}>'
+        html = re.sub(pattern, r'\1', html)
         return html
 
+    def get_page_footer(self, page):
+        try:
+            for block in page.children:
+                if block.block_type == BlockTypes.PageFooter:
+                    return block.raw_text(page)
+        except Exception as e:
+            print('get_page_footer', e, flush=True)
+        return ''
+
+    def get_page_header(self, page):
+        try:
+            for block in page.children:
+                if block.block_type == BlockTypes.PageHeader:
+                    return block.raw_text(page)
+        except Exception as e:
+            print('get_page_header', e, flush=True)
+        return ''
+
     def generate_page_stats(self, document: Document, document_output):
         page_stats = []
         for page in document.pages:
             block_counts = Counter([str(block.block_type) for block in page.children]).most_common()
             block_metadata = page.aggregate_block_metadata()
+            page_header = self.get_page_header(page)
+            page_footer = self.get_page_footer(page)
             page_stats.append({
                 "page_id": page.page_id,
                 "text_extraction_method": page.text_extraction_method,
                 "block_counts": block_counts,
-                "block_metadata": block_metadata.model_dump()
+                "block_metadata": block_metadata.model_dump(),
+                "page_header": page_header,
+                "page_footer": page_footer
             })
         return page_stats
 
@@ -113,12 +195,52 @@ def extract_block_html(self, document: Document, block_output: BlockOutput):
                     break
 
             if ref_block_id.block_type in self.image_blocks and self.extract_images:
-                images[ref_block_id] = self.extract_image(document, ref_block_id, to_base64=True)
+                image = self.extract_image(document, ref_block_id, to_base64=False)
+                image_name = f"{ref_block_id.to_path()}.{settings.OUTPUT_IMAGE_FORMAT.lower()}"
+
+                # Try to upload to S3 first
+                image_block = document.get_block(ref_block_id)
+                metadata = {}
+
+                # Check if it's a molecule image
+                if ref_block_id.block_type == BlockTypes.Molecule:
+                    if hasattr(image_block, 'structure_data') and image_block.structure_data:
+                        metadata = image_block.structure_data
+                    s3_result = self.upload_image_to_s3(image, image_name, "molecule", metadata)
+                else:
+                    s3_result = self.upload_image_to_s3(image, image_name, "image", metadata)
+
+                if s3_result:
+                    # Store S3 URL information
+                    images[ref_block_id] = {"url": s3_result['url'], "type": "s3", "original_name": image_name}
+                else:
+                    # Fall back to base64
+                    images[ref_block_id] = self.extract_image(document, ref_block_id, to_base64=True)
             else:
                 images.update(sub_images)
                 ref.replace_with(BeautifulSoup(content, 'html.parser'))
 
         if block_output.id.block_type in self.image_blocks and self.extract_images:
-            images[block_output.id] = self.extract_image(document, block_output.id, to_base64=True)
+            image = self.extract_image(document, block_output.id, to_base64=False)
+            image_name = f"{block_output.id.to_path()}.{settings.OUTPUT_IMAGE_FORMAT.lower()}"
+
+            # Try to upload to S3 first
+            image_block = document.get_block(block_output.id)
+            metadata = {}
+
+            # Check if it's a molecule image
+            if block_output.id.block_type == BlockTypes.Molecule:
+                if hasattr(image_block, 'structure_data') and image_block.structure_data:
+                    metadata = image_block.structure_data
+                s3_result = self.upload_image_to_s3(image, image_name, "molecule", metadata)
+            else:
+                s3_result = self.upload_image_to_s3(image, image_name, "image", metadata)
+
+            if s3_result:
+                # Store S3 URL information
+                images[block_output.id] = {"url": s3_result['url'], "type": "s3", "original_name": image_name}
+            else:
+                # Fall back to base64
+                images[block_output.id] = self.extract_image(document, block_output.id, to_base64=True)
 
         return str(soup), images
 
diff --git a/marker/renderers/html.py b/marker/renderers/html.py
index afe76c2b..170d3d2f 100644
--- a/marker/renderers/html.py
+++ b/marker/renderers/html.py
@@ -18,6 +18,11 @@
 # Suppress DecompressionBombError
 Image.MAX_IMAGE_PIXELS = None
 
+# Import OSS uploader; fall back gracefully if it is unavailable
+try:
+    from marker.oss_uploader import S3Client
+    s3_client = S3Client()
+    S3_AVAILABLE = True
+except Exception:
+    s3_client = None
+    S3_AVAILABLE = False
+
 
 class HTMLOutput(BaseModel):
     html: str
@@ -43,12 +48,48 @@ def extract_image(self, document, image_id):
         cropped = image_block.get_image(document, highres=self.image_extraction_mode == "highres")
         return cropped
 
+    def upload_image_to_s3(self, image, image_name, image_type="image", metadata=None):
+        """
+        Upload image to S3 if available, otherwise return None
+
+        Args:
+            image: PIL Image object
+            image_name: Original image name
+            image_type: Type of image (image, molecule, etc.)
+            metadata: Additional metadata (currently not used in S3 implementation)
+
+        Returns:
+            S3 upload result dict with 'url' and 'key', or None
+        """
+        if S3_AVAILABLE and s3_client:
+            try:
+                # Convert PIL Image to bytes
+                import io
+                img_buffer = io.BytesIO()
+                image.save(img_buffer, format='JPEG', quality=85, optimize=True)
+                image_data = img_buffer.getvalue()
+
+                # Upload to S3
+                result = s3_client.s3_upload_from_file(image_name, image_data)
+
+                if result and 'url' in result:
+                    print(f"✅ Image uploaded successfully: {result['url']}")
+                    return result
+                else:
+                    print(f"❌ Failed to upload image: {image_name}")
+                    return None
+
+            except Exception as e:
+                print(f"Failed to upload image to S3: {e}")
+                return None
+        return None
+
     def extract_html(self, document, document_output, level=0):
         soup = BeautifulSoup(document_output.html, 'html.parser')
-
         content_refs = soup.find_all('content-ref')
         ref_block_id = None
         images = {}
+
         for ref in content_refs:
             src = ref.get('src')
             sub_images = {}
@@ -64,8 +105,99 @@
             if self.extract_images:
                 image = self.extract_image(document, ref_block_id)
                 image_name = f"{ref_block_id.to_path()}.{settings.OUTPUT_IMAGE_FORMAT.lower()}"
-                images[image_name] = image
-                ref.replace_with(BeautifulSoup(f"<p>{content}</p><img src='{image_name}'>", 'html.parser'))
+
+                # Try to upload to S3 first; fall back to a local image reference
+                s3_result = self.upload_image_to_s3(image, image_name)
+                if s3_result:
+                    img_tag = f"<img src='{s3_result['url']}'>"
+                else:
+                    images[image_name] = image
+                    img_tag = f"<img src='{image_name}'>"
+
+                replacement_html = f"<p>{content}</p>{img_tag}" if content else f"{img_tag}"
+                # replacement_html = f"{img_tag}"
+
+                ref.replace_with(BeautifulSoup(replacement_html, 'html.parser'))
             else:
                 # This will be the image description if using llm mode, or empty if not
                 ref.replace_with(BeautifulSoup(f"<p>{content}</p>", 'html.parser'))
@@ -94,6 +226,7 @@