Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions _tmp.html
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
<html><body><h1>Test</h1><p>Hello Docling</p></body></html>
75 changes: 61 additions & 14 deletions docling/cli/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,9 @@
FormatOption,
PdfFormatOption,
)
from docling.pipeline.factory import get_pipeline_factory
from docling.models.factories import get_ocr_factory
from docling.postprocess.factory import get_postprocessor_factory
from docling.pipeline.asr_pipeline import AsrPipeline
from docling.pipeline.vlm_pipeline import VlmPipeline

Expand Down Expand Up @@ -325,6 +327,12 @@ def convert( # noqa: C901
ProcessingPipeline,
typer.Option(..., help="Choose the pipeline to process PDF or image files."),
] = ProcessingPipeline.STANDARD,
pipeline_plugin: Annotated[
Optional[str],
typer.Option(
..., help="Optional external pipeline kind to use (from plugins)",
),
] = None,
vlm_model: Annotated[
VlmModelType,
typer.Option(..., help="Choose the VLM model to use with PDF or image files."),
Expand Down Expand Up @@ -426,6 +434,18 @@ def convert( # noqa: C901
output: Annotated[
Path, typer.Option(..., help="Output directory where results are saved.")
] = Path("."),
postprocess: Annotated[
Optional[str],
typer.Option(
..., help="Optional postprocessor kind to run after conversion (from plugins)",
),
] = None,
postprocess_params: Annotated[
Optional[str],
typer.Option(
..., help="JSON string of parameters passed to the selected postprocessor.",
),
] = None,
verbose: Annotated[
int,
typer.Option(
Expand Down Expand Up @@ -575,7 +595,14 @@ def convert( # noqa: C901

format_options: Dict[InputFormat, FormatOption] = {}

if pipeline == ProcessingPipeline.STANDARD:
if pipeline_plugin is not None:
# Use external pipeline for PDF/IMAGE via plugin factory
pp_factory = get_pipeline_factory(allow_external_plugins=allow_external_plugins)
plugin_options = pp_factory.create_options(kind=pipeline_plugin)
plugin_cls = pp_factory.classes[type(plugin_options)]
pdf_format_option = PdfFormatOption(pipeline_cls=plugin_cls, pipeline_options=plugin_options)
format_options = {InputFormat.PDF: pdf_format_option, InputFormat.IMAGE: pdf_format_option}
elif pipeline == ProcessingPipeline.STANDARD:
pipeline_options = PdfPipelineOptions(
allow_external_plugins=allow_external_plugins,
enable_remote_services=enable_remote_services,
Expand Down Expand Up @@ -716,19 +743,39 @@ def convert( # noqa: C901
input_doc_paths, headers=parsed_headers, raises_on_error=abort_on_error
)

output.mkdir(parents=True, exist_ok=True)
export_documents(
conv_results,
output_dir=output,
export_json=export_json,
export_html=export_html,
export_html_split_page=export_html_split_page,
show_layout=show_layout,
export_md=export_md,
export_txt=export_txt,
export_doctags=export_doctags,
image_export_mode=image_export_mode,
)
# If a postprocessor is requested, invoke it; otherwise, perform exports
if postprocess is not None:
import json as _json
pp_factory = get_postprocessor_factory(
allow_external_plugins=allow_external_plugins
)
params = {}
if postprocess_params:
try:
params = _json.loads(postprocess_params)
except Exception as err:
err_console.print(
f"[red]Error: Invalid JSON in --postprocess-params: {err}[/red]"
)
raise typer.Abort()
postprocess_options = pp_factory.create_options(kind=postprocess)
postprocessor = pp_factory.create_instance(options=postprocess_options)
# Consume the generator in the postprocessor
postprocessor.process(conv_results, **params)
else:
output.mkdir(parents=True, exist_ok=True)
export_documents(
conv_results,
output_dir=output,
export_json=export_json,
export_html=export_html,
export_html_split_page=export_html_split_page,
show_layout=show_layout,
export_md=export_md,
export_txt=export_txt,
export_doctags=export_doctags,
image_export_mode=image_export_mode,
)

end_time = time.time() - start_time

Expand Down
1 change: 1 addition & 0 deletions docling/document_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@
)
from docling.exceptions import ConversionError
from docling.pipeline.asr_pipeline import AsrPipeline
from docling.postprocess.base_postprocessor import BasePostprocessor
from docling.pipeline.base_pipeline import BasePipeline
from docling.pipeline.simple_pipeline import SimplePipeline
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
Expand Down
22 changes: 22 additions & 0 deletions docling/pipeline/factory.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
import logging
from functools import lru_cache

from docling.models.factories.base_factory import BaseFactory
from docling.pipeline.base_pipeline import BasePipeline

# Module-level logger; get_pipeline_factory() uses it to report the registered pipeline kinds.
logger = logging.getLogger(__name__)


class PipelineFactory(BaseFactory[BasePipeline]):
    """Factory specialised for ``BasePipeline`` implementations.

    A thin subclass of ``BaseFactory`` whose only job is to pin the first
    constructor argument to the string ``"pipelines"``.

    NOTE(review): presumably that string names the plugin hook the base
    factory looks up when loading plugins -- confirm against ``BaseFactory``.
    """

    def __init__(self, *args, **kwargs):
        """Create the factory, forwarding any extra arguments to ``BaseFactory``."""
        super().__init__("pipelines", *args, **kwargs)


@lru_cache
def get_pipeline_factory(allow_external_plugins: bool = False) -> PipelineFactory:
    """Build a ``PipelineFactory`` and populate it from plugins.

    The function is wrapped in ``functools.lru_cache``, so repeated calls made
    with identical arguments return the same factory object instead of
    loading plugins again.

    Args:
        allow_external_plugins: Forwarded unchanged to
            ``PipelineFactory.load_from_plugins``.

    Returns:
        The factory after plugin loading has run.
    """
    registry = PipelineFactory()
    registry.load_from_plugins(allow_external_plugins=allow_external_plugins)
    # Log what ended up registered to make plugin discovery easy to debug.
    logger.info("Registered pipelines: %r", registry.registered_kind)
    return registry


22 changes: 22 additions & 0 deletions docling/postprocess/base_postprocessor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
import abc
from collections.abc import Iterable
from typing import Any

from docling.datamodel.document import ConversionResult


class BasePostprocessor(abc.ABC):
    """Abstract contract for components that act on finished conversions.

    Typical implementations push results into a vector store or write them
    to a custom sink. When a postprocessor is selected on the docling CLI,
    ``process`` is called in place of the built-in exporters, so an
    implementation is responsible for whatever output it wants to produce.

    NOTE(review): the CLI instantiates postprocessors through the factory's
    ``create_options`` / ``create_instance``; this class declares no options
    hook of its own -- confirm what ``BaseFactory`` expects of registered
    classes.
    """

    @abc.abstractmethod
    def process(self, conv_results: Iterable[ConversionResult], **kwargs: Any) -> Any:
        """Handle every conversion result yielded by ``conv_results``.

        Args:
            conv_results: Conversion results to act on. This may be a
                one-shot iterator, so iterate over it only once.
            **kwargs: Implementation-specific settings. The docling CLI
                passes the decoded ``--postprocess-params`` JSON here.

        Returns:
            Whatever the implementation chooses; may be ``None``.

        Raises:
            NotImplementedError: If a subclass delegates to this base
                implementation.
        """
        raise NotImplementedError


22 changes: 22 additions & 0 deletions docling/postprocess/factory.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
import logging
from functools import lru_cache

from docling.models.factories.base_factory import BaseFactory
from docling.postprocess.base_postprocessor import BasePostprocessor

# Module-level logger; get_postprocessor_factory() uses it to report the registered postprocessor kinds.
logger = logging.getLogger(__name__)


class PostprocessorFactory(BaseFactory[BasePostprocessor]):
    """Factory specialised for ``BasePostprocessor`` implementations.

    A thin subclass of ``BaseFactory`` whose only job is to pin the first
    constructor argument to the string ``"postprocessors"``.

    NOTE(review): presumably that string names the plugin hook the base
    factory looks up when loading plugins -- confirm against ``BaseFactory``.
    """

    def __init__(self, *args, **kwargs):
        """Create the factory, forwarding any extra arguments to ``BaseFactory``."""
        super().__init__("postprocessors", *args, **kwargs)


@lru_cache
def get_postprocessor_factory(
    allow_external_plugins: bool = False,
) -> PostprocessorFactory:
    """Build a ``PostprocessorFactory`` and populate it from plugins.

    The function is wrapped in ``functools.lru_cache``, so repeated calls made
    with identical arguments return the same factory object instead of
    loading plugins again.

    Args:
        allow_external_plugins: Forwarded unchanged to
            ``PostprocessorFactory.load_from_plugins``.

    Returns:
        The factory after plugin loading has run.
    """
    registry = PostprocessorFactory()
    registry.load_from_plugins(allow_external_plugins=allow_external_plugins)
    # Log what ended up registered to make plugin discovery easy to debug.
    logger.info("Registered postprocessors: %r", registry.registered_kind)
    return registry