diff --git a/_tmp.html b/_tmp.html new file mode 100644 index 000000000..22e27e9a3 --- /dev/null +++ b/_tmp.html @@ -0,0 +1 @@ +
Hello Docling
\ No newline at end of file diff --git a/docling/cli/main.py b/docling/cli/main.py index 82c57efb4..93349e932 100644 --- a/docling/cli/main.py +++ b/docling/cli/main.py @@ -74,7 +74,9 @@ FormatOption, PdfFormatOption, ) +from docling.pipeline.factory import get_pipeline_factory from docling.models.factories import get_ocr_factory +from docling.postprocess.factory import get_postprocessor_factory from docling.pipeline.asr_pipeline import AsrPipeline from docling.pipeline.vlm_pipeline import VlmPipeline @@ -325,6 +327,12 @@ def convert( # noqa: C901 ProcessingPipeline, typer.Option(..., help="Choose the pipeline to process PDF or image files."), ] = ProcessingPipeline.STANDARD, + pipeline_plugin: Annotated[ + Optional[str], + typer.Option( + ..., help="Optional external pipeline kind to use (from plugins)", + ), + ] = None, vlm_model: Annotated[ VlmModelType, typer.Option(..., help="Choose the VLM model to use with PDF or image files."), @@ -426,6 +434,18 @@ def convert( # noqa: C901 output: Annotated[ Path, typer.Option(..., help="Output directory where results are saved.") ] = Path("."), + postprocess: Annotated[ + Optional[str], + typer.Option( + ..., help="Optional postprocessor kind to run after conversion (from plugins)", + ), + ] = None, + postprocess_params: Annotated[ + Optional[str], + typer.Option( + ..., help="JSON string of parameters passed to the selected postprocessor.", + ), + ] = None, verbose: Annotated[ int, typer.Option( @@ -575,7 +595,14 @@ def convert( # noqa: C901 format_options: Dict[InputFormat, FormatOption] = {} - if pipeline == ProcessingPipeline.STANDARD: + if pipeline_plugin is not None: + # Use external pipeline for PDF/IMAGE via plugin factory + pp_factory = get_pipeline_factory(allow_external_plugins=allow_external_plugins) + plugin_options = pp_factory.create_options(kind=pipeline_plugin) + plugin_cls = pp_factory.classes[type(plugin_options)] + pdf_format_option = PdfFormatOption(pipeline_cls=plugin_cls, 
pipeline_options=plugin_options) + format_options = {InputFormat.PDF: pdf_format_option, InputFormat.IMAGE: pdf_format_option} + elif pipeline == ProcessingPipeline.STANDARD: pipeline_options = PdfPipelineOptions( allow_external_plugins=allow_external_plugins, enable_remote_services=enable_remote_services, @@ -716,19 +743,39 @@ def convert( # noqa: C901 input_doc_paths, headers=parsed_headers, raises_on_error=abort_on_error ) - output.mkdir(parents=True, exist_ok=True) - export_documents( - conv_results, - output_dir=output, - export_json=export_json, - export_html=export_html, - export_html_split_page=export_html_split_page, - show_layout=show_layout, - export_md=export_md, - export_txt=export_txt, - export_doctags=export_doctags, - image_export_mode=image_export_mode, - ) + # If a postprocessor is requested, invoke it; otherwise, perform exports + if postprocess is not None: + import json as _json + pp_factory = get_postprocessor_factory( + allow_external_plugins=allow_external_plugins + ) + params = {} + if postprocess_params: + try: + params = _json.loads(postprocess_params) + except Exception as err: + err_console.print( + f"[red]Error: Invalid JSON in --postprocess-params: {err}[/red]" + ) + raise typer.Abort() + postprocess_options = pp_factory.create_options(kind=postprocess) + postprocessor = pp_factory.create_instance(options=postprocess_options) + # Consume the generator in the postprocessor + postprocessor.process(conv_results, **params) + else: + output.mkdir(parents=True, exist_ok=True) + export_documents( + conv_results, + output_dir=output, + export_json=export_json, + export_html=export_html, + export_html_split_page=export_html_split_page, + show_layout=show_layout, + export_md=export_md, + export_txt=export_txt, + export_doctags=export_doctags, + image_export_mode=image_export_mode, + ) end_time = time.time() - start_time diff --git a/docling/document_converter.py b/docling/document_converter.py index 1c3149036..879c1fe67 100644 --- 
a/docling/document_converter.py
+++ b/docling/document_converter.py
@@ -49,6 +49,7 @@
 )
 from docling.exceptions import ConversionError
 from docling.pipeline.asr_pipeline import AsrPipeline
+from docling.postprocess.base_postprocessor import BasePostprocessor
 from docling.pipeline.base_pipeline import BasePipeline
 from docling.pipeline.simple_pipeline import SimplePipeline
 from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
diff --git a/docling/pipeline/factory.py b/docling/pipeline/factory.py
new file mode 100644
index 000000000..83e219ec6
--- /dev/null
+++ b/docling/pipeline/factory.py
@@ -0,0 +1,31 @@
+import logging
+from functools import lru_cache
+
+from docling.models.factories.base_factory import BaseFactory
+from docling.pipeline.base_pipeline import BasePipeline
+
+logger = logging.getLogger(__name__)
+
+
+class PipelineFactory(BaseFactory[BasePipeline]):
+    """Factory for ``BasePipeline`` implementations.
+
+    Passes ``"pipelines"`` as the leading ``BaseFactory`` argument; any other
+    positional or keyword arguments are forwarded unchanged.
+    """
+
+    def __init__(self, *args, **kwargs):
+        super().__init__("pipelines", *args, **kwargs)
+
+
+@lru_cache
+def get_pipeline_factory(allow_external_plugins: bool = False) -> PipelineFactory:
+    """Build a ``PipelineFactory`` and load its plugin registrations.
+
+    The result is memoized by ``lru_cache``, so each distinct value of
+    ``allow_external_plugins`` yields a single shared factory instance.
+    """
+    factory = PipelineFactory()
+    factory.load_from_plugins(allow_external_plugins=allow_external_plugins)
+    logger.info("Registered pipelines: %r", factory.registered_kind)
+    return factory
diff --git a/docling/postprocess/base_postprocessor.py b/docling/postprocess/base_postprocessor.py
new file mode 100644
index 000000000..1625f7de0
--- /dev/null
+++ b/docling/postprocess/base_postprocessor.py
@@ -0,0 +1,27 @@
+import abc
+from collections.abc import Iterable
+from typing import Any
+
+from docling.datamodel.document import ConversionResult
+
+
+class BasePostprocessor(abc.ABC):
+    """Base interface for post-processing Docling conversion results.
+
+    Implementations may index to vector stores, export to custom sinks, etc.
+    """
+
+    @abc.abstractmethod
+    def process(self, conv_results: Iterable[ConversionResult], **kwargs: Any) -> Any:
+        """Consume an iterable of ConversionResult and perform side effects.
+
+        Args:
+            conv_results: Conversion results to process. This may be a lazy
+                iterable (for example a generator); nothing is consumed
+                unless the implementation iterates it.
+            **kwargs: Implementation-defined options.
+
+        Returns:
+            An optional, implementation-defined result object.
+        """
+        raise NotImplementedError
diff --git a/docling/postprocess/factory.py b/docling/postprocess/factory.py
new file mode 100644
index 000000000..ff3503857
--- /dev/null
+++ b/docling/postprocess/factory.py
@@ -0,0 +1,33 @@
+import logging
+from functools import lru_cache
+
+from docling.models.factories.base_factory import BaseFactory
+from docling.postprocess.base_postprocessor import BasePostprocessor
+
+logger = logging.getLogger(__name__)
+
+
+class PostprocessorFactory(BaseFactory[BasePostprocessor]):
+    """Factory for ``BasePostprocessor`` implementations.
+
+    Passes ``"postprocessors"`` as the leading ``BaseFactory`` argument; any
+    other positional or keyword arguments are forwarded unchanged.
+    """
+
+    def __init__(self, *args, **kwargs):
+        super().__init__("postprocessors", *args, **kwargs)
+
+
+@lru_cache
+def get_postprocessor_factory(
+    allow_external_plugins: bool = False,
+) -> PostprocessorFactory:
+    """Build a ``PostprocessorFactory`` and load its plugin registrations.
+
+    The result is memoized by ``lru_cache``, so each distinct value of
+    ``allow_external_plugins`` yields a single shared factory instance.
+    """
+    factory = PostprocessorFactory()
+    factory.load_from_plugins(allow_external_plugins=allow_external_plugins)
+    logger.info("Registered postprocessors: %r", factory.registered_kind)
+    return factory