11from abc import ABC , abstractmethod
22from contextlib import AbstractContextManager
33from itertools import islice
4- from typing import Any , Callable , Generic , Iterator , Sequence , TypeVar
4+ from typing import Callable , Generic , Iterator , Sequence , TypeVar
55
6- from pydantic import BaseModel , ConfigDict , Field
6+ from pydantic import BaseModel , ConfigDict
77
88from docling .datamodel .base_models import DocumentStream
99
@@ -21,11 +21,16 @@ class SourceDocumentRef(BaseModel, Generic[FileIdentifierT]):
2121 source_index : int
2222 source_uri : str
2323 filename : str
24- metadata : dict [str , Any ] = Field (default_factory = dict )
2524
2625
2726class DocumentChunk (BaseModel , Generic [SourceT , FileIdentifierT ]):
28- """A data-only source chunk plus a local fetcher convenience."""
27+ """A serializable source chunk plus an optional local fetcher convenience.
28+
29+ Local/CLI callers may attach a fetcher so ``iter_documents()`` can materialize
30+ streams lazily from the refs. Cross-process callers such as Ray must strip that
31+ fetcher because it may capture initialized connector state that is not safe to
32+ serialize.
33+ """
2934
3035 model_config = ConfigDict (arbitrary_types_allowed = True )
3136
@@ -53,6 +58,7 @@ def index(self) -> int:
5358 return self .chunk_index
5459
5560 def iter_documents (self ) -> Iterator [DocumentStream ]:
61+ """Materialize documents for local callers when a fetcher is attached."""
5662 if self ._fetcher is None :
5763 raise RuntimeError ("DocumentChunk does not have an attached fetcher." )
5864 for ref in self .refs :
@@ -118,19 +124,21 @@ def source(self) -> SourceT:
118124 def _count_documents (self ) -> int | None :
119125 return None
120126
121- def fetch_document_by_ref (
122- self , ref : SourceDocumentRef [FileIdentifierT ]
123- ) -> DocumentStream :
124- return self ._fetch_document_by_id (ref .id )
125-
126127 def fetch_converter_source_by_ref (
127128 self , ref : SourceDocumentRef [FileIdentifierT ]
128129 ) -> ConverterSource :
129- return self .fetch_document_by_ref (ref )
130+ """Resolve a ref into the converter input expected by the backend.
131+
132+ Most connectors materialize a ``DocumentStream`` from the ref's identifier.
133+ Connectors with remote-fetch semantics may override this to return a lighter
134+ representation such as a source URL.
135+ """
136+ return self ._fetch_document_by_id (ref .id )
130137
131138 def headers_for_ref (
132139 self , ref : SourceDocumentRef [FileIdentifierT ]
133- ) -> dict [str , Any ] | None :
140+ ) -> dict [str , object ] | None :
141+ """Return per-ref request headers when the converter should fetch remotely."""
134142 del ref
135143 return None
136144
0 commit comments