Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 28 additions & 3 deletions packages/markitdown/src/markitdown/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,9 +138,23 @@ def main():
help="Keep data URIs (like base64-encoded images) in the output. By default, data URIs are truncated.",
)

parser.add_argument(
"--extract-images",
action="store_true",
help="Extract embedded PDF images to files and reference them with relative Markdown links.",
)

parser.add_argument(
"--output-dir",
help="Directory for extracted assets. Required when using --extract-images.",
)

parser.add_argument("filename", nargs="?")
args = parser.parse_args()

if args.extract_images and args.output_dir is None:
_exit_with_error("--output-dir is required when using --extract-images.")

# Parse the extension hint
extension_hint = args.extension
if extension_hint is not None:
Expand Down Expand Up @@ -200,6 +214,13 @@ def main():
)
sys.exit(0)

markitdown_kwargs: Dict[str, Any] = {
"enable_plugins": args.use_plugins,
}
if args.extract_images:
markitdown_kwargs["extract_images"] = True
markitdown_kwargs["output_dir"] = args.output_dir

if args.use_docintel:
if args.endpoint is None:
_exit_with_error(
Expand All @@ -209,7 +230,8 @@ def main():
_exit_with_error("Filename is required when using Document Intelligence.")

markitdown = MarkItDown(
enable_plugins=args.use_plugins, docintel_endpoint=args.endpoint
**markitdown_kwargs,
docintel_endpoint=args.endpoint,
)
elif args.use_cu:
if args.cu_endpoint is None:
Expand Down Expand Up @@ -240,9 +262,12 @@ def main():
_exit_with_error(f"Unknown file type: {name}")
cu_kwargs["cu_file_types"] = cu_types

markitdown = MarkItDown(enable_plugins=args.use_plugins, **cu_kwargs)
markitdown = MarkItDown(
**markitdown_kwargs,
**cu_kwargs,
)
else:
markitdown = MarkItDown(enable_plugins=args.use_plugins)
markitdown = MarkItDown(**markitdown_kwargs)

if args.filename is None:
result = markitdown.convert_stream(
Expand Down
16 changes: 16 additions & 0 deletions packages/markitdown/src/markitdown/_markitdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,11 @@ def __init__(
self._llm_prompt: Union[str | None] = None
self._exiftool_path: Union[str | None] = None
self._style_map: Union[str | None] = None
self._extract_images: bool = bool(kwargs.get("extract_images", False))
self._output_dir: Union[str | Path | None] = kwargs.get("output_dir")

if self._extract_images and self._output_dir is None:
raise ValueError("output_dir is required when extract_images=True")

# Register the converters
self._converters: List[ConverterRegistration] = []
Expand Down Expand Up @@ -563,6 +568,11 @@ def _convert(
) -> DocumentConverterResult:
res: Union[None, DocumentConverterResult] = None

extract_images = bool(kwargs.get("extract_images", self._extract_images))
output_dir = kwargs.get("output_dir", self._output_dir)
if extract_images and output_dir is None:
raise ValueError("output_dir is required when extract_images=True")

# Keep track of which converters throw exceptions
failed_attempts: List[FailedConversionAttempt] = []

Expand Down Expand Up @@ -600,6 +610,12 @@ def _convert(
if "exiftool_path" not in _kwargs and self._exiftool_path is not None:
_kwargs["exiftool_path"] = self._exiftool_path

if "extract_images" not in _kwargs:
_kwargs["extract_images"] = extract_images

if "output_dir" not in _kwargs and output_dir is not None:
_kwargs["output_dir"] = output_dir

# Add the list of converters for nested processing
_kwargs["_parent_converters"] = self._converters

Expand Down
Loading