Skip to content

Commit af1be36

Browse files
authored
Added CLI options for extension, mimetypes, and charset. (#1115)
1 parent 2a2ccc8 commit af1be36

File tree

2 files changed

+75
-8
lines changed

2 files changed

+75
-8
lines changed

packages/markitdown/src/markitdown/__main__.py

+72-5
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,11 @@
33
# SPDX-License-Identifier: MIT
44
import argparse
55
import sys
6+
import codecs
67
from textwrap import dedent
78
from importlib.metadata import entry_points
89
from .__about__ import __version__
9-
from ._markitdown import MarkItDown, DocumentConverterResult
10+
from ._markitdown import MarkItDown, StreamInfo, DocumentConverterResult
1011

1112

1213
def main():
@@ -58,6 +59,24 @@ def main():
5859
help="Output file name. If not provided, output is written to stdout.",
5960
)
6061

62+
parser.add_argument(
63+
"-x",
64+
"--extension",
65+
help="Provide a hint about the file extension (e.g., when reading from stdin).",
66+
)
67+
68+
parser.add_argument(
69+
"-m",
70+
"--mime-type",
71+
help="Provide a hint about the file's MIME type.",
72+
)
73+
74+
parser.add_argument(
75+
"-c",
76+
"--charset",
77+
help="Provide a hint about the file's charset (e.g, UTF-8).",
78+
)
79+
6180
parser.add_argument(
6281
"-d",
6382
"--use-docintel",
@@ -88,6 +107,48 @@ def main():
88107
parser.add_argument("filename", nargs="?")
89108
args = parser.parse_args()
90109

110+
# Parse the extension hint
111+
extension_hint = args.extension
112+
if extension_hint is not None:
113+
extension_hint = extension_hint.strip().lower()
114+
if len(extension_hint) > 0:
115+
if not extension_hint.startswith("."):
116+
extension_hint = "." + extension_hint
117+
else:
118+
extension_hint = None
119+
120+
# Parse the mime type
121+
mime_type_hint = args.mime_type
122+
if mime_type_hint is not None:
123+
mime_type_hint = mime_type_hint.strip()
124+
if len(mime_type_hint) > 0:
125+
if mime_type_hint.count("/") != 1:
126+
_exit_with_error(f"Invalid MIME type: {mime_type_hint}")
127+
else:
128+
mime_type_hint = None
129+
130+
# Parse the charset
131+
charset_hint = args.charset
132+
if charset_hint is not None:
133+
charset_hint = charset_hint.strip()
134+
if len(charset_hint) > 0:
135+
try:
136+
charset_hint = codecs.lookup(charset_hint).name
137+
except LookupError:
138+
_exit_with_error(f"Invalid charset: {charset_hint}")
139+
else:
140+
charset_hint = None
141+
142+
stream_info: str | None = None
143+
if (
144+
extension_hint is not None
145+
or mime_type_hint is not None
146+
or charset_hint is not None
147+
):
148+
stream_info = StreamInfo(
149+
extension=extension_hint, mimetype=mime_type_hint, charset=charset_hint
150+
)
151+
91152
if args.list_plugins:
92153
# List installed plugins, then exit
93154
print("Installed MarkItDown 3rd-party Plugins:\n")
@@ -107,21 +168,22 @@ def main():
107168

108169
if args.use_docintel:
109170
if args.endpoint is None:
110-
raise ValueError(
171+
_exit_with_error(
111172
"Document Intelligence Endpoint is required when using Document Intelligence."
112173
)
113174
elif args.filename is None:
114-
raise ValueError("Filename is required when using Document Intelligence.")
175+
_exit_with_error("Filename is required when using Document Intelligence.")
176+
115177
markitdown = MarkItDown(
116178
enable_plugins=args.use_plugins, docintel_endpoint=args.endpoint
117179
)
118180
else:
119181
markitdown = MarkItDown(enable_plugins=args.use_plugins)
120182

121183
if args.filename is None:
122-
result = markitdown.convert_stream(sys.stdin.buffer)
184+
result = markitdown.convert_stream(sys.stdin.buffer, stream_info=stream_info)
123185
else:
124-
result = markitdown.convert(args.filename)
186+
result = markitdown.convert(args.filename, stream_info=stream_info)
125187

126188
_handle_output(args, result)
127189

@@ -135,5 +197,10 @@ def _handle_output(args, result: DocumentConverterResult):
135197
print(result.text_content)
136198

137199

200+
def _exit_with_error(message: str) -> None:
    """Report a fatal command-line error and terminate the process.

    Args:
        message: Human-readable description of the problem.

    Raises:
        SystemExit: Always, with exit status 1.
    """
    # Diagnostics go to stderr: stdout carries the converted document when
    # no output file is given, so an error printed there would be mixed
    # into (or mistaken for) the conversion result in a pipeline.
    print(message, file=sys.stderr)
    sys.exit(1)
203+
204+
138205
if __name__ == "__main__":
139206
main()

packages/markitdown/src/markitdown/_markitdown.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -244,22 +244,22 @@ def convert(
244244
or source.startswith("https://")
245245
or source.startswith("file://")
246246
):
247-
return self.convert_url(source, **kwargs)
247+
return self.convert_url(source, stream_info=stream_info, *kwargs)
248248
else:
249249
return self.convert_local(source, stream_info=stream_info, **kwargs)
250250
# Path object
251251
elif isinstance(source, Path):
252252
return self.convert_local(source, stream_info=stream_info, **kwargs)
253253
# Request response
254254
elif isinstance(source, requests.Response):
255-
return self.convert_response(source, **kwargs)
255+
return self.convert_response(source, stream_info=stream_info, **kwargs)
256256
# Binary stream
257257
elif (
258258
hasattr(source, "read")
259259
and callable(source.read)
260260
and not isinstance(source, io.TextIOBase)
261261
):
262-
return self.convert_stream(source, **kwargs)
262+
return self.convert_stream(source, stream_info=stream_info, **kwargs)
263263
else:
264264
raise TypeError(
265265
f"Invalid source type: {type(source)}. Expected str, requests.Response, BinaryIO."

0 commit comments

Comments
 (0)