3
3
# SPDX-License-Identifier: MIT
4
4
import argparse
5
5
import sys
6
+ import codecs
6
7
from textwrap import dedent
7
8
from importlib .metadata import entry_points
8
9
from .__about__ import __version__
9
- from ._markitdown import MarkItDown , DocumentConverterResult
10
+ from ._markitdown import MarkItDown , StreamInfo , DocumentConverterResult
10
11
11
12
12
13
def main ():
@@ -58,6 +59,24 @@ def main():
58
59
help = "Output file name. If not provided, output is written to stdout." ,
59
60
)
60
61
62
+ parser .add_argument (
63
+ "-x" ,
64
+ "--extension" ,
65
+ help = "Provide a hint about the file extension (e.g., when reading from stdin)." ,
66
+ )
67
+
68
+ parser .add_argument (
69
+ "-m" ,
70
+ "--mime-type" ,
71
+ help = "Provide a hint about the file's MIME type." ,
72
+ )
73
+
74
+ parser .add_argument (
75
+ "-c" ,
76
+ "--charset" ,
77
+ help = "Provide a hint about the file's charset (e.g, UTF-8)." ,
78
+ )
79
+
61
80
parser .add_argument (
62
81
"-d" ,
63
82
"--use-docintel" ,
@@ -88,6 +107,48 @@ def main():
88
107
parser .add_argument ("filename" , nargs = "?" )
89
108
args = parser .parse_args ()
90
109
110
+ # Parse the extension hint
111
+ extension_hint = args .extension
112
+ if extension_hint is not None :
113
+ extension_hint = extension_hint .strip ().lower ()
114
+ if len (extension_hint ) > 0 :
115
+ if not extension_hint .startswith ("." ):
116
+ extension_hint = "." + extension_hint
117
+ else :
118
+ extension_hint = None
119
+
120
+ # Parse the mime type
121
+ mime_type_hint = args .mime_type
122
+ if mime_type_hint is not None :
123
+ mime_type_hint = mime_type_hint .strip ()
124
+ if len (mime_type_hint ) > 0 :
125
+ if mime_type_hint .count ("/" ) != 1 :
126
+ _exit_with_error (f"Invalid MIME type: { mime_type_hint } " )
127
+ else :
128
+ mime_type_hint = None
129
+
130
+ # Parse the charset
131
+ charset_hint = args .charset
132
+ if charset_hint is not None :
133
+ charset_hint = charset_hint .strip ()
134
+ if len (charset_hint ) > 0 :
135
+ try :
136
+ charset_hint = codecs .lookup (charset_hint ).name
137
+ except LookupError :
138
+ _exit_with_error (f"Invalid charset: { charset_hint } " )
139
+ else :
140
+ charset_hint = None
141
+
142
+ stream_info : str | None = None
143
+ if (
144
+ extension_hint is not None
145
+ or mime_type_hint is not None
146
+ or charset_hint is not None
147
+ ):
148
+ stream_info = StreamInfo (
149
+ extension = extension_hint , mimetype = mime_type_hint , charset = charset_hint
150
+ )
151
+
91
152
if args .list_plugins :
92
153
# List installed plugins, then exit
93
154
print ("Installed MarkItDown 3rd-party Plugins:\n " )
@@ -107,21 +168,22 @@ def main():
107
168
108
169
if args .use_docintel :
109
170
if args .endpoint is None :
110
- raise ValueError (
171
+ _exit_with_error (
111
172
"Document Intelligence Endpoint is required when using Document Intelligence."
112
173
)
113
174
elif args .filename is None :
114
- raise ValueError ("Filename is required when using Document Intelligence." )
175
+ _exit_with_error ("Filename is required when using Document Intelligence." )
176
+
115
177
markitdown = MarkItDown (
116
178
enable_plugins = args .use_plugins , docintel_endpoint = args .endpoint
117
179
)
118
180
else :
119
181
markitdown = MarkItDown (enable_plugins = args .use_plugins )
120
182
121
183
if args .filename is None :
122
- result = markitdown .convert_stream (sys .stdin .buffer )
184
+ result = markitdown .convert_stream (sys .stdin .buffer , stream_info = stream_info )
123
185
else :
124
- result = markitdown .convert (args .filename )
186
+ result = markitdown .convert (args .filename , stream_info = stream_info )
125
187
126
188
_handle_output (args , result )
127
189
@@ -135,5 +197,10 @@ def _handle_output(args, result: DocumentConverterResult):
135
197
print (result .text_content )
136
198
137
199
200
def _exit_with_error(message: str):
    """Report a fatal command-line error and terminate the process.

    The message is written to stderr rather than stdout, because stdout
    carries the converted document when no output file is given; printing
    diagnostics there would corrupt piped or redirected output.

    Args:
        message: Human-readable description of what went wrong.

    Raises:
        SystemExit: Always raised, with exit status 1.
    """
    print(message, file=sys.stderr)
    sys.exit(1)
203
+
204
+
138
205
if __name__ == "__main__" :
139
206
main ()
0 commit comments