Skip to content

Commit 3c97466

Browse files
test: ability to accept gzip compressed files (#382)
The PR adds tests: - unit tests comparing response between gzipped and not-gzipped files, for different content types, also for non-set ungzipped content type, and for single and multiple files in various combinations. - simple smoke tests --------- Co-authored-by: Austin Walker <[email protected]>
1 parent 648ad7b commit 3c97466

File tree

8 files changed

+407
-112
lines changed

8 files changed

+407
-112
lines changed

README.md

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -167,7 +167,7 @@ for PDFs and Images, which are `pdf`, `jpg` and `png`. Again, please note that t
167167
You can specify the encoding to use to decode the text input. If no value is provided, utf-8 will be used.
168168

169169
```
170-
curl -X 'POST'
170+
curl -X 'POST' \
171171
'https://api.unstructured.io/general/v0/general' \
172172
-H 'accept: application/json' \
173173
-H 'Content-Type: multipart/form-data' \
@@ -176,6 +176,23 @@ curl -X 'POST'
176176
| jq -C . | less -R
177177
```
178178

179+
#### Gzipped files
180+
181+
You can send a gzipped file and the API will un-gzip it.
182+
183+
```
184+
curl -X 'POST' \
185+
'https://api.unstructured.io/general/v0/general' \
186+
-H 'accept: application/json' \
187+
-H 'Content-Type: multipart/form-data' \
188+
-F 'gz_uncompressed_content_type=application/pdf' \
189+
-F 'files=@sample-docs/layout-parser-paper.pdf.gz'
190+
```
191+
192+
If the `gz_uncompressed_content_type` field is set, the API will use its value as the content type of all files
193+
after uncompressing the .gz files that are sent in a single batch. If not set, the API will use
194+
various heuristics to detect the file types after uncompressing the .gz files.
195+
179196
#### XML Tags
180197

181198
When processing XML documents, set the `xml_keep_tags` parameter to `true` to retain the XML tags in the output. If not specified, it will simply extract the text from within the tags.

prepline_general/api/general.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -749,11 +749,11 @@ def general_partition(
749749
chunking_strategy = _validate_chunking_strategy(form_params.chunking_strategy)
750750

751751
# -- unzip any uploaded files that need it --
752-
for file_index in range(len(files)):
753-
if files[file_index].content_type == "application/gzip":
754-
files[file_index] = ungz_file(
755-
files[file_index], form_params.gz_uncompressed_content_type
756-
)
752+
for idx, file in enumerate(files):
753+
is_content_type_gz = file.content_type == "application/gzip"
754+
is_extension_gz = file.filename and file.filename.endswith(".gz")
755+
if is_content_type_gz or is_extension_gz:
756+
files[idx] = ungz_file(file, form_params.gz_uncompressed_content_type)
757757

758758
def response_generator(is_multipart: bool):
759759
for file in files:

requirements/base.txt

Lines changed: 33 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ beautifulsoup4==4.12.3
1919
certifi==2024.2.2
2020
# via
2121
# requests
22+
# unstructured
2223
# unstructured-client
2324
cffi==1.16.0
2425
# via cryptography
@@ -28,6 +29,7 @@ charset-normalizer==3.3.2
2829
# via
2930
# pdfminer-six
3031
# requests
32+
# unstructured
3133
# unstructured-client
3234
click==8.1.3
3335
# via
@@ -38,13 +40,15 @@ coloredlogs==15.0.1
3840
# via onnxruntime
3941
contourpy==1.2.0
4042
# via matplotlib
41-
cryptography==42.0.4
43+
cryptography==42.0.5
4244
# via pdfminer-six
4345
cycler==0.12.1
4446
# via matplotlib
4547
dataclasses-json==0.6.4
46-
# via unstructured
47-
dataclasses-json-speakeasy==0.5.11
48+
# via
49+
# unstructured
50+
# unstructured-client
51+
deepdiff==6.7.1
4852
# via unstructured-client
4953
deprecated==1.2.14
5054
# via pikepdf
@@ -56,7 +60,7 @@ et-xmlfile==1.1.0
5660
# via openpyxl
5761
exceptiongroup==1.2.0
5862
# via anyio
59-
fastapi==0.109.2
63+
fastapi==0.110.0
6064
# via -r requirements/base.in
6165
filelock==3.13.1
6266
# via
@@ -65,7 +69,7 @@ filelock==3.13.1
6569
# transformers
6670
filetype==1.2.0
6771
# via unstructured
68-
flatbuffers==23.5.26
72+
flatbuffers==24.3.7
6973
# via onnxruntime
7074
fonttools==4.49.0
7175
# via matplotlib
@@ -75,7 +79,7 @@ fsspec==2024.2.0
7579
# torch
7680
h11==0.14.0
7781
# via uvicorn
78-
huggingface-hub==0.20.3
82+
huggingface-hub==0.21.4
7983
# via
8084
# timm
8185
# tokenizers
@@ -112,10 +116,9 @@ markdown==3.5.2
112116
# via unstructured
113117
markupsafe==2.1.5
114118
# via jinja2
115-
marshmallow==3.20.2
119+
marshmallow==3.21.1
116120
# via
117121
# dataclasses-json
118-
# dataclasses-json-speakeasy
119122
# unstructured-client
120123
matplotlib==3.8.3
121124
# via pycocotools
@@ -163,7 +166,9 @@ opencv-python==4.9.0.80
163166
# unstructured-inference
164167
openpyxl==3.1.2
165168
# via unstructured
166-
packaging==23.2
169+
ordered-set==4.1.0
170+
# via deepdiff
171+
packaging==24.0
167172
# via
168173
# huggingface-hub
169174
# marshmallow
@@ -174,19 +179,19 @@ packaging==23.2
174179
# transformers
175180
# unstructured-client
176181
# unstructured-pytesseract
177-
pandas==2.2.0
182+
pandas==2.2.1
178183
# via
179184
# layoutparser
180185
# unstructured
181186
pdf2image==1.17.0
182187
# via
183188
# layoutparser
184189
# unstructured
185-
pdfminer-six==20221105
190+
pdfminer-six==20231228
186191
# via
187192
# pdfplumber
188193
# unstructured
189-
pdfplumber==0.10.4
194+
pdfplumber==0.11.0
190195
# via layoutparser
191196
pikepdf==8.13.0
192197
# via unstructured
@@ -218,23 +223,24 @@ pycparser==2.21
218223
# via cffi
219224
pycryptodome==3.20.0
220225
# via -r requirements/base.in
221-
pydantic==2.6.1
226+
pydantic==2.6.4
222227
# via fastapi
223-
pydantic-core==2.16.2
228+
pydantic-core==2.16.3
224229
# via pydantic
225230
pypandoc==1.13
226231
# via unstructured
227-
pyparsing==3.1.1
232+
pyparsing==3.1.2
228233
# via matplotlib
229-
pypdf==4.0.2
234+
pypdf==4.1.0
230235
# via
231236
# -r requirements/base.in
232237
# unstructured
233-
pypdfium2==4.27.0
238+
# unstructured-client
239+
pypdfium2==4.28.0
234240
# via pdfplumber
235241
pytesseract==0.3.10
236242
# via layoutparser
237-
python-dateutil==2.8.2
243+
python-dateutil==2.9.0.post0
238244
# via
239245
# matplotlib
240246
# pandas
@@ -258,7 +264,7 @@ pyyaml==6.0.1
258264
# omegaconf
259265
# timm
260266
# transformers
261-
rapidfuzz==3.6.1
267+
rapidfuzz==3.6.2
262268
# via
263269
# unstructured
264270
# unstructured-inference
@@ -287,7 +293,7 @@ six==1.16.0
287293
# langdetect
288294
# python-dateutil
289295
# unstructured-client
290-
sniffio==1.3.0
296+
sniffio==1.3.1
291297
# via anyio
292298
soupsieve==2.5
293299
# via beautifulsoup4
@@ -322,7 +328,7 @@ tqdm==4.66.2
322328
# transformers
323329
transformers==4.37.1
324330
# via unstructured-inference
325-
typing-extensions==4.9.0
331+
typing-extensions==4.10.0
326332
# via
327333
# anyio
328334
# fastapi
@@ -339,13 +345,14 @@ typing-extensions==4.9.0
339345
typing-inspect==0.9.0
340346
# via
341347
# dataclasses-json
342-
# dataclasses-json-speakeasy
343348
# unstructured-client
344349
tzdata==2024.1
345350
# via pandas
346-
unstructured[local-inference]==0.12.4
347-
# via -r requirements/base.in
348-
unstructured-client==0.18.0
351+
unstructured[local-inference]==0.12.5
352+
# via
353+
# -r requirements/base.in
354+
# unstructured
355+
unstructured-client==0.21.1
349356
# via unstructured
350357
unstructured-inference==0.7.23
351358
# via unstructured
@@ -355,7 +362,7 @@ urllib3==2.2.1
355362
# via
356363
# requests
357364
# unstructured-client
358-
uvicorn==0.27.1
365+
uvicorn==0.28.0
359366
# via -r requirements/base.in
360367
wrapt==1.16.0
361368
# via

requirements/test.in

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,3 +11,4 @@ pytest-mock
1111
nbdev
1212
jupyter
1313
httpx
14+
deepdiff

0 commit comments

Comments
 (0)