Unstructured-IO
diff --git a/‎README.md‎
Lines changed: 18 additions & 1 deletion b/‎README.md‎
Lines changed: 18 additions & 1 deletion
diff --git a/‎prepline_general/api/general.py‎
Lines changed: 5 additions & 5 deletions b/‎prepline_general/api/general.py‎
Lines changed: 5 additions & 5 deletions
diff --git a/‎requirements/base.txt‎
Lines changed: 33 additions & 26 deletions b/‎requirements/base.txt‎
Lines changed: 33 additions & 26 deletions
diff --git a/‎requirements/test.in‎
Lines changed: 1 addition & 0 deletions b/‎requirements/test.in‎
Lines changed: 1 addition & 0 deletions
@@ -167,7 +167,7 @@ for PDFs and Images, which are `pdf`, `jpg` and `png`. Again, please note that t
 You can specify the encoding to use to decode the text input. If no value is provided, utf-8 will be used.
 
 ```
-curl -X 'POST' 
+curl -X 'POST' \
  'https://api.unstructured.io/general/v0/general' \
  -H 'accept: application/json'  \
  -H 'Content-Type: multipart/form-data' \
@@ -176,6 +176,23 @@ curl -X 'POST'
  | jq -C . | less -R
 ```
 
+#### Gzipped files
+
+You can send a gzipped file and the API will un-gzip it before processing.
+
+```
+curl -X 'POST' \
+ 'https://api.unstructured.io/general/v0/general' \
+ -H 'accept: application/json'  \
+ -H 'Content-Type: multipart/form-data' \
+ -F 'gz_uncompressed_content_type=application/pdf' \
+ -F 'files=@sample-docs/layout-parser-paper.pdf.gz' 
+```
+
+If the `gz_uncompressed_content_type` field is set, the API uses its value as the content type of
+every file in the batch once the .gz files have been uncompressed. If it is not set, the API uses
+various heuristics to detect the file types after uncompressing them.
+
 #### XML Tags
 
 When processing XML documents, set the `xml_keep_tags` parameter to `true` to retain the XML tags in the output. If not specified, it will simply extract the text from within the tags.
 
@@ -749,11 +749,11 @@ def general_partition(
     chunking_strategy = _validate_chunking_strategy(form_params.chunking_strategy)
 
     # -- unzip any uploaded files that need it --
-    for file_index in range(len(files)):
-        if files[file_index].content_type == "application/gzip":
-            files[file_index] = ungz_file(
-                files[file_index], form_params.gz_uncompressed_content_type
-            )
+    for idx, file in enumerate(files):
+        is_content_type_gz = file.content_type == "application/gzip"
+        is_extension_gz = file.filename and file.filename.endswith(".gz")
+        if is_content_type_gz or is_extension_gz:
+            files[idx] = ungz_file(file, form_params.gz_uncompressed_content_type)
 
     def response_generator(is_multipart: bool):
         for file in files:
 
@@ -19,6 +19,7 @@ beautifulsoup4==4.12.3
 certifi==2024.2.2
     # via
     #   requests
+    #   unstructured
     #   unstructured-client
 cffi==1.16.0
     # via cryptography
@@ -28,6 +29,7 @@ charset-normalizer==3.3.2
     # via
     #   pdfminer-six
     #   requests
+    #   unstructured
     #   unstructured-client
 click==8.1.3
     # via
@@ -38,13 +40,15 @@ coloredlogs==15.0.1
     # via onnxruntime
 contourpy==1.2.0
     # via matplotlib
-cryptography==42.0.4
+cryptography==42.0.5
     # via pdfminer-six
 cycler==0.12.1
     # via matplotlib
 dataclasses-json==0.6.4
-    # via unstructured
-dataclasses-json-speakeasy==0.5.11
+    # via
+    #   unstructured
+    #   unstructured-client
+deepdiff==6.7.1
     # via unstructured-client
 deprecated==1.2.14
     # via pikepdf
@@ -56,7 +60,7 @@ et-xmlfile==1.1.0
     # via openpyxl
 exceptiongroup==1.2.0
     # via anyio
-fastapi==0.109.2
+fastapi==0.110.0
     # via -r requirements/base.in
 filelock==3.13.1
     # via
@@ -65,7 +69,7 @@ filelock==3.13.1
     #   transformers
 filetype==1.2.0
     # via unstructured
-flatbuffers==23.5.26
+flatbuffers==24.3.7
     # via onnxruntime
 fonttools==4.49.0
     # via matplotlib
@@ -75,7 +79,7 @@ fsspec==2024.2.0
     #   torch
 h11==0.14.0
     # via uvicorn
-huggingface-hub==0.20.3
+huggingface-hub==0.21.4
     # via
     #   timm
     #   tokenizers
@@ -112,10 +116,9 @@ markdown==3.5.2
     # via unstructured
 markupsafe==2.1.5
     # via jinja2
-marshmallow==3.20.2
+marshmallow==3.21.1
     # via
     #   dataclasses-json
-    #   dataclasses-json-speakeasy
     #   unstructured-client
 matplotlib==3.8.3
     # via pycocotools
@@ -163,7 +166,9 @@ opencv-python==4.9.0.80
     #   unstructured-inference
 openpyxl==3.1.2
     # via unstructured
-packaging==23.2
+ordered-set==4.1.0
+    # via deepdiff
+packaging==24.0
     # via
     #   huggingface-hub
     #   marshmallow
@@ -174,19 +179,19 @@ packaging==23.2
     #   transformers
     #   unstructured-client
     #   unstructured-pytesseract
-pandas==2.2.0
+pandas==2.2.1
     # via
     #   layoutparser
     #   unstructured
 pdf2image==1.17.0
     # via
     #   layoutparser
     #   unstructured
-pdfminer-six==20221105
+pdfminer-six==20231228
     # via
     #   pdfplumber
     #   unstructured
-pdfplumber==0.10.4
+pdfplumber==0.11.0
     # via layoutparser
 pikepdf==8.13.0
     # via unstructured
@@ -218,23 +223,24 @@ pycparser==2.21
     # via cffi
 pycryptodome==3.20.0
     # via -r requirements/base.in
-pydantic==2.6.1
+pydantic==2.6.4
     # via fastapi
-pydantic-core==2.16.2
+pydantic-core==2.16.3
     # via pydantic
 pypandoc==1.13
     # via unstructured
-pyparsing==3.1.1
+pyparsing==3.1.2
     # via matplotlib
-pypdf==4.0.2
+pypdf==4.1.0
     # via
     #   -r requirements/base.in
     #   unstructured
-pypdfium2==4.27.0
+    #   unstructured-client
+pypdfium2==4.28.0
     # via pdfplumber
 pytesseract==0.3.10
     # via layoutparser
-python-dateutil==2.8.2
+python-dateutil==2.9.0.post0
     # via
     #   matplotlib
     #   pandas
@@ -258,7 +264,7 @@ pyyaml==6.0.1
     #   omegaconf
     #   timm
     #   transformers
-rapidfuzz==3.6.1
+rapidfuzz==3.6.2
     # via
     #   unstructured
     #   unstructured-inference
@@ -287,7 +293,7 @@ six==1.16.0
     #   langdetect
     #   python-dateutil
     #   unstructured-client
-sniffio==1.3.0
+sniffio==1.3.1
     # via anyio
 soupsieve==2.5
     # via beautifulsoup4
@@ -322,7 +328,7 @@ tqdm==4.66.2
     #   transformers
 transformers==4.37.1
     # via unstructured-inference
-typing-extensions==4.9.0
+typing-extensions==4.10.0
     # via
     #   anyio
     #   fastapi
@@ -339,13 +345,14 @@ typing-extensions==4.9.0
 typing-inspect==0.9.0
     # via
     #   dataclasses-json
-    #   dataclasses-json-speakeasy
     #   unstructured-client
 tzdata==2024.1
     # via pandas
-unstructured[local-inference]==0.12.4
-    # via -r requirements/base.in
-unstructured-client==0.18.0
+unstructured[local-inference]==0.12.5
+    # via
+    #   -r requirements/base.in
+    #   unstructured
+unstructured-client==0.21.1
     # via unstructured
 unstructured-inference==0.7.23
     # via unstructured
@@ -355,7 +362,7 @@ urllib3==2.2.1
     # via
     #   requests
     #   unstructured-client
-uvicorn==0.27.1
+uvicorn==0.28.0
     # via -r requirements/base.in
 wrapt==1.16.0
     # via
 
@@ -11,3 +11,4 @@ pytest-mock
 nbdev
 jupyter
 httpx
+deepdiff
-Original file line number
+Diff line change
 nbdev
 jupyter
 httpx
 +deepdiff