Skip to content

Commit 528849e

Browse files
authored
chore: bump unstructured to 0.10.21 (#280)
Add a workaround to the Dockerfile for an import error raised when fetching the models (see Unstructured-IO/unstructured#1717).
1 parent 5b8a57f commit 528849e

File tree

7 files changed

+108
-87
lines changed

7 files changed

+108
-87
lines changed

.github/workflows/ci.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@ jobs:
5656
uses: ludeeus/action-shellcheck@master
5757

5858
test:
59-
runs-on: ubuntu-latest
59+
runs-on: ubuntu-latest-m
6060
needs: [setup, lint]
6161
steps:
6262
- uses: actions/checkout@v4
@@ -97,7 +97,7 @@ jobs:
9797
# TODO - figure out best practice for caching docker images
9898
# (Using the virtualenv to get pytest)
9999
test_dockerfile:
100-
runs-on: ubuntu-latest
100+
runs-on: ubuntu-latest-m
101101
needs: [setup, lint]
102102
steps:
103103
- uses: actions/checkout@v4

CHANGELOG.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
1-
## 0.0.52-dev1
1+
## 0.0.52
22

3+
* Bump unstructured to 0.10.21
34
* Fix an unhandled error when a non-pdf file is sent with content-type pdf
45
* Fix an unhandled error when a non-docx file is sent with content-type docx
56

Dockerfile

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,9 +34,14 @@ RUN python3.10 -m pip install pip==${PIP_VERSION} \
3434
USER ${NB_USER}
3535

3636
FROM python-deps as model-deps
37+
38+
# Note(Austin) - Unstructured 0.10.20 has some broken imports in ingest
39+
# They are not needed here - delete the Chunker and Embedder lines from ingest/pipeline/__init__.py for now
3740
RUN python3.10 -c "import nltk; nltk.download('punkt')" && \
3841
python3.10 -c "import nltk; nltk.download('averaged_perceptron_tagger')" && \
39-
python3.10 -c "from unstructured.ingest.doc_processor.generalized import initialize; initialize()"
42+
sed -i '/Chunker/d' ~/.local/lib/python3.10/site-packages/unstructured/ingest/pipeline/__init__.py && \
43+
sed -i '/Embedder/d' ~/.local/lib/python3.10/site-packages/unstructured/ingest/pipeline/__init__.py && \
44+
python3.10 -c "from unstructured.ingest.pipeline.initialize import initialize; initialize()"
4045

4146
FROM model-deps as code
4247
COPY --chown=${NB_USER}:${NB_USER} CHANGELOG.md CHANGELOG.md

prepline_general/api/general.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -237,12 +237,12 @@ def pipeline_api(
237237
m_encoding=[],
238238
m_hi_res_model_name=[],
239239
m_include_page_breaks=[],
240-
m_ocr_languages=[],
240+
m_ocr_languages=None,
241241
m_pdf_infer_table_structure=[],
242242
m_skip_infer_table_types=[],
243243
m_strategy=[],
244244
m_xml_keep_tags=[],
245-
languages=["eng"],
245+
languages=None,
246246
m_chunking_strategy=[],
247247
m_multipage_sections=[],
248248
m_combine_under_n_chars=[],
@@ -608,12 +608,12 @@ def pipeline_1(
608608
encoding: List[str] = Form(default=[]),
609609
hi_res_model_name: List[str] = Form(default=[]),
610610
include_page_breaks: List[str] = Form(default=[]),
611-
ocr_languages: List[str] = Form(default=[]),
611+
ocr_languages: List[str] = Form(default=None),
612612
pdf_infer_table_structure: List[str] = Form(default=[]),
613613
skip_infer_table_types: List[str] = Form(default=[]),
614614
strategy: List[str] = Form(default=[]),
615615
xml_keep_tags: List[str] = Form(default=[]),
616-
languages: List[str] = ["eng"],
616+
languages: List[str] = Form(default=None),
617617
chunking_strategy: List[str] = Form(default=[]),
618618
multipage_sections: List[str] = Form(default=[]),
619619
combine_under_n_chars: List[str] = Form(default=[]),

requirements/base.txt

Lines changed: 20 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,9 @@ anyio==3.7.1
1111
# fastapi
1212
# starlette
1313
backoff==2.2.1
14-
# via -r requirements/base.in
14+
# via
15+
# -r requirements/base.in
16+
# unstructured
1517
beautifulsoup4==4.12.2
1618
# via unstructured
1719
certifi==2023.7.22
@@ -35,7 +37,7 @@ contourpy==1.1.1
3537
# via matplotlib
3638
cryptography==41.0.4
3739
# via pdfminer-six
38-
cycler==0.12.0
40+
cycler==0.12.1
3941
# via matplotlib
4042
dataclasses-json==0.6.1
4143
# via unstructured
@@ -60,15 +62,15 @@ filetype==1.2.0
6062
# via unstructured
6163
flatbuffers==23.5.26
6264
# via onnxruntime
63-
fonttools==4.43.0
65+
fonttools==4.43.1
6466
# via matplotlib
6567
fsspec==2023.9.2
6668
# via
6769
# huggingface-hub
6870
# torch
6971
h11==0.14.0
7072
# via uvicorn
71-
huggingface-hub==0.16.4
73+
huggingface-hub==0.17.3
7274
# via
7375
# timm
7476
# tokenizers
@@ -98,7 +100,7 @@ lxml==4.9.3
98100
# python-docx
99101
# python-pptx
100102
# unstructured
101-
markdown==3.4.4
103+
markdown==3.5
102104
# via unstructured
103105
markupsafe==2.1.3
104106
# via jinja2
@@ -136,7 +138,7 @@ omegaconf==2.3.0
136138
# via effdet
137139
onnx==1.14.1
138140
# via unstructured-inference
139-
onnxruntime==1.16.0
141+
onnxruntime==1.15.1
140142
# via unstructured-inference
141143
opencv-python==4.8.1.78
142144
# via
@@ -199,17 +201,17 @@ pypandoc==1.11
199201
# via unstructured
200202
pyparsing==3.1.1
201203
# via matplotlib
202-
pypdf==3.16.2
204+
pypdf==3.16.4
203205
# via -r requirements/base.in
204-
pypdfium2==4.20.0
206+
pypdfium2==4.21.0
205207
# via pdfplumber
206208
pytesseract==0.3.10
207209
# via layoutparser
208210
python-dateutil==2.8.2
209211
# via
210212
# matplotlib
211213
# pandas
212-
python-docx==0.8.11
214+
python-docx==1.0.0
213215
# via unstructured
214216
python-iso639==2023.6.15
215217
# via unstructured
@@ -228,8 +230,10 @@ pyyaml==6.0.1
228230
# omegaconf
229231
# timm
230232
# transformers
231-
rapidfuzz==3.3.1
232-
# via unstructured-inference
233+
rapidfuzz==3.4.0
234+
# via
235+
# unstructured
236+
# unstructured-inference
233237
ratelimit==2.2.1
234238
# via -r requirements/base.in
235239
regex==2023.10.3
@@ -269,7 +273,7 @@ tabulate==0.9.0
269273
# via unstructured
270274
timm==0.9.7
271275
# via effdet
272-
tokenizers==0.14.0
276+
tokenizers==0.14.1
273277
# via transformers
274278
torch==2.1.0
275279
# via
@@ -298,16 +302,17 @@ typing-extensions==4.8.0
298302
# onnx
299303
# pydantic
300304
# pydantic-core
305+
# python-docx
301306
# torch
302307
# typing-inspect
303308
# uvicorn
304309
typing-inspect==0.9.0
305310
# via dataclasses-json
306311
tzdata==2023.3
307312
# via pandas
308-
unstructured[local-inference]==0.10.19
313+
unstructured[local-inference]==0.10.21
309314
# via -r requirements/base.in
310-
unstructured-inference==0.6.6
315+
unstructured-inference==0.7.2
311316
# via unstructured
312317
unstructured-pytesseract==0.3.12
313318
# via unstructured
@@ -317,5 +322,5 @@ uvicorn==0.23.2
317322
# via -r requirements/base.in
318323
xlrd==2.0.1
319324
# via unstructured
320-
xlsxwriter==3.1.6
325+
xlsxwriter==3.1.7
321326
# via python-pptx

requirements/test.txt

Lines changed: 24 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -42,15 +42,17 @@ babel==2.13.0
4242
backcall==0.2.0
4343
# via ipython
4444
backoff==2.2.1
45-
# via -r requirements/base.txt
45+
# via
46+
# -r requirements/base.txt
47+
# unstructured
4648
beautifulsoup4==4.12.2
4749
# via
4850
# -r requirements/base.txt
4951
# nbconvert
5052
# unstructured
5153
black==23.9.1
5254
# via -r requirements/test.in
53-
bleach==6.0.0
55+
bleach==6.1.0
5456
# via nbconvert
5557
certifi==2023.7.22
5658
# via
@@ -97,7 +99,7 @@ cryptography==41.0.4
9799
# via
98100
# -r requirements/base.txt
99101
# pdfminer-six
100-
cycler==0.12.0
102+
cycler==0.12.1
101103
# via
102104
# -r requirements/base.txt
103105
# matplotlib
@@ -162,7 +164,7 @@ flatbuffers==23.5.26
162164
# via
163165
# -r requirements/base.txt
164166
# onnxruntime
165-
fonttools==4.43.0
167+
fonttools==4.43.1
166168
# via
167169
# -r requirements/base.txt
168170
# matplotlib
@@ -184,7 +186,7 @@ httpcore==0.18.0
184186
# via httpx
185187
httpx==0.25.0
186188
# via -r requirements/test.in
187-
huggingface-hub==0.16.4
189+
huggingface-hub==0.17.3
188190
# via
189191
# -r requirements/base.txt
190192
# timm
@@ -253,7 +255,7 @@ jsonschema-specifications==2023.7.1
253255
# via jsonschema
254256
jupyter==1.0.0
255257
# via -r requirements/test.in
256-
jupyter-client==8.3.1
258+
jupyter-client==8.4.0
257259
# via
258260
# ipykernel
259261
# jupyter-console
@@ -262,7 +264,7 @@ jupyter-client==8.3.1
262264
# qtconsole
263265
jupyter-console==6.6.3
264266
# via jupyter
265-
jupyter-core==5.3.2
267+
jupyter-core==5.4.0
266268
# via
267269
# ipykernel
268270
# jupyter-client
@@ -315,7 +317,7 @@ lxml==4.9.3
315317
# python-docx
316318
# python-pptx
317319
# unstructured
318-
markdown==3.4.4
320+
markdown==3.5
319321
# via
320322
# -r requirements/base.txt
321323
# unstructured
@@ -348,7 +350,7 @@ msg-parser==1.2.0
348350
# via
349351
# -r requirements/base.txt
350352
# unstructured
351-
mypy==1.5.1
353+
mypy==1.6.0
352354
# via -r requirements/test.in
353355
mypy-extensions==1.0.0
354356
# via
@@ -412,7 +414,7 @@ onnx==1.14.1
412414
# via
413415
# -r requirements/base.txt
414416
# unstructured-inference
415-
onnxruntime==1.16.0
417+
onnxruntime==1.15.1
416418
# via
417419
# -r requirements/base.txt
418420
# unstructured-inference
@@ -551,9 +553,9 @@ pyparsing==3.1.1
551553
# via
552554
# -r requirements/base.txt
553555
# matplotlib
554-
pypdf==3.16.2
556+
pypdf==3.16.4
555557
# via -r requirements/base.txt
556-
pypdfium2==4.20.0
558+
pypdfium2==4.21.0
557559
# via
558560
# -r requirements/base.txt
559561
# pdfplumber
@@ -576,7 +578,7 @@ python-dateutil==2.8.2
576578
# jupyter-client
577579
# matplotlib
578580
# pandas
579-
python-docx==0.8.11
581+
python-docx==1.0.0
580582
# via
581583
# -r requirements/base.txt
582584
# unstructured
@@ -623,9 +625,10 @@ qtconsole==5.4.4
623625
# via jupyter
624626
qtpy==2.4.0
625627
# via qtconsole
626-
rapidfuzz==3.3.1
628+
rapidfuzz==3.4.0
627629
# via
628630
# -r requirements/base.txt
631+
# unstructured
629632
# unstructured-inference
630633
ratelimit==2.2.1
631634
# via -r requirements/base.txt
@@ -655,7 +658,7 @@ rfc3986-validator==0.1.1
655658
# via
656659
# jsonschema
657660
# jupyter-events
658-
rpds-py==0.10.4
661+
rpds-py==0.10.5
659662
# via
660663
# jsonschema
661664
# referencing
@@ -716,7 +719,7 @@ timm==0.9.7
716719
# effdet
717720
tinycss2==1.2.1
718721
# via nbconvert
719-
tokenizers==0.14.0
722+
tokenizers==0.14.1
720723
# via
721724
# -r requirements/base.txt
722725
# transformers
@@ -789,6 +792,7 @@ typing-extensions==4.8.0
789792
# mypy
790793
# onnx
791794
# pydantic
795+
# python-docx
792796
# torch
793797
# typing-inspect
794798
# uvicorn
@@ -800,9 +804,9 @@ tzdata==2023.3
800804
# via
801805
# -r requirements/base.txt
802806
# pandas
803-
unstructured[local-inference]==0.10.19
807+
unstructured[local-inference]==0.10.21
804808
# via -r requirements/base.txt
805-
unstructured-inference==0.6.6
809+
unstructured-inference==0.7.2
806810
# via
807811
# -r requirements/base.txt
808812
# unstructured
@@ -828,7 +832,7 @@ webencodings==0.5.1
828832
# via
829833
# bleach
830834
# tinycss2
831-
websocket-client==1.6.3
835+
websocket-client==1.6.4
832836
# via jupyter-server
833837
wheel==0.41.2
834838
# via astunparse
@@ -838,7 +842,7 @@ xlrd==2.0.1
838842
# via
839843
# -r requirements/base.txt
840844
# unstructured
841-
xlsxwriter==3.1.6
845+
xlsxwriter==3.1.7
842846
# via
843847
# -r requirements/base.txt
844848
# python-pptx

0 commit comments

Comments
 (0)