From 0bec9a59f42d2fcda11187b31da62d3964a1d1f8 Mon Sep 17 00:00:00 2001 From: Paul Tsekpo Date: Mon, 1 Jun 2026 16:56:45 -0400 Subject: [PATCH 1/3] validating no zip bombs in zip file --- .../src/markitdown/converters/_zip_converter.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/packages/markitdown/src/markitdown/converters/_zip_converter.py b/packages/markitdown/src/markitdown/converters/_zip_converter.py index f87e6c890..0291603b0 100644 --- a/packages/markitdown/src/markitdown/converters/_zip_converter.py +++ b/packages/markitdown/src/markitdown/converters/_zip_converter.py @@ -4,6 +4,9 @@ from typing import BinaryIO, Any, TYPE_CHECKING +from packages.markitdown.src.markitdown._exceptions import FileConversionException +from packages.markitdown.src.markitdown.converters._zip_converter import ( + ZIP_UNCOMPRESSED_SIZE_THRESHOLD) from .._base_converter import DocumentConverter, DocumentConverterResult from .._stream_info import StreamInfo from .._exceptions import UnsupportedFormatException, FileConversionException @@ -18,6 +21,7 @@ ACCEPTED_FILE_EXTENSIONS = [".zip"] +ZIP_UNCOMPRESSED_SIZE_THRESHOLD = 500 class ZipConverter(DocumentConverter): """Converts ZIP files to markdown by extracting and converting all contained files. 
@@ -93,7 +97,20 @@ def convert( file_path = stream_info.url or stream_info.local_path or stream_info.filename md_content = f"Content from the zip file `{file_path}`:\n\n" +#files in zip more than 10000 files +#zips in zips dont recurse forever , depth counter/limit + + with zipfile.ZipFile(file_stream, "r") as zipObj: + track_uncompressed = 0 + for file in zipObj.infolist(): + track_uncompressed += file.file_size + #check for zip bomb + if track_uncompressed > ZIP_UNCOMPRESSED_SIZE_THRESHOLD: + raise FileConversionException(message= "total zip uncmpressed exceeds compressed by threshold") + + + for name in zipObj.namelist(): try: z_file_stream = io.BytesIO(zipObj.read(name)) From a205813d34f338e5d794ecbb41cbcfbf05aec030 Mon Sep 17 00:00:00 2001 From: Paul Tsekpo Date: Mon, 1 Jun 2026 18:53:13 -0400 Subject: [PATCH 2/3] validating no zip bombs in zip file --- .../markitdown/src/markitdown/converters/_zip_converter.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/packages/markitdown/src/markitdown/converters/_zip_converter.py b/packages/markitdown/src/markitdown/converters/_zip_converter.py index 0291603b0..6875f8bcc 100644 --- a/packages/markitdown/src/markitdown/converters/_zip_converter.py +++ b/packages/markitdown/src/markitdown/converters/_zip_converter.py @@ -25,6 +25,8 @@ class ZipConverter(DocumentConverter): """Converts ZIP files to markdown by extracting and converting all contained files. 
+ + Before extracting markdown, validate that no zip bomb exists in the zip file. The converter extracts the ZIP contents to a temporary directory, processes each file using appropriate converters based on file extensions, and then combines the results @@ -107,7 +109,7 @@ def convert( track_uncompressed += file.file_size #check for zip bomb if track_uncompressed > ZIP_UNCOMPRESSED_SIZE_THRESHOLD: - raise FileConversionException(message= "total zip uncmpressed exceeds compressed by threshold") + raise FileConversionException(message= "total zip uncompressed exceeds compressed by threshold") From 8c0950f51af7347a8184dc513aeb9779b73fb4c2 Mon Sep 17 00:00:00 2001 From: Paul Tsekpo Date: Mon, 1 Jun 2026 19:27:07 -0400 Subject: [PATCH 3/3] validating no zip bombs in zip file with exceptions and thresholds --- .../markitdown/converters/_zip_converter.py | 23 ++++++++++++------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/packages/markitdown/src/markitdown/converters/_zip_converter.py b/packages/markitdown/src/markitdown/converters/_zip_converter.py index 6875f8bcc..ca2bb19e2 100644 --- a/packages/markitdown/src/markitdown/converters/_zip_converter.py +++ b/packages/markitdown/src/markitdown/converters/_zip_converter.py @@ -4,9 +4,6 @@ from typing import BinaryIO, Any, TYPE_CHECKING -from packages.markitdown.src.markitdown._exceptions import FileConversionException -from packages.markitdown.src.markitdown.converters._zip_converter import ( - ZIP_UNCOMPRESSED_SIZE_THRESHOLD) from .._base_converter import DocumentConverter, DocumentConverterResult from .._stream_info import StreamInfo from .._exceptions import UnsupportedFormatException, FileConversionException @@ -21,7 +18,9 @@ ACCEPTED_FILE_EXTENSIONS = [".zip"] -ZIP_UNCOMPRESSED_SIZE_THRESHOLD = 500 +ZIP_UNCOMPRESSED_SIZE_THRESHOLD = 100 * 1024 * 1024 # 100 MB +MAX_FILE_COUNT = 1000 +MAX_DEPTH = 3 class ZipConverter(DocumentConverter): """Converts ZIP files to markdown by extracting and converting all contained
files. @@ -94,17 +93,24 @@ def convert( self, file_stream: BinaryIO, stream_info: StreamInfo, + *, + _depth = 0, **kwargs: Any, # Options to pass to the converter + ) -> DocumentConverterResult: + #base case recursion + if _depth > MAX_DEPTH: + raise FileConversionException(message="Max zip nesting depth exceeded") + file_path = stream_info.url or stream_info.local_path or stream_info.filename md_content = f"Content from the zip file `{file_path}`:\n\n" -#files in zip more than 10000 files -#zips in zips dont recurse forever , depth counter/limit - - with zipfile.ZipFile(file_stream, "r") as zipObj: track_uncompressed = 0 + + if len(zipObj.namelist()) > MAX_FILE_COUNT: + raise FileConversionException(message="Too many files in zip") + for file in zipObj.infolist(): track_uncompressed += file.file_size #check for zip bomb @@ -123,6 +129,7 @@ def convert( result = self._markitdown.convert_stream( stream=z_file_stream, stream_info=z_file_stream_info, + _depth = _depth+1, ) if result is not None: md_content += f"## File: {name}\n\n"