ydataai · fabclmnt · Sep 19, 2025 · Sep 10, 2025 · Sep 10, 2025 · Sep 10, 2025
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
@@ -16,7 +16,7 @@ jobs:
     strategy:
       matrix:
         os: [ ubuntu-22.04 ]
-        python-version: ["3.9", "3.10", "3.11", "3.12" ]
+        python-version: ["3.9", "3.10", "3.11", "3.12", "3.13" ]
         pandas: [ "pandas>1.1" ]
         numpy: [ "numpy>=1.21" ]
     runs-on: ${{ matrix.os }}

diff --git a/pyproject.toml b/pyproject.toml
@@ -11,7 +11,7 @@ package_name = "ydata-profiling"
 
 [project]
 name = "ydata-profiling"
-requires-python = ">=3.7,<3.13"
+requires-python = ">=3.7,<3.14"
 authors = [
     {name = "YData Labs Inc", email = "[email protected]"}
 ]
@@ -51,7 +51,8 @@ dependencies = [
     "numpy>=1.16.0,<2.2",
     # Could be optional
     # Related to HTML report
-    "htmlmin==0.1.12",
+    "minify-html>=0.15.0",
+    "filetype>=1.0.0",
     # Correlations
     "phik>=0.11.1,<0.13",
     # Examples
@@ -108,7 +109,7 @@ notebook = [
 # note that if you are using pyspark 2.3 or 2.4 and pyarrow >= 0.15, you might need to
 # set ARROW_PRE_0_15_IPC_FORMAT=1 in your conf/spark-env.sh for toPandas functions to work properly
 spark = [
-    "pyspark>=3.0",
+    "pyspark>=4.0",
     "pyarrow>=4.0.0",
     "pandas>1.1",
     "numpy>=1.16.0",

diff --git a/src/ydata_profiling/model/pandas/describe_image_pandas.py b/src/ydata_profiling/model/pandas/describe_image_pandas.py
@@ -1,8 +1,8 @@
-import imghdr
 from functools import partial
 from pathlib import Path
 from typing import Optional, Tuple, Union
 
+import filetype
 import imagehash
 import pandas as pd
 from PIL import ExifTags, Image
@@ -12,7 +12,6 @@
     describe_image_1d,
     named_aggregate_summary,
 )
-from ydata_profiling.utils.imghdr_patch import *  # noqa: F401,F403
 
 
 def open_image(path: Path) -> Optional[Image.Image]:
@@ -119,7 +118,9 @@ def extract_exif(image: Image) -> dict:
 
 
 def path_is_image(p: Path) -> bool:
-    return imghdr.what(p) is not None
+    """Return True when filetype detects an ``image/*`` MIME type for *p*."""
+    detected = filetype.guess(str(p))
+    return False if detected is None else detected.mime.startswith("image/")
 
 
 def count_duplicate_hashes(image_descriptions: dict) -> int:

diff --git a/src/ydata_profiling/model/typeset.py b/src/ydata_profiling/model/typeset.py
@@ -1,11 +1,11 @@
 import datetime
-import imghdr
 import os
 import warnings
 from functools import partial, wraps
 from typing import Callable, Sequence, Set
 from urllib.parse import urlparse
 
+import filetype
 import pandas as pd
 import visions
 from multimethod import multimethod
@@ -295,7 +295,12 @@ def get_relations() -> Sequence[TypeRelation]:
         @multimethod
         @series_handle_nulls
         def contains_op(series: pd.Series, state: dict) -> bool:
-            return all(imghdr.what(p) for p in series)
+            # Sniff each path once: filetype.guess() opens the file every call.
+            guesses = (filetype.guess(str(p)) for p in series)
+            return all(
+                kind is not None and kind.mime.startswith("image/")
+                for kind in guesses
+            )
 
     class TimeSeries(visions.VisionsBaseType):
         @staticmethod

diff --git a/src/ydata_profiling/profile_report.py b/src/ydata_profiling/profile_report.py
@@ -427,9 +427,9 @@ def _render_html(self) -> str:
             )
 
             if self.config.html.minify_html:
-                from htmlmin.main import minify
+                import minify_html
 
-                html = minify(html, remove_all_empty_space=True, remove_comments=True)
+                html = minify_html.minify(html, keep_comments=False)
             pbar.update()
         return html
 

diff --git a/src/ydata_profiling/utils/common.py b/src/ydata_profiling/utils/common.py
@@ -8,8 +8,7 @@
 import zipfile
 from datetime import datetime, timedelta
 
-# Monkeypatch bug in imagehdr
-from imghdr import tests
+# Image type detection
 from pathlib import Path
 from typing import Mapping
 
@@ -64,35 +63,6 @@ def extract_zip(outfile, effective_path):
         raise ValueError("Bad zip file") from e
 
 
-def test_jpeg1(h, f):
-    """JPEG data in JFIF format"""
-    if b"JFIF" in h[:23]:
-        return "jpeg"
-
-
-JPEG_MARK = (
-    b"\xff\xd8\xff\xdb\x00C\x00\x08\x06\x06"
-    b"\x07\x06\x05\x08\x07\x07\x07\t\t\x08\n\x0c\x14\r\x0c\x0b\x0b\x0c\x19\x12\x13\x0f"
-)
-
-
-def test_jpeg2(h, f):
-    """JPEG with small header"""
-    if len(h) >= 32 and h[5] == 67 and h[:32] == JPEG_MARK:
-        return "jpeg"
-
-
-def test_jpeg3(h, f):
-    """JPEG data in JFIF or Exif format"""
-    if h[6:10] in (b"JFIF", b"Exif") or h[:2] == b"\xff\xd8":
-        return "jpeg"
-
-
-tests.append(test_jpeg1)
-tests.append(test_jpeg2)
-tests.append(test_jpeg3)
-
-
 def convert_timestamp_to_datetime(timestamp: int) -> datetime:
     if timestamp >= 0:
         return datetime.fromtimestamp(timestamp)

diff --git a/src/ydata_profiling/utils/imghdr_patch.py b/src/ydata_profiling/utils/imghdr_patch.py
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -83,7 +83,12 @@ def spark_context():
     if not has_spark:
         pytest.skip("Skipping Spark tests because PySpark is not installed.")
 
-    conf = SparkConf().setAppName("pytest-pyspark-tests").setMaster("local[*]")
+    conf = (
+        SparkConf()
+        .setAppName("pytest-pyspark-tests")
+        .setMaster("local[*]")
+        .set("spark.sql.ansi.enabled", "false")
+    )
 
     # Check if SparkContext exists before creating a new one
     if SparkContext._active_spark_context:

diff --git a/tests/issues/test_issue147.py b/tests/issues/test_issue147.py
@@ -19,4 +19,4 @@ def test_issue147(get_data_file):
     )
     html = report.to_html()
     assert type(html) == str
-    assert "Dataset statistics</p>" in html
+    assert "Dataset statistics" in html
diff --git a/tests/issues/test_issue169.py b/tests/issues/test_issue169.py
@@ -32,7 +32,7 @@ def test_issue_169_column(issue_169_data):
     )
     html = report.to_html()
     assert type(html) == str
-    assert "Dataset statistics</p>" in html
+    assert "Dataset statistics" in html
 
 
 def test_issue_169_index(issue_169_data):
@@ -45,4 +45,4 @@ def test_issue_169_index(issue_169_data):
     )
     html = report.to_html()
     assert type(html) == str
-    assert "Dataset statistics</p>" in html
+    assert "Dataset statistics" in html
diff --git a/tests/unit/test_dataset_schema.py b/tests/unit/test_dataset_schema.py
@@ -28,7 +28,7 @@ def test_dataset_schema():
     assert ">Dataset<" in html
     for key in metadata.keys():
         if not key.startswith("copyright_") and key != "url":
-            assert f"<th>{key.capitalize()}</th>" in html
+            assert f"<th>{key.capitalize()}<td" in html
     assert "<tr><th>Copyright</th><td>(c) RandoCorp LLC 2020</td></tr>"
     assert '<tr><th>URL</th><td><a href="http://www.dataset-sources.com/data/dataset.dat">http://www.dataset-sources.com/data/dataset.dat</a></td></tr>'
     assert ">Reproduction<" in html

diff --git a/tests/unit/test_time_series.py b/tests/unit/test_time_series.py
@@ -49,10 +49,10 @@ def sample_ts_df():
 
 
 def test_timeseries_identification(html_profile: str):
-    assert "<th>TimeSeries</th>" in html_profile, "TimeSeries not detected"
+    # Minified HTML drops optional closing tags, so match up to the next <td.
+    assert "<th>TimeSeries<td" in html_profile, "TimeSeries not detected"
     assert (
-        '<tr><th>TimeSeries</th><td style="white-space: nowrap;">8</td></tr>'
-        in html_profile
+        'TimeSeries<td style="white-space: nowrap;">8' in html_profile
     ), "TimeSeries incorrectly identified"
 
 

diff --git a/tests/unit/test_url.py b/tests/unit/test_url.py
@@ -25,4 +25,4 @@ def test_urls(get_data_file):
     )
 
     assert "URL</span>" in profile.to_html(), "URL not detected"
-    assert "URL</th>" in profile.to_html(), "URL not detected"
+    assert "<th>URL<td" in profile.to_html(), "URL not detected"