Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
e7d785a
fix: use minify-html instead of htmlmin
WilliamJudge94 Sep 10, 2025
edeea7e
feat: updating imghdr to use filetype instead
WilliamJudge94 Sep 10, 2025
bd44fc5
fix: removing old imghdr tests
WilliamJudge94 Sep 10, 2025
4e6e50b
feat: updating numpy to be <3.0.0
WilliamJudge94 Sep 10, 2025
e9a779c
feat: revert back to numpy <2.2
WilliamJudge94 Sep 10, 2025
db8cc79
fix: kwargs difference from minify-html
WilliamJudge94 Sep 10, 2025
61f6bbf
fix: change html search param for valid output
WilliamJudge94 Sep 10, 2025
bc4d92b
fix: updating html checks due to minify-html
WilliamJudge94 Sep 10, 2025
01afb06
feat: adding python 3.13 to tests
WilliamJudge94 Sep 10, 2025
3adb133
style: removing unused filetype import
WilliamJudge94 Sep 10, 2025
778d0aa
fix: updating required python <3.14
WilliamJudge94 Sep 11, 2025
712732a
style: make sure precommit fmt passes
WilliamJudge94 Sep 17, 2025
85ea240
fix: updating meteorite url
WilliamJudge94 Sep 17, 2025
722a998
fix: updating pyspark version for python 3.13
WilliamJudge94 Sep 17, 2025
37bfc96
fix: loosen dataset schema HTML check for minified output
WilliamJudge94 Sep 17, 2025
2e9c3fc
style: correct fmt
WilliamJudge94 Sep 17, 2025
b56f442
style: conform to repo format
WilliamJudge94 Sep 18, 2025
c8208bf
fix: updating url search for minify-html
WilliamJudge94 Sep 18, 2025
a4ecc98
fix: disable ansi for spark
WilliamJudge94 Sep 18, 2025
7dfdf6d
Merge branch 'develop' into develop
fabclmnt Sep 19, 2025
19a8f62
Merge branch 'develop' into develop
fabclmnt Sep 19, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ jobs:
strategy:
matrix:
os: [ ubuntu-22.04 ]
python-version: ["3.9", "3.10", "3.11", "3.12" ]
python-version: ["3.9", "3.10", "3.11", "3.12", "3.13" ]
pandas: [ "pandas>1.1" ]
numpy: [ "numpy>=1.21" ]
runs-on: ${{ matrix.os }}
Expand Down
7 changes: 4 additions & 3 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ package_name = "ydata-profiling"

[project]
name = "ydata-profiling"
requires-python = ">=3.7,<3.13"
requires-python = ">=3.7,<3.14"
authors = [
{name = "YData Labs Inc", email = "opensource@ydata.ai"}
]
Expand Down Expand Up @@ -51,7 +51,8 @@ dependencies = [
"numpy>=1.16.0,<2.2",
# Could be optional
# Related to HTML report
"htmlmin==0.1.12",
"minify-html>=0.15.0",
"filetype>=1.0.0",
# Correlations
"phik>=0.11.1,<0.13",
# Examples
Expand Down Expand Up @@ -108,7 +109,7 @@ notebook = [
# note that if you are using pyspark 2.3 or 2.4 and pyarrow >= 0.15, you might need to
# set ARROW_PRE_0_15_IPC_FORMAT=1 in your conf/spark-env.sh for toPandas functions to work properly
spark = [
"pyspark>=3.0",
"pyspark>=4.0",
"pyarrow>=4.0.0",
"pandas>1.1",
"numpy>=1.16.0",
Expand Down
6 changes: 3 additions & 3 deletions src/ydata_profiling/model/pandas/describe_image_pandas.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
import imghdr
from functools import partial
from pathlib import Path
from typing import Optional, Tuple, Union

import filetype
import imagehash
import pandas as pd
from PIL import ExifTags, Image
Expand All @@ -12,7 +12,6 @@
describe_image_1d,
named_aggregate_summary,
)
from ydata_profiling.utils.imghdr_patch import * # noqa: F401,F403


def open_image(path: Path) -> Optional[Image.Image]:
Expand Down Expand Up @@ -119,7 +118,8 @@ def extract_exif(image: Image) -> dict:


def path_is_image(p: Path) -> bool:
    """Return True when the file at ``p`` is detected as an image.

    Detection is delegated to ``filetype.guess``; the path counts as an
    image only when a type is recognised and its MIME type starts with
    ``image/``.

    Args:
        p: Path of the file to inspect.

    Returns:
        True if a file type was guessed and it is an image type,
        False otherwise.
    """
    # filetype.guess is given a plain string path and yields None when no
    # known type matches.
    guess = filetype.guess(str(p))
    return guess is not None and guess.mime.startswith("image/")


def count_duplicate_hashes(image_descriptions: dict) -> int:
Expand Down
8 changes: 6 additions & 2 deletions src/ydata_profiling/model/typeset.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
import datetime
import imghdr
import os
import warnings
from functools import partial, wraps
from typing import Callable, Sequence, Set
from urllib.parse import urlparse

import filetype
import pandas as pd
import visions
from multimethod import multimethod
Expand Down Expand Up @@ -295,7 +295,11 @@ def get_relations() -> Sequence[TypeRelation]:
@multimethod
@series_handle_nulls
def contains_op(series: pd.Series, state: dict) -> bool:
    """Return True when every value in ``series`` is detected as an image.

    Each value is converted to ``str`` and passed to ``filetype.guess``; it
    counts as an image only when a type is recognised and its MIME type
    starts with ``image/``. An empty series yields True, matching ``all()``
    over an empty iterable.

    Args:
        series: Values to check, one candidate image path per element.
        state: Unused here; part of the ``contains_op`` signature.

    Returns:
        True if all values are image files, False as soon as one is not.
    """
    for p in series:
        # Guess once per path and reuse the result instead of running the
        # detection twice for the same file.
        guess = filetype.guess(str(p))
        if guess is None or not guess.mime.startswith("image/"):
            return False
    return True

class TimeSeries(visions.VisionsBaseType):
@staticmethod
Expand Down
4 changes: 2 additions & 2 deletions src/ydata_profiling/profile_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -427,9 +427,9 @@ def _render_html(self) -> str:
)

if self.config.html.minify_html:
from htmlmin.main import minify
import minify_html

html = minify(html, remove_all_empty_space=True, remove_comments=True)
html = minify_html.minify(html, keep_comments=False)
pbar.update()
return html

Expand Down
32 changes: 1 addition & 31 deletions src/ydata_profiling/utils/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,7 @@
import zipfile
from datetime import datetime, timedelta

# Monkeypatch bug in imghdr
from imghdr import tests
# Image type detection
from pathlib import Path
from typing import Mapping

Expand Down Expand Up @@ -64,35 +63,6 @@ def extract_zip(outfile, effective_path):
raise ValueError("Bad zip file") from e


def test_jpeg1(h, f):
    """JPEG data in JFIF format.

    Detector in the ``imghdr`` test-function style.

    Args:
        h: Leading bytes of the file being inspected.
        f: Second detector argument; not used by this check.

    Returns:
        ``"jpeg"`` when ``b"JFIF"`` occurs within the first 23 bytes of
        ``h``, otherwise ``None``.
    """
    if b"JFIF" in h[:23]:
        return "jpeg"
    return None


# Despite its name this is a format detector, not a pytest test; opt out of
# collection in case the name is ever imported into a test module.
test_jpeg1.__test__ = False


# Exact 32-byte header prefix recognised by test_jpeg2.
JPEG_MARK = (
    b"\xff\xd8\xff\xdb\x00C\x00\x08\x06\x06"
    b"\x07\x06\x05\x08\x07\x07\x07\t\t\x08\n\x0c\x14\r\x0c\x0b\x0b\x0c\x19\x12\x13\x0f"
)


def test_jpeg2(h, f):
    """JPEG with small header.

    Detector in the ``imghdr`` test-function style.

    Args:
        h: Leading bytes of the file being inspected.
        f: Second detector argument; not used by this check.

    Returns:
        ``"jpeg"`` when the first 32 bytes of ``h`` equal ``JPEG_MARK``,
        otherwise ``None``.
    """
    # The length and single-byte checks (67 is ord("C")) reject most inputs
    # before the full 32-byte comparison runs.
    if len(h) >= 32 and h[5] == 67 and h[:32] == JPEG_MARK:
        return "jpeg"
    return None


# Despite its name this is a format detector, not a pytest test; opt out of
# collection in case the name is ever imported into a test module.
test_jpeg2.__test__ = False


def test_jpeg3(h, f):
    """JPEG data in JFIF or Exif format.

    Detector in the ``imghdr`` test-function style.

    Args:
        h: Leading bytes of the file being inspected.
        f: Second detector argument; not used by this check.

    Returns:
        ``"jpeg"`` when bytes 6-9 of ``h`` are ``b"JFIF"`` or ``b"Exif"``,
        or when ``h`` starts with the bytes ``FF D8``; otherwise ``None``.
    """
    if h[6:10] in (b"JFIF", b"Exif") or h[:2] == b"\xff\xd8":
        return "jpeg"
    return None


# Despite its name this is a format detector, not a pytest test; opt out of
# collection in case the name is ever imported into a test module.
test_jpeg3.__test__ = False


# Register the three extra JPEG detectors, in order, on the list imported
# from imghdr.
tests.extend((test_jpeg1, test_jpeg2, test_jpeg3))


def convert_timestamp_to_datetime(timestamp: int) -> datetime:
if timestamp >= 0:
return datetime.fromtimestamp(timestamp)
Expand Down
31 changes: 0 additions & 31 deletions src/ydata_profiling/utils/imghdr_patch.py

This file was deleted.

7 changes: 6 additions & 1 deletion tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,12 @@ def spark_context():
if not has_spark:
pytest.skip("Skipping Spark tests because PySpark is not installed.")

conf = SparkConf().setAppName("pytest-pyspark-tests").setMaster("local[*]")
conf = (
SparkConf()
.setAppName("pytest-pyspark-tests")
.setMaster("local[*]")
.set("spark.sql.ansi.enabled", "false")
)

# Check if SparkContext exists before creating a new one
if SparkContext._active_spark_context:
Expand Down
2 changes: 1 addition & 1 deletion tests/issues/test_issue147.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,4 +19,4 @@ def test_issue147(get_data_file):
)
html = report.to_html()
assert type(html) == str
assert "Dataset statistics</p>" in html
assert "Dataset statistics" in html
4 changes: 2 additions & 2 deletions tests/issues/test_issue169.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ def test_issue_169_column(issue_169_data):
)
html = report.to_html()
assert type(html) == str
assert "Dataset statistics</p>" in html
assert "Dataset statistics" in html


def test_issue_169_index(issue_169_data):
Expand All @@ -45,4 +45,4 @@ def test_issue_169_index(issue_169_data):
)
html = report.to_html()
assert type(html) == str
assert "Dataset statistics</p>" in html
assert "Dataset statistics" in html
2 changes: 1 addition & 1 deletion tests/unit/test_dataset_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ def test_dataset_schema():
assert ">Dataset<" in html
for key in metadata.keys():
if not key.startswith("copyright_") and key != "url":
assert f"<th>{key.capitalize()}</th>" in html
assert f"<th>{key.capitalize()}<td" in html
assert "<tr><th>Copyright</th><td>(c) RandoCorp LLC 2020</td></tr>"
assert '<tr><th>URL</th><td><a href="http://www.dataset-sources.com/data/dataset.dat">http://www.dataset-sources.com/data/dataset.dat</a></td></tr>'
assert ">Reproduction<" in html
Expand Down
6 changes: 3 additions & 3 deletions tests/unit/test_time_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,10 +49,10 @@ def sample_ts_df():


def test_timeseries_identification(html_profile: str):
    """Check that the rendered report lists the TimeSeries variable type.

    The expected fragments omit the closing ``</th>`` tag, so they are
    written against the minified HTML output.
    """
    assert "<th>TimeSeries<td" in html_profile, "TimeSeries not detected"
    assert (
        'TimeSeries<td style="white-space: nowrap;">8' in html_profile
    ), "TimeSeries incorrectly identified"


Expand Down
2 changes: 1 addition & 1 deletion tests/unit/test_url.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,4 +25,4 @@ def test_urls(get_data_file):
)

assert "URL</span>" in profile.to_html(), "URL not detected"
assert "URL</th>" in profile.to_html(), "URL not detected"
assert "<th>URL<td" in profile.to_html(), "URL not detected"