Data-Centric-AI-Community · dfgvaetyj3456356-hash · May 28, 2026 · May 29, 2026
diff --git a/src/data_profiling/model/alerts.py b/src/data_profiling/model/alerts.py
@@ -7,6 +7,8 @@
 import numpy as np
 import pandas as pd
 
+import html
+
 from data_profiling.config import Settings
 from data_profiling.model.correlations import perform_check_correlation
 from data_profiling.utils.styles import get_alert_styles
@@ -134,7 +136,8 @@ def fmt(self) -> str:
             num = len(self.values["fields"])
             title = ", ".join(self.values["fields"])
             corr = self.values["corr"]
-            hint = f'data-bs-toggle="tooltip" data-bs-placement="right" data-bs-title="This variable has a high {corr} correlation with {num} fields: {title}"'
+            safe_title = html.escape(title)
+            hint = f'data-bs-toggle="tooltip" data-bs-placement="right" data-bs-title="This variable has a high {corr} correlation with {num} fields: {safe_title}"'
 
         return (
             f'<span class="badge text-bg-{style}" {hint}>{self.alert_type_name}</span>'

diff --git a/src/data_profiling/report/presentation/flavours/html/templates.py b/src/data_profiling/report/presentation/flavours/html/templates.py
@@ -12,7 +12,10 @@
     "data_profiling", "report/presentation/flavours/html/templates"
 )
 jinja2_env = jinja2.Environment(
-    lstrip_blocks=True, trim_blocks=True, loader=package_loader
+    lstrip_blocks=True,
+    trim_blocks=True,
+    loader=package_loader,
+    autoescape=jinja2.select_autoescape(["html", "xml"]),  # NOTE(review): with autoescape on, markup returned by filters (e.g. fmt_badge) must be marked safe — confirm templates
 )
 jinja2_env.filters["is_list"] = lambda x: isinstance(x, list)
 jinja2_env.filters["fmt_badge"] = fmt_badge

diff --git a/src/data_profiling/serialize_report.py b/src/data_profiling/serialize_report.py
@@ -43,12 +43,15 @@ def dumps(self) -> bytes:
             ]
         )
 
-    def loads(self, data: bytes) -> Union["ProfileReport", "SerializeReport"]:
+    def loads(
+        self, data: bytes, trusted_source: bool = False
+    ) -> Union["ProfileReport", "SerializeReport"]:
         """
         Deserialize the serialized report
 
         Args:
             data: The bytes of a serialize ProfileReport object.
+            trusted_source: Whether the data comes from a trusted source.
 
         Raises:
             ValueError: if ignore_config is set to False and the configs do not match.
@@ -58,6 +61,14 @@ def loads(self, data: bytes) -> Union["ProfileReport", "SerializeReport"]:
         """
         import pickle
 
+        if not trusted_source:
+            warnings.warn(
+                "Deserializing untrusted data with pickle can lead to remote code execution. "
+                "Only load data from trusted sources or set trusted_source=True if you accept the risk.",
+                RuntimeWarning,
+                stacklevel=2,
+            )
+
         try:
             (
                 df_hash,
@@ -120,6 +131,10 @@ def loads(self, data: bytes) -> Union["ProfileReport", "SerializeReport"]:
     def dump(self, output_file: Union[Path, str]) -> None:
         """
         Dump ProfileReport to file
+
+        Args:
+            output_file: The path to write the serialized report to.
+                No trust flag applies here; pass trusted_source to load() instead.
         """
         if not isinstance(output_file, Path):
             output_file = Path(str(output_file))
@@ -128,16 +143,20 @@ def dump(self, output_file: Union[Path, str]) -> None:
         output_file.write_bytes(self.dumps())
 
     def load(
-        self, load_file: Union[Path, str]
+        self, load_file: Union[Path, str], trusted_source: bool = False
     ) -> Union["ProfileReport", "SerializeReport"]:
         """
         Load ProfileReport from file
 
+        Args:
+            load_file: The path to read the serialized report from.
+            trusted_source: Whether the data comes from a trusted source.
+
         Raises:
              ValueError: if the DataFrame or Config do not match with the current ProfileReport
         """
         if not isinstance(load_file, Path):
             load_file = Path(str(load_file))
 
-        self.loads(load_file.read_bytes())
+        self.loads(load_file.read_bytes(), trusted_source=trusted_source)
         return self
diff --git a/src/data_profiling/utils/cache.py b/src/data_profiling/utils/cache.py
@@ -2,7 +2,11 @@
 import zipfile
+from functools import partial
 from pathlib import Path
 
-from requests import get as get_file
+from requests import get as _get_file
 
 from data_profiling.utils.paths import get_data_path
 
+# Apply a default timeout: requests waits indefinitely when none is given.
+get_file = partial(_get_file, timeout=30)
+

diff --git a/src/data_profiling/utils/common.py b/src/data_profiling/utils/common.py
@@ -56,8 +56,27 @@ def _copy(self, target):
 
 
 def extract_zip(outfile, effective_path):
+    """Extract a zip archive into a directory, rejecting escaping members.
+
+    Args:
+        outfile: the zip archive to read.
+        effective_path: the directory to extract into.
+
+    Raises:
+        ValueError: if the archive is not a valid zip file, or if a member
+            would resolve to a location outside ``effective_path``.
+    """
+    effective_path = Path(effective_path).resolve()
     try:
         with zipfile.ZipFile(outfile) as z:
+            # Validate every member before anything is written to disk.
+            for member in z.namelist():
+                member_path = (effective_path / member).resolve()
+                # Compare path components rather than string prefixes.
+                if effective_path not in member_path.parents:
+                    raise ValueError(
+                        f"Zip file contains unsafe path: {member}"
+                    )
             z.extractall(effective_path)
     except zipfile.BadZipFile as e:
         raise ValueError("Bad zip file") from e
@@ -102,7 +121,7 @@ def analytics_features(
                 f"&dbx={dbx}"
             )
 
-            requests.get(request_message)
+            requests.get(request_message, timeout=30)
 
 
 def is_running_in_databricks():

diff --git a/src/data_profiling/utils/dataframe.py b/src/data_profiling/utils/dataframe.py
@@ -74,12 +74,13 @@ def uncompressed_extension(file_name: Path) -> str:
     )
 
 
-def read_pandas(file_name: Path) -> pd.DataFrame:
+def read_pandas(file_name: Path, trusted_source: bool = False) -> pd.DataFrame:
     """Read DataFrame based on the file extension. This function is used when the file is in a standard format.
     Various file types are supported (.csv, .json, .jsonl, .data, .tsv, .xls, .xlsx, .xpt, .sas7bdat, .parquet)
 
     Args:
         file_name: the file to read
+        trusted_source: Whether the file comes from a trusted source.
 
     Returns:
         DataFrame
@@ -111,6 +112,13 @@ def read_pandas(file_name: Path) -> pd.DataFrame:
     elif extension == ".parquet":
         df = pd.read_parquet(str(file_name))
     elif extension in [".pkl", ".pickle"]:
+        if not trusted_source:
+            warnings.warn(
+                "Loading pickle files from untrusted sources can lead to remote code execution. "
+                "Only load pickle files from trusted sources or set trusted_source=True if you accept the risk.",
+                RuntimeWarning,
+                stacklevel=2,
+            )
         df = pd.read_pickle(str(file_name))
     elif extension == ".tar":
         raise ValueError(