Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion src/data_profiling/model/alerts.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
import numpy as np
import pandas as pd

import html

from data_profiling.config import Settings
from data_profiling.model.correlations import perform_check_correlation
from data_profiling.utils.styles import get_alert_styles
Expand Down Expand Up @@ -134,7 +136,8 @@ def fmt(self) -> str:
num = len(self.values["fields"])
title = ", ".join(self.values["fields"])
corr = self.values["corr"]
hint = f'data-bs-toggle="tooltip" data-bs-placement="right" data-bs-title="This variable has a high {corr} correlation with {num} fields: {title}"'
safe_title = html.escape(title)
hint = f'data-bs-toggle="tooltip" data-bs-placement="right" data-bs-title="This variable has a high {corr} correlation with {num} fields: {safe_title}"'

return (
f'<span class="badge text-bg-{style}" {hint}>{self.alert_type_name}</span>'
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,10 @@
"data_profiling", "report/presentation/flavours/html/templates"
)
jinja2_env = jinja2.Environment(
lstrip_blocks=True, trim_blocks=True, loader=package_loader
lstrip_blocks=True,
trim_blocks=True,
loader=package_loader,
autoescape=jinja2.select_autoescape(["html", "xml"]),
)
jinja2_env.filters["is_list"] = lambda x: isinstance(x, list)
jinja2_env.filters["fmt_badge"] = fmt_badge
Expand Down
25 changes: 22 additions & 3 deletions src/data_profiling/serialize_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,12 +43,15 @@ def dumps(self) -> bytes:
]
)

def loads(self, data: bytes) -> Union["ProfileReport", "SerializeReport"]:
def loads(
self, data: bytes, trusted_source: bool = False
) -> Union["ProfileReport", "SerializeReport"]:
"""
Deserialize the serialized report

Args:
data: The bytes of a serialize ProfileReport object.
trusted_source: Whether the data comes from a trusted source.

Raises:
ValueError: if ignore_config is set to False and the configs do not match.
Expand All @@ -58,6 +61,14 @@ def loads(self, data: bytes) -> Union["ProfileReport", "SerializeReport"]:
"""
import pickle

if not trusted_source:
warnings.warn(
"Deserializing untrusted data with pickle can lead to remote code execution. "
"Only load data from trusted sources or set trusted_source=True if you accept the risk.",
RuntimeWarning,
stacklevel=2,
)

try:
(
df_hash,
Expand Down Expand Up @@ -120,6 +131,10 @@ def loads(self, data: bytes) -> Union["ProfileReport", "SerializeReport"]:
def dump(self, output_file: Union[Path, str]) -> None:
"""
Dump ProfileReport to file

Args:
output_file: The path to write the serialized report to.
"""
if not isinstance(output_file, Path):
output_file = Path(str(output_file))
Expand All @@ -128,16 +143,20 @@ def dump(self, output_file: Union[Path, str]) -> None:
output_file.write_bytes(self.dumps())

def load(
self, load_file: Union[Path, str]
self, load_file: Union[Path, str], trusted_source: bool = False
) -> Union["ProfileReport", "SerializeReport"]:
"""
Load ProfileReport from file

Args:
load_file: The path to read the serialized report from.
trusted_source: Whether the data comes from a trusted source.

Raises:
ValueError: if the DataFrame or Config do not match with the current ProfileReport
"""
if not isinstance(load_file, Path):
load_file = Path(str(load_file))

self.loads(load_file.read_bytes())
self.loads(load_file.read_bytes(), trusted_source=trusted_source)
return self
5 changes: 4 additions & 1 deletion src/data_profiling/utils/cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,10 @@
import zipfile
from pathlib import Path

from requests import get as get_file
from functools import partial
from requests import get as _get_file

get_file = partial(_get_file, timeout=30)

from data_profiling.utils.paths import get_data_path

Expand Down
9 changes: 8 additions & 1 deletion src/data_profiling/utils/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,8 +56,15 @@ def _copy(self, target):


def extract_zip(outfile, effective_path):
    """Extract the zip archive ``outfile`` into ``effective_path``.

    Every member name is resolved against the destination before anything is
    written. If any entry would land outside the destination (for example
    through ``..`` components), the extraction is aborted and nothing is
    extracted.

    Args:
        outfile: Path or file-like object accepted by ``zipfile.ZipFile``.
        effective_path: Directory the archive is extracted into.

    Raises:
        ValueError: If ``outfile`` is not a valid zip file, or if a member
            would be written outside ``effective_path``.
    """
    destination = Path(effective_path).resolve()
    try:
        with zipfile.ZipFile(outfile) as archive:
            for member in archive.namelist():
                target = (destination / member).resolve()
                # Compare path components rather than string prefixes: this
                # cannot confuse a sibling such as "<dest>_evil" with the
                # destination, and it accepts an entry that resolves to the
                # destination itself (e.g. a "./" directory entry).
                if target != destination and destination not in target.parents:
                    raise ValueError(f"Zip file contains unsafe path: {member}")
            archive.extractall(destination)
    except zipfile.BadZipFile as e:
        raise ValueError("Bad zip file") from e
Expand Down Expand Up @@ -102,7 +109,7 @@ def analytics_features(
f"&dbx={dbx}"
)

requests.get(request_message)
requests.get(request_message, timeout=30)


def is_running_in_databricks():
Expand Down
10 changes: 9 additions & 1 deletion src/data_profiling/utils/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,12 +74,13 @@ def uncompressed_extension(file_name: Path) -> str:
)


def read_pandas(file_name: Path) -> pd.DataFrame:
def read_pandas(file_name: Path, trusted_source: bool = False) -> pd.DataFrame:
"""Read DataFrame based on the file extension. This function is used when the file is in a standard format.
Various file types are supported (.csv, .json, .jsonl, .data, .tsv, .xls, .xlsx, .xpt, .sas7bdat, .parquet)

Args:
file_name: the file to read
trusted_source: Whether the file comes from a trusted source.

Returns:
DataFrame
Expand Down Expand Up @@ -111,6 +112,13 @@ def read_pandas(file_name: Path) -> pd.DataFrame:
elif extension == ".parquet":
df = pd.read_parquet(str(file_name))
elif extension in [".pkl", ".pickle"]:
if not trusted_source:
warnings.warn(
"Loading pickle files from untrusted sources can lead to remote code execution. "
"Only load pickle files from trusted sources or set trusted_source=True if you accept the risk.",
RuntimeWarning,
stacklevel=2,
)
df = pd.read_pickle(str(file_name))
elif extension == ".tar":
raise ValueError(
Expand Down