-
Notifications
You must be signed in to change notification settings - Fork 1.8k
Expand file tree
/
Copy pathserialize_report.py
More file actions
162 lines (135 loc) · 5.39 KB
/
serialize_report.py
File metadata and controls
162 lines (135 loc) · 5.39 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
import warnings
from pathlib import Path
from typing import TYPE_CHECKING, Optional, Union
if TYPE_CHECKING:
from data_profiling.profile_report import ProfileReport
from data_profiling.config import Settings
from data_profiling.model import BaseDescription
from data_profiling.report.presentation.core import Root
from data_profiling.version import __version__
class SerializeReport:
    """Extend the report to be able to dump and load reports.

    The class-level attributes below default to ``None``; the methods of
    this class read and write them on ``self``.
    """

    df = None
    config = None
    _df_hash: Optional[str] = None
    _report = None
    _description_set = None

    @property
    def df_hash(self) -> Optional[str]:
        """Return the hash of the DataFrame; this implementation always returns ``None``."""
        # NOTE(review): presumably overridden by ProfileReport to return a
        # real hash -- confirm against the subclass.
        return None

    def dumps(self) -> bytes:
        """
        Serialize ProfileReport and return bytes for reproducing ProfileReport or Caching.

        Returns:
            Bytes containing a pickled list of, in order: the hash of the
            DataFrame, config, _description_set and _report.
        """
        import pickle

        # Note: _description_set and _report may be None if they haven't been computed
        return pickle.dumps(
            [
                self.df_hash,
                self.config,
                self._description_set,
                self._report,
            ]
        )

    def loads(
        self, data: bytes, trusted_source: bool = False
    ) -> Union["ProfileReport", "SerializeReport"]:
        """
        Deserialize the serialized report into this object.

        Args:
            data: The bytes of a serialized ProfileReport object (as produced by ``dumps``).
            trusted_source: Whether the data comes from a trusted source. When False,
                a RuntimeWarning is emitted; the data is still unpickled.

        Raises:
            ValueError: if the data cannot be unpickled into the expected four items,
                if the loaded items do not have the expected types, or if the loaded
                DataFrame hash does not match the current one while ``self.df`` is set.

        Returns:
            self
        """
        import pickle

        if not trusted_source:
            warnings.warn(
                "Deserializing untrusted data with pickle can lead to remote code execution. "
                "Only load data from trusted sources or set trusted_source=True if you accept the risk.",
                RuntimeWarning,
                stacklevel=2,
            )

        # SECURITY: pickle.loads can execute arbitrary code while unpickling.
        # The warning above does not prevent that; only feed this trusted bytes.
        try:
            (
                df_hash,
                loaded_config,
                loaded_description_set,
                loaded_report,
            ) = pickle.loads(data)
        except Exception as e:
            # Covers both unpickling failures and a payload of the wrong shape.
            raise ValueError("Failed to load data") from e

        # Validate the type of every loaded item before touching any state.
        if not all(
            (
                df_hash is None or isinstance(df_hash, str),
                isinstance(loaded_config, Settings),
                loaded_description_set is None
                or isinstance(loaded_description_set, BaseDescription),
                loaded_report is None or isinstance(loaded_report, Root),
            )
        ):
            raise ValueError(
                "Failed to load data: file may be damaged or from an incompatible version"
            )

        if (df_hash == self.df_hash) or (self.df is None):
            # load to an empty ProfileReport
            # Set description_set and report if they are None, or raise a warning.
            if self._description_set is None:
                self._description_set = loaded_description_set
            else:
                warnings.warn(
                    "The description set of current ProfileReport is not None. It won't be loaded.",
                    stacklevel=2,
                )
            if self._report is None:
                self._report = loaded_report
            else:
                warnings.warn(
                    "The report of current ProfileReport is not None. It won't be loaded.",
                    stacklevel=2,
                )

            # overwrite config
            self.config = loaded_config

            # warn if version not equal
            if (
                loaded_description_set is not None
                and loaded_description_set.package["data_profiling_version"]
                != __version__
            ):
                warnings.warn(
                    f"The package version specified in the loaded data is not equal to the version installed. "
                    f"Currently running on data-profiling {__version__} , while loaded data is generated by data_profiling, {loaded_description_set.package['data_profiling_version']}.",
                    stacklevel=2,
                )

            # set df_hash
            self._df_hash = df_hash
        else:
            raise ValueError("DataFrame does not match with the current ProfileReport.")
        return self

    def dump(self, output_file: Union[Path, str]) -> None:
        """
        Dump ProfileReport to file

        Args:
            output_file: The path to write the serialized report to. Its suffix is
                replaced with (or, if absent, set to) ``.pp`` before writing.
        """
        if not isinstance(output_file, Path):
            output_file = Path(str(output_file))

        output_file = output_file.with_suffix(".pp")
        output_file.write_bytes(self.dumps())

    def load(
        self, load_file: Union[Path, str], trusted_source: bool = False
    ) -> Union["ProfileReport", "SerializeReport"]:
        """
        Load ProfileReport from file

        Args:
            load_file: The path to read the serialized report from. It is used
                as given; unlike ``dump``, no ``.pp`` suffix is applied.
            trusted_source: Whether the data comes from a trusted source.

        Raises:
            ValueError: if the file content cannot be loaded or the DataFrame
                does not match with the current ProfileReport (see ``loads``).

        Returns:
            self
        """
        if not isinstance(load_file, Path):
            load_file = Path(str(load_file))

        self.loads(load_file.read_bytes(), trusted_source=trusted_source)
        return self