Skip to content

Commit 22b6923

Browse files
committed
Refactor histogram bin calculation to respect max_bins
1 parent 628d400 commit 22b6923

2 files changed

Lines changed: 33 additions & 82 deletions

File tree

examples/type_schema.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
2+
3+

src/data_profiling/model/summary_algorithms.py

Lines changed: 30 additions & 82 deletions
Original file line numberDiff line numberDiff line change
@@ -13,9 +13,7 @@
1313

1414
def func_nullable_series_contains(fn: Callable) -> Callable:
1515
@functools.wraps(fn)
16-
def inner(
17-
config: Settings, series: pd.Series, state: dict, *args, **kwargs
18-
) -> bool:
16+
def inner(config: Settings, series: pd.Series, state: dict, *args, **kwargs) -> bool:
1917
if series.hasnans:
2018
series = series.dropna()
2119
if series.empty:
@@ -32,18 +30,12 @@ def safe_histogram(
3230
weights: Optional[np.ndarray] = None,
3331
density: bool = False,
3432
) -> Tuple[np.ndarray, np.ndarray]:
35-
"""
36-
Wrapper to avoid
37-
ValueError: Too many bins for data range. Cannot create N finite-sized bins.
38-
"""
3933
try:
4034
return np.histogram(values, bins=bins, weights=weights, density=density)
4135
except ValueError as exc:
4236
if "Too many bins for data range" in str(exc):
4337
try:
44-
return np.histogram(
45-
values, bins="auto", weights=weights, density=density
46-
)
38+
return np.histogram(values, bins="auto", weights=weights, density=density)
4739
except ValueError:
4840
finite = values[np.isfinite(values)]
4941
if finite.size == 0:
@@ -55,9 +47,7 @@ def safe_histogram(
5547
bin_edges = np.array([vmin - eps, vmin + eps])
5648
else:
5749
bin_edges = np.array([vmin, vmax])
58-
return np.histogram(
59-
values, bins=bin_edges, weights=weights, density=density
60-
)
50+
return np.histogram(values, bins=bin_edges, weights=weights, density=density)
6151
raise
6252

6353

@@ -69,18 +59,20 @@ def histogram_compute(
6959
weights: Optional[np.ndarray] = None,
7060
) -> dict:
7161
stats = {}
62+
7263
if len(finite_values) == 0:
7364
return {name: []}
7465

7566
hist_config = config.plot.histogram
7667

77-
# Compute data range
7868
finite = finite_values[np.isfinite(finite_values)]
69+
if len(finite) == 0:
70+
return {name: []}
71+
7972
vmin = float(np.min(finite))
8073
vmax = float(np.max(finite))
8174
data_range = vmax - vmin
8275

83-
# Choose of Bins based on observed data values
8476
if data_range == 0:
8577
eps = 0.5 if vmin == 0 else abs(vmin) * 0.1
8678
bins = np.array([vmin - eps, vmin + eps])
@@ -89,14 +81,13 @@ def histogram_compute(
8981

9082
if isinstance(requested_bins, int):
9183
safe_bins = min(requested_bins, n_unique, hist_config.max_bins)
92-
9384
safe_bins = max(1, safe_bins)
94-
9585
bins = np.linspace(vmin, vmax, safe_bins + 1)
9686
else:
97-
bins = np.histogram_bin_edges(finite_values, bins="auto")
98-
if len(bins) - 1 > hist_config.max_bins:
99-
bins = np.linspace(vmin, vmax, hist_config.max_bins + 1)
87+
bins = np.histogram_bin_edges(
88+
finite_values,
89+
bins=min(len(finite_values), hist_config.max_bins),
90+
)
10091

10192
hist = np.histogram(
10293
finite_values,
@@ -113,16 +104,13 @@ def chi_square(
113104
values: Optional[np.ndarray] = None,
114105
histogram: Optional[np.ndarray] = None,
115106
) -> dict:
116-
# Case 1: histogram not passed → we compute it
117107
if histogram is None:
118108
if values is None:
119109
return {"statistic": 0, "pvalue": 0}
120110

121-
# Try NumPy "auto" binning (may fail under NumPy 2)
122111
try:
123112
bins = np.histogram_bin_edges(values, bins="auto")
124113
except ValueError:
125-
# Fallback: basic 1-bin histogram covering the min→max range
126114
finite = values[np.isfinite(values)]
127115
if finite.size == 0:
128116
return {"statistic": 0, "pvalue": 0}
@@ -136,141 +124,101 @@ def chi_square(
136124

137125
histogram, _ = np.histogram(values, bins=bins)
138126

139-
# Case 2: histogram exists but is empty
140127
if histogram.size == 0 or histogram.sum() == 0:
141128
return {"statistic": 0, "pvalue": 0}
142129

143130
return dict(chisquare(histogram)._asdict())
144131

145132

146-
def series_hashable(
147-
fn: Callable[[Settings, pd.Series, dict], Tuple[Settings, pd.Series, dict]]
148-
) -> Callable[[Settings, pd.Series, dict], Tuple[Settings, pd.Series, dict]]:
133+
def series_hashable(fn):
149134
@functools.wraps(fn)
150-
def inner(
151-
config: Settings, series: pd.Series, summary: dict
152-
) -> Tuple[Settings, pd.Series, dict]:
135+
def inner(config: Settings, series: pd.Series, summary: dict):
153136
if not summary["hashable"]:
154137
return config, series, summary
155138
return fn(config, series, summary)
156139

157140
return inner
158141

159142

160-
def series_handle_nulls(
161-
fn: Callable[[Settings, pd.Series, dict], Tuple[Settings, pd.Series, dict]]
162-
) -> Callable[[Settings, pd.Series, dict], Tuple[Settings, pd.Series, dict]]:
163-
"""Decorator for nullable series"""
164-
143+
def series_handle_nulls(fn):
165144
@functools.wraps(fn)
166-
def inner(
167-
config: Settings, series: pd.Series, summary: dict
168-
) -> Tuple[Settings, pd.Series, dict]:
145+
def inner(config: Settings, series: pd.Series, summary: dict):
169146
if series.hasnans:
170147
series = series.dropna()
171-
172148
return fn(config, series, summary)
173149

174150
return inner
175151

176152

177153
def named_aggregate_summary(series: pd.Series, key: str) -> dict:
178-
summary = {
154+
return {
179155
f"max_{key}": np.max(series),
180156
f"mean_{key}": np.mean(series),
181157
f"median_{key}": np.median(series),
182158
f"min_{key}": np.min(series),
183159
}
184160

185-
return summary
186-
187161

188162
@multimethod
189-
def describe_counts(
190-
config: Settings, series: Any, summary: dict
191-
) -> Tuple[Settings, Any, dict]:
163+
def describe_counts(config: Settings, series: Any, summary: dict):
192164
raise NotImplementedError()
193165

194166

195167
@multimethod
196-
def describe_supported(
197-
config: Settings, series: Any, series_description: dict
198-
) -> Tuple[Settings, Any, dict]:
168+
def describe_supported(config: Settings, series: Any, summary: dict):
199169
raise NotImplementedError()
200170

201171

202172
@multimethod
203-
def describe_generic(
204-
config: Settings, series: Any, summary: dict
205-
) -> Tuple[Settings, Any, dict]:
173+
def describe_generic(config: Settings, series: Any, summary: dict):
206174
raise NotImplementedError()
207175

208176

209177
@multimethod
210-
def describe_numeric_1d(
211-
config: Settings, series: Any, summary: dict
212-
) -> Tuple[Settings, Any, dict]:
178+
def describe_numeric_1d(config: Settings, series: Any, summary: dict):
213179
raise NotImplementedError()
214180

215181

216182
@multimethod
217-
def describe_text_1d(
218-
config: Settings, series: Any, summary: dict
219-
) -> Tuple[Settings, Any, dict, Any]:
183+
def describe_text_1d(config: Settings, series: Any, summary: dict):
220184
raise NotImplementedError()
221185

222186

223187
@multimethod
224-
def describe_date_1d(
225-
config: Settings, series: Any, summary: dict
226-
) -> Tuple[Settings, Any, dict]:
188+
def describe_date_1d(config: Settings, series: Any, summary: dict):
227189
raise NotImplementedError()
228190

229191

230192
@multimethod
231-
def describe_categorical_1d(
232-
config: Settings, series: pd.Series, summary: dict
233-
) -> Tuple[Settings, pd.Series, dict]:
193+
def describe_categorical_1d(config: Settings, series: pd.Series, summary: dict):
234194
raise NotImplementedError()
235195

236196

237197
@multimethod
238-
def describe_url_1d(
239-
config: Settings, series: Any, summary: dict
240-
) -> Tuple[Settings, Any, dict]:
198+
def describe_url_1d(config: Settings, series: Any, summary: dict):
241199
raise NotImplementedError()
242200

243201

244202
@multimethod
245-
def describe_file_1d(
246-
config: Settings, series: Any, summary: dict
247-
) -> Tuple[Settings, Any, dict]:
203+
def describe_file_1d(config: Settings, series: Any, summary: dict):
248204
raise NotImplementedError()
249205

250206

251207
@multimethod
252-
def describe_path_1d(
253-
config: Settings, series: Any, summary: dict
254-
) -> Tuple[Settings, Any, dict]:
208+
def describe_path_1d(config: Settings, series: Any, summary: dict):
255209
raise NotImplementedError()
256210

257211

258212
@multimethod
259-
def describe_image_1d(
260-
config: Settings, series: Any, summary: dict
261-
) -> Tuple[Settings, Any, dict]:
213+
def describe_image_1d(config: Settings, series: Any, summary: dict):
262214
raise NotImplementedError()
263215

264216

265217
@multimethod
266-
def describe_boolean_1d(
267-
config: Settings, series: Any, summary: dict
268-
) -> Tuple[Settings, Any, dict]:
218+
def describe_boolean_1d(config: Settings, series: Any, summary: dict):
269219
raise NotImplementedError()
270220

271221

272222
@multimethod
273-
def describe_timeseries_1d(
274-
config: Settings, series: Any, summary: dict
275-
) -> Tuple[Settings, Any, dict]:
276-
raise NotImplementedError()
223+
def describe_timeseries_1d(config: Settings, series: Any, summary: dict):
224+
raise NotImplementedError()

0 commit comments

Comments
 (0)