16
16
import hashlib
import itertools
import logging
import math
import time

from typing import Any, Literal

from collections.abc import Callable, Iterable
21
22
30
31
CHARTS_COLORS ,
31
32
CHARTS_FONTS ,
32
33
EMPTY_BIN ,
34
+ MAX_TRIVARIATES ,
33
35
NA_BIN ,
34
36
MIN_RARE_CAT_PROTECTION ,
35
37
OTHER_BIN ,
@@ -58,9 +60,9 @@ def calculate_univariates(
58
60
"""
59
61
Calculates univariate accuracies for all target columns.
60
62
"""
61
- _LOG .info ("calculate univariates" )
62
-
63
+ t0 = time .time ()
63
64
tgt_cols = [c for c in ori_bin .columns if c .startswith (TGT_COLUMN )]
65
+
64
66
accuracies = pd .DataFrame ({"column" : tgt_cols })
65
67
with parallel_config ("loky" , n_jobs = min (16 , max (1 , cpu_count () - 1 ))):
66
68
results = Parallel ()(
@@ -71,6 +73,9 @@ def calculate_univariates(
71
73
for _ , row in accuracies .iterrows ()
72
74
)
73
75
accuracies ["accuracy" ], accuracies ["accuracy_max" ] = zip (* results )
76
+
77
+ _LOG .info (f"calculated univariates for { len (tgt_cols )} columns in { time .time () - t0 :.2f} seconds" )
78
+
74
79
return accuracies
75
80
76
81
@@ -87,7 +92,7 @@ def calculate_bivariates(
87
92
For each such column pair, value pair frequencies
88
93
are calculated both for training and synthetic data.
89
94
"""
90
- _LOG . info ( "calculate bivariates" )
95
+ t0 = time . time ( )
91
96
92
97
# the result for symmetric pairs is the same, so we only calculate one of them
93
98
# later, we append copy results for symmetric pairs
@@ -107,7 +112,6 @@ def calculate_bivariates(
107
112
else :
108
113
# enforce consistent columns
109
114
accuracies [["accuracy" , "accuracy_max" ]] = None
110
- # ensure required number of progress messages are sent
111
115
112
116
accuracies = pd .concat (
113
117
[
@@ -117,6 +121,8 @@ def calculate_bivariates(
117
121
axis = 0 ,
118
122
).reset_index (drop = True )
119
123
124
+ _LOG .info (f"calculated bivariate accuracies for { len (accuracies )} combinations in { time .time () - t0 :.2f} seconds" )
125
+
120
126
return accuracies
121
127
122
128
@@ -169,6 +175,49 @@ def calculate_bivariate_columns(ori_bin: pd.DataFrame, append_symetric: bool = T
169
175
return columns_df
170
176
171
177
178
def calculate_trivariates(ori_bin: pd.DataFrame, syn_bin: pd.DataFrame) -> pd.DataFrame:
    """
    Calculates trivariate accuracies.

    For each sampled target-column triple, compares value-combination
    frequencies between the original and the synthetic binned data and
    records an `accuracy` plus its theoretical `accuracy_max`.
    """
    start = time.time()

    # one row per column triple subject to trivariate analysis
    accuracies = calculate_trivariate_columns(ori_bin)

    if accuracies.empty:
        # no triples available; still enforce consistent result columns
        accuracies[["accuracy", "accuracy_max"]] = None
    else:
        worker_count = min(16, max(1, cpu_count() - 1))
        with parallel_config("loky", n_jobs=worker_count):
            jobs = (
                delayed(calculate_accuracy)(
                    ori_bin_cols=ori_bin[[row["col1"], row["col2"], row["col3"]]],
                    syn_bin_cols=syn_bin[[row["col1"], row["col2"], row["col3"]]],
                )
                for _, row in accuracies.iterrows()
            )
            results = Parallel()(jobs)
        accuracies["accuracy"], accuracies["accuracy_max"] = zip(*results)

    _LOG.info(f"calculated trivariate accuracies for {len(accuracies)} combinations in {time.time() - start:.2f} seconds")

    return accuracies
205
+
206
def calculate_trivariate_columns(ori_bin: pd.DataFrame) -> pd.DataFrame:
    """
    Creates DataFrame with all column-triples subject to trivariate analysis.

    Returns a DataFrame with columns `col1`, `col2`, `col3` holding a random
    subset of at most MAX_TRIVARIATES lexicographically ordered target-column
    triples (col1 < col2 < col3).
    """
    tgt_cols = [c for c in ori_bin.columns if c.startswith(TGT_COLUMN_PREFIX)]
    # enumerate ordered triples directly instead of materializing the full
    # n^3 cross product and filtering it down; combinations of the sorted
    # column names yields exactly the triples with col1 < col2 < col3
    triples = itertools.combinations(sorted(tgt_cols), 3)
    columns_df = pd.DataFrame(triples, columns=["col1", "col2", "col3"])
    # shuffle, then cap the number of triples to bound the analysis cost
    columns_df = columns_df.sample(frac=1).head(n=MAX_TRIVARIATES)
    return columns_df
220
+
172
221
def calculate_expected_l1_multinomial (probs : list [float ], n_1 : int , n_2 : int ) -> np .float64 :
173
222
"""
174
223
Calculate expected L1 distance for two multinomial samples of size `n_1` and `n_2` that follow `probs`.
@@ -349,7 +398,7 @@ def calculate_bin_counts(
349
398
"""
350
399
Calculates counts of unique values in each bin.
351
400
"""
352
- _LOG . info ( "calculate bin counts" )
401
+ t0 = time . time ( )
353
402
with parallel_config ("loky" , n_jobs = min (16 , max (1 , cpu_count () - 1 ))):
354
403
results = Parallel ()(
355
404
delayed (bin_count_uni )(
@@ -359,8 +408,10 @@ def calculate_bin_counts(
359
408
for col , values in binned .items ()
360
409
)
361
410
bin_cnts_uni = dict (results )
411
+ _LOG .info (f"calculated univariate bin counts for { len (binned .columns )} columns in { time .time () - t0 :.2f} seconds" )
362
412
363
- biv_cols = calculate_bivariate_columns (binned )
413
+ t0 = time .time ()
414
+ biv_cols = calculate_bivariate_columns (binned , append_symetric = True )
364
415
with parallel_config ("loky" , n_jobs = min (16 , max (1 , cpu_count () - 1 ))):
365
416
results = Parallel ()(
366
417
delayed (bin_count_biv )(
@@ -372,6 +423,7 @@ def calculate_bin_counts(
372
423
for _ , row in biv_cols .iterrows ()
373
424
)
374
425
bin_cnts_biv = dict (results )
426
+ _LOG .info (f"calculated bivariate bin counts for { len (biv_cols )} combinations in { time .time () - t0 :.2f} seconds" )
375
427
376
428
return bin_cnts_uni , bin_cnts_biv
377
429
0 commit comments