1
1
#!/usr/bin/env python
2
- # -*- coding: utf-8; -*-
3
2
4
- # Copyright (c) 2020, 2022 Oracle and/or its affiliates.
3
+ # Copyright (c) 2020, 2025 Oracle and/or its affiliates.
5
4
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/
6
5
7
6
8
- from __future__ import print_function , absolute_import
9
-
7
+ import copy
10
8
import json
11
9
import re
12
- import copy
13
10
from collections import defaultdict
14
11
from time import time
15
12
16
13
import numpy as np
17
14
import pandas as pd
18
- from sklearn .utils import Bunch
19
15
from sklearn .feature_extraction .text import CountVectorizer
16
+ from sklearn .utils import Bunch
20
17
21
- from ads .common import utils , logger
18
+ from ads .common import utils
22
19
from ads .common .card_identifier import card_identify
23
- from ads .common .utils import JsonConverter
24
20
from ads .common .decorator .runtime_dependency import (
25
- runtime_dependency ,
26
21
OptionalDependency ,
22
+ runtime_dependency ,
27
23
)
24
+ from ads .common .utils import JsonConverter
28
25
29
26
30
27
class TypedFeature (Bunch ):
@@ -97,7 +94,7 @@ def build(name, series):
97
94
x_min , x_max = np .nanmin (nulls_removed ), np .nanmax (nulls_removed )
98
95
99
96
stats = {
100
- "unique percentage" : 100 * desc [ "unique" ] / desc [ "count" ] ,
97
+ "unique percentage" : 100 * series . nunique () / series . size ,
101
98
"x_min" : x_min ,
102
99
"x_max" : x_max ,
103
100
"mode" : series .mode ().iloc [0 ],
@@ -113,13 +110,13 @@ def build(name, series):
113
110
"stats" : stats ,
114
111
"internal" : {
115
112
"sample" : series .head (5 ),
116
- "unique" : desc [ "unique" ] ,
113
+ "unique" : series . nunique () ,
117
114
"counts" : utils .truncate_series_top_n (
118
115
value_counts , n = min (16 , len (value_counts ))
119
116
),
120
- "high_cardinality" : bool (desc [ "unique" ] > 30 ),
117
+ "high_cardinality" : bool (series . nunique () > 30 ),
121
118
"very_high_cardinality" : bool (
122
- desc [ "unique" ] >= 0.95 * desc [ "count" ]
119
+ series . nunique () >= 0.95 * series . size
123
120
),
124
121
},
125
122
},
@@ -134,13 +131,14 @@ def __init__(self, name, meta_data):
134
131
def build (name , series ):
135
132
desc = series .astype ("category" ).loc [~ series .isnull ()].describe (include = "all" )
136
133
value_counts = series .value_counts (ascending = False )
137
- if isinstance (desc ["top" ], str ):
138
- mode = desc ["top" ] if len (desc ["top" ]) < 30 else desc ["top" ][:24 ] + "..."
134
+ top = desc ["top" ] if "top" in desc else None
135
+ if isinstance (top , str ):
136
+ mode = top if len (top ) < 30 else top [:24 ] + "..."
139
137
else :
140
- mode = desc [ " top" ]
138
+ mode = top
141
139
142
140
stats = {
143
- "unique percentage" : 100 * desc [ "unique" ] / desc [ "count" ] ,
141
+ "unique percentage" : 100 * series . nunique () / series . size ,
144
142
"mode" : mode ,
145
143
}
146
144
stats .update ({k : v for k , v in desc .items ()})
@@ -154,13 +152,13 @@ def build(name, series):
154
152
"stats" : stats ,
155
153
"internal" : {
156
154
"sample" : series .sample (n = min (100 , series .size )),
157
- "unique" : desc [ "unique" ] ,
155
+ "unique" : series . nunique () ,
158
156
"counts" : utils .truncate_series_top_n (
159
157
value_counts , n = min (16 , len (value_counts ))
160
158
),
161
- "high_cardinality" : bool (desc [ "unique" ] > 30 ),
159
+ "high_cardinality" : bool (series . nunique () > 30 ),
162
160
"very_high_cardinality" : bool (
163
- desc [ "unique" ] >= 0.95 * desc [ "count" ]
161
+ series . nunique () >= 0.95 * series . size
164
162
),
165
163
},
166
164
},
@@ -185,15 +183,15 @@ def build(name, series):
185
183
"missing_percentage" : 100 * series .isna ().sum () / series .size ,
186
184
"low_level_type" : series .dtype .name ,
187
185
"stats" : {
188
- "unique percentage" : 100 * desc [ "unique" ] / desc [ "count" ] ,
186
+ "unique percentage" : 100 * series . nunique () / series . size ,
189
187
"mode" : series .mode ().iloc [0 ],
190
188
},
191
189
"internal" : {
192
190
"sample" : nulls_removed .sample (n = min (100 , nulls_removed .size )),
193
191
"counts" : utils .truncate_series_top_n (
194
192
value_counts , n = min (16 , len (value_counts ))
195
193
),
196
- "unique" : desc [ "unique" ] ,
194
+ "unique" : series . nunique () ,
197
195
},
198
196
},
199
197
)
@@ -224,15 +222,15 @@ def build(name, series):
224
222
"missing_percentage" : 100 * series .isna ().sum () / series .size ,
225
223
"low_level_type" : series .dtype .name ,
226
224
"stats" : {
227
- "unique percentage" : 100 * desc [ "unique" ] / desc [ "count" ] ,
225
+ "unique percentage" : 100 * series . nunique () / series . size ,
228
226
"mode" : series .mode ().iloc [0 ],
229
227
},
230
228
"internal" : {
231
229
"sample" : series .sample (n = min (100 , series .size )),
232
230
"counts" : utils .truncate_series_top_n (
233
231
value_counts , n = min (16 , len (value_counts ))
234
232
),
235
- "unique" : desc [ "unique" ] ,
233
+ "unique" : series . nunique () ,
236
234
},
237
235
},
238
236
)
@@ -254,10 +252,10 @@ def build(name, series, samples):
254
252
"low_level_type" : series .dtype .name ,
255
253
"stats" : {
256
254
"observations" : desc ["count" ],
257
- "unique percentage" : 100 * desc [ "unique" ] / desc [ "count" ]
255
+ "unique percentage" : 100 * series . nunique () / series . size ,
258
256
# TODO mid point
259
257
},
260
- "internal" : {"sample" : samples , "unique" : desc [ "unique" ] },
258
+ "internal" : {"sample" : samples , "unique" : series . nunique () },
261
259
},
262
260
)
263
261
@@ -556,13 +554,13 @@ def build(name, series):
556
554
"missing_percentage" : 100 * series .isna ().sum () / series .size ,
557
555
"low_level_type" : series .dtype .name ,
558
556
"stats" : {
559
- "unique percentage" : 100 * desc [ "unique" ] / desc [ "count" ] ,
560
- "mode" : desc ["top" ],
557
+ "unique percentage" : 100 * series . nunique () / series . size ,
558
+ "mode" : desc ["top" ] if "top" in desc else None ,
561
559
},
562
560
"internal" : {
563
561
"sample" : series .sample (n = min (100 , series .size )),
564
562
"counts" : dict (d_scheme ),
565
- "unique" : desc [ "unique" ] ,
563
+ "unique" : series . nunique () ,
566
564
},
567
565
},
568
566
)
@@ -583,14 +581,14 @@ def build(name, series):
583
581
"missing_percentage" : 100 * series .isna ().sum () / series .size ,
584
582
"low_level_type" : series .dtype .name ,
585
583
"stats" : {
586
- "unique percentage" : 100 * desc [ "unique" ] / desc [ "count" ] ,
587
- "first" : desc ["first" ],
588
- "last" : desc ["last" ],
589
- "mode" : desc ["top" ],
584
+ "unique percentage" : 100 * series . nunique () / series . size ,
585
+ "first" : desc ["first" ] if "first" in desc else None ,
586
+ "last" : desc ["last" ] if "last" in desc else None ,
587
+ "mode" : desc ["top" ] if "top" in desc else None ,
590
588
},
591
589
"internal" : {
592
590
"sample" : series .sample (n = min (100 , series .size )),
593
- "unique" : desc [ "unique" ] ,
591
+ "unique" : series . nunique () ,
594
592
},
595
593
},
596
594
)
0 commit comments