Skip to content

Commit 5b319c8

Browse files
lu-ohai and mrDzurb authored
Fixed ads dataset key error (#1152)
Co-authored-by: Dmitrii Cherkasov <[email protected]>
1 parent e748460 commit 5b319c8

File tree

1 file changed

+32
-34
lines changed

1 file changed

+32
-34
lines changed

ads/type_discovery/typed_feature.py

+32-34
Original file line number | Diff line number | Diff line change
@@ -1,30 +1,27 @@
11
#!/usr/bin/env python
2-
# -*- coding: utf-8; -*-
32

4-
# Copyright (c) 2020, 2022 Oracle and/or its affiliates.
3+
# Copyright (c) 2020, 2025 Oracle and/or its affiliates.
54
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/
65

76

8-
from __future__ import print_function, absolute_import
9-
7+
import copy
108
import json
119
import re
12-
import copy
1310
from collections import defaultdict
1411
from time import time
1512

1613
import numpy as np
1714
import pandas as pd
18-
from sklearn.utils import Bunch
1915
from sklearn.feature_extraction.text import CountVectorizer
16+
from sklearn.utils import Bunch
2017

21-
from ads.common import utils, logger
18+
from ads.common import utils
2219
from ads.common.card_identifier import card_identify
23-
from ads.common.utils import JsonConverter
2420
from ads.common.decorator.runtime_dependency import (
25-
runtime_dependency,
2621
OptionalDependency,
22+
runtime_dependency,
2723
)
24+
from ads.common.utils import JsonConverter
2825

2926

3027
class TypedFeature(Bunch):
@@ -97,7 +94,7 @@ def build(name, series):
9794
x_min, x_max = np.nanmin(nulls_removed), np.nanmax(nulls_removed)
9895

9996
stats = {
100-
"unique percentage": 100 * desc["unique"] / desc["count"],
97+
"unique percentage": 100 * series.nunique() / series.size,
10198
"x_min": x_min,
10299
"x_max": x_max,
103100
"mode": series.mode().iloc[0],
@@ -113,13 +110,13 @@ def build(name, series):
113110
"stats": stats,
114111
"internal": {
115112
"sample": series.head(5),
116-
"unique": desc["unique"],
113+
"unique": series.nunique(),
117114
"counts": utils.truncate_series_top_n(
118115
value_counts, n=min(16, len(value_counts))
119116
),
120-
"high_cardinality": bool(desc["unique"] > 30),
117+
"high_cardinality": bool(series.nunique() > 30),
121118
"very_high_cardinality": bool(
122-
desc["unique"] >= 0.95 * desc["count"]
119+
series.nunique() >= 0.95 * series.size
123120
),
124121
},
125122
},
@@ -134,13 +131,14 @@ def __init__(self, name, meta_data):
134131
def build(name, series):
135132
desc = series.astype("category").loc[~series.isnull()].describe(include="all")
136133
value_counts = series.value_counts(ascending=False)
137-
if isinstance(desc["top"], str):
138-
mode = desc["top"] if len(desc["top"]) < 30 else desc["top"][:24] + "..."
134+
top = desc["top"] if "top" in desc else None
135+
if isinstance(top, str):
136+
mode = top if len(top) < 30 else top[:24] + "..."
139137
else:
140-
mode = desc["top"]
138+
mode = top
141139

142140
stats = {
143-
"unique percentage": 100 * desc["unique"] / desc["count"],
141+
"unique percentage": 100 * series.nunique() / series.size,
144142
"mode": mode,
145143
}
146144
stats.update({k: v for k, v in desc.items()})
@@ -154,13 +152,13 @@ def build(name, series):
154152
"stats": stats,
155153
"internal": {
156154
"sample": series.sample(n=min(100, series.size)),
157-
"unique": desc["unique"],
155+
"unique": series.nunique(),
158156
"counts": utils.truncate_series_top_n(
159157
value_counts, n=min(16, len(value_counts))
160158
),
161-
"high_cardinality": bool(desc["unique"] > 30),
159+
"high_cardinality": bool(series.nunique() > 30),
162160
"very_high_cardinality": bool(
163-
desc["unique"] >= 0.95 * desc["count"]
161+
series.nunique() >= 0.95 * series.size
164162
),
165163
},
166164
},
@@ -185,15 +183,15 @@ def build(name, series):
185183
"missing_percentage": 100 * series.isna().sum() / series.size,
186184
"low_level_type": series.dtype.name,
187185
"stats": {
188-
"unique percentage": 100 * desc["unique"] / desc["count"],
186+
"unique percentage": 100 * series.nunique() / series.size,
189187
"mode": series.mode().iloc[0],
190188
},
191189
"internal": {
192190
"sample": nulls_removed.sample(n=min(100, nulls_removed.size)),
193191
"counts": utils.truncate_series_top_n(
194192
value_counts, n=min(16, len(value_counts))
195193
),
196-
"unique": desc["unique"],
194+
"unique": series.nunique(),
197195
},
198196
},
199197
)
@@ -224,15 +222,15 @@ def build(name, series):
224222
"missing_percentage": 100 * series.isna().sum() / series.size,
225223
"low_level_type": series.dtype.name,
226224
"stats": {
227-
"unique percentage": 100 * desc["unique"] / desc["count"],
225+
"unique percentage": 100 * series.nunique() / series.size,
228226
"mode": series.mode().iloc[0],
229227
},
230228
"internal": {
231229
"sample": series.sample(n=min(100, series.size)),
232230
"counts": utils.truncate_series_top_n(
233231
value_counts, n=min(16, len(value_counts))
234232
),
235-
"unique": desc["unique"],
233+
"unique": series.nunique(),
236234
},
237235
},
238236
)
@@ -254,10 +252,10 @@ def build(name, series, samples):
254252
"low_level_type": series.dtype.name,
255253
"stats": {
256254
"observations": desc["count"],
257-
"unique percentage": 100 * desc["unique"] / desc["count"]
255+
"unique percentage": 100 * series.nunique() / series.size,
258256
# TODO mid point
259257
},
260-
"internal": {"sample": samples, "unique": desc["unique"]},
258+
"internal": {"sample": samples, "unique": series.nunique()},
261259
},
262260
)
263261

@@ -556,13 +554,13 @@ def build(name, series):
556554
"missing_percentage": 100 * series.isna().sum() / series.size,
557555
"low_level_type": series.dtype.name,
558556
"stats": {
559-
"unique percentage": 100 * desc["unique"] / desc["count"],
560-
"mode": desc["top"],
557+
"unique percentage": 100 * series.nunique() / series.size,
558+
"mode": desc["top"] if "top" in desc else None,
561559
},
562560
"internal": {
563561
"sample": series.sample(n=min(100, series.size)),
564562
"counts": dict(d_scheme),
565-
"unique": desc["unique"],
563+
"unique": series.nunique(),
566564
},
567565
},
568566
)
@@ -583,14 +581,14 @@ def build(name, series):
583581
"missing_percentage": 100 * series.isna().sum() / series.size,
584582
"low_level_type": series.dtype.name,
585583
"stats": {
586-
"unique percentage": 100 * desc["unique"] / desc["count"],
587-
"first": desc["first"],
588-
"last": desc["last"],
589-
"mode": desc["top"],
584+
"unique percentage": 100 * series.nunique() / series.size,
585+
"first": desc["first"] if "first" in desc else None,
586+
"last": desc["last"] if "last" in desc else None,
587+
"mode": desc["top"] if "top" in desc else None,
590588
},
591589
"internal": {
592590
"sample": series.sample(n=min(100, series.size)),
593-
"unique": desc["unique"],
591+
"unique": series.nunique(),
594592
},
595593
},
596594
)

0 commit comments

Comments
 (0)