Skip to content

Commit 70f02cc

Browse files
committed
I added the feature importance function
1 parent 5027142 commit 70f02cc

File tree

6 files changed

+130
-2
lines changed

6 files changed

+130
-2
lines changed

.idea/eis_toolkit.iml

100755 → 100644
+1-1
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

.idea/misc.xml

100755 → 100644
+4-1
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

eis_toolkit/exceptions.py

+8
Original file line number | Diff line number | Diff line change
@@ -76,3 +76,11 @@ class NonSquarePixelSizeException(Exception):
7676

7777
class NumericValueSignException(Exception):
    """Exception raised when a numeric value has a problem related to its sign."""
79+
80+
81+
class InvalidModelException(Exception):
    """Exception raised when a model is invalid or missing (``None``)."""
83+
84+
85+
class InvalidDatasetException(Exception):
    """Exception raised when a required dataset is missing (``None``)."""

eis_toolkit/feature_importance/__init__.py

Whitespace-only changes.
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,48 @@
1+
import numpy as np
2+
import pandas
3+
import pandas as pd
4+
import sklearn.neural_network
5+
from sklearn.inspection import permutation_importance
6+
7+
from eis_toolkit.exceptions import InvalidDatasetException
8+
9+
10+
def evaluate_feature_importance(
    clf: "sklearn.base.BaseEstimator",
    x_test: np.ndarray,
    y_test: np.ndarray,
    feature_names: list[str],
    number_of_repetition: int = 50,
    random_state: int = 0,
) -> tuple[pd.DataFrame, dict]:
    """
    Evaluate the permutation feature importance of a fitted sklearn estimator.

    Parameters:
        clf (sklearn estimator): Trained model, e.g. a neural-network or linear model.
        x_test (np.ndarray): Testing feature data. It should be preprocessed (e.g. standardized)
            the same way as the data used to fit `clf`.
        y_test (np.ndarray): Testing target data. It is flattened to 1-D before scoring.
        feature_names (list): Names of the feature columns, one per column of `x_test`.
        number_of_repetition (int): Number of times each feature is permuted (default 50).
        random_state (int): Random state for repeatability of results (default 0).
    Return:
        feature_importance (pd.DataFrame): Columns "Feature" and "Importance", where Importance is
            the mean permutation importance multiplied by 100, sorted in descending order.
        result (dict-like): The object returned by `sklearn.inspection.permutation_importance`
            (attributes `importances_mean`, `importances_std` and `importances`).
    Raise:
        InvalidDatasetException: When `x_test` or `y_test` is None.
        InvalidModelException: When `clf` is None.
        ValueError: When the number of feature names differs from the number of columns of `x_test`.
    """
    if x_test is None or y_test is None:
        raise InvalidDatasetException("x_test and y_test must not be None.")

    if clf is None:
        # Imported here so this block does not depend on a change to the module-level imports.
        from eis_toolkit.exceptions import InvalidModelException

        raise InvalidModelException("clf must be a fitted estimator, not None.")

    # Fail fast: without this check a mismatch only surfaces as a pandas error
    # after the (potentially slow) permutation run has already finished.
    if np.ndim(x_test) == 2 and len(feature_names) != np.shape(x_test)[1]:
        raise ValueError(
            f"feature_names has {len(feature_names)} entries but x_test has {np.shape(x_test)[1]} columns."
        )

    result = permutation_importance(
        clf,
        x_test,
        # asarray() lets array-likes (lists, Series, column vectors) through as well.
        np.asarray(y_test).ravel(),
        n_repeats=number_of_repetition,
        random_state=random_state,
    )

    # Importance is reported scaled by 100, most important feature first.
    feature_importance = pd.DataFrame(
        {"Feature": feature_names, "Importance": result.importances_mean * 100}
    ).sort_values(by="Importance", ascending=False)

    return feature_importance, result
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,69 @@
1+
import matplotlib.pyplot as plt
2+
import pandas as pd
3+
from sklearn.model_selection import train_test_split
4+
from sklearn.neural_network import MLPClassifier
5+
from sklearn.preprocessing import StandardScaler
6+
7+
from eis_toolkit.feature_importance.feature_importance import evaluate_feature_importance
8+
9+
# Paths of the CSV files holding the feature matrix (X) and the labels (y).
data_to_load = "PUT PATH TO X"
label_to_load = "PUT PATH TO Y"


def main() -> None:
    """
    Train an MLP classifier and report its permutation feature importance.

    Loads the feature and label CSV files, trains an MLPClassifier on a 75/25 split, prints the
    feature-importance table and saves a horizontal bar chart of it to "testing.png".
    """
    feature_names = [
        "Mag_TMI",
        "Mag_AS",
        "DRC135",
        "DRC180",
        "DRC45",
        "DRC90",
        "Mag_TD",
        "HDTDR",
        "Mag_Xdrv",
        "mag_Ydrv",
        "Mag_Zdrv",
        "Pseu_Grv",
        "Rd_U",
        "Rd_TC",
        "Rd_Th",
        "Rd_K",
        "EM_ratio",
        "EM_Ap_rs",
        "Em_Qd",
        "EM_Inph",
    ]

    # Load the data.
    X = pd.read_csv(data_to_load).to_numpy()
    y = pd.read_csv(label_to_load).to_numpy()

    # Split first (seeded, so the example is reproducible) ...
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

    # ... then fit the scaler on the training part only, so that no statistics
    # of the test set leak into training.
    scaler = StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    # Train the MLP classifier.
    clf = MLPClassifier(solver="adam", alpha=0.001, hidden_layer_sizes=(16, 2), random_state=1)
    clf.fit(X_train, y_train.ravel())

    # Evaluate the feature importance on the held-out data.
    evaluated_feature_importance, permutation_result = evaluate_feature_importance(
        clf=clf, x_test=X_test, y_test=y_test, feature_names=feature_names, number_of_repetition=50, random_state=0
    )

    print(evaluated_feature_importance)

    # Bar chart of the mean importances (scaled by 100), smallest at the bottom.
    imp = pd.Series(permutation_result.importances_mean * 100, index=feature_names).sort_values(ascending=True)
    ax = imp.plot.barh()
    ax.set_title("MLP Permutation Importance")
    ax.set_xlabel("Importance (%)")
    ax.set_ylabel("Feature")
    ax.grid(axis="x", linestyle="--", alpha=0.6)
    for i, v in enumerate(imp):
        ax.text(v, i, f"{v:.1f}", color="blue", fontweight="bold", fontsize=8)
    # Lay out only after all labels exist, otherwise they may be clipped.
    ax.figure.tight_layout()
    plt.savefig("testing.png")


if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)