Skip to content

Commit 3277904

Browse files
committed
Implemented offset and predict in anomaly
1 parent dbd4d98 commit 3277904

2 files changed

Lines changed: 42 additions & 64 deletions

File tree

distclassipy/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@
2828
)
2929
from .distances import _ALL_METRICS, _UNIQUE_METRICS
3030

31-
__version__ = "0.2.2a3"
31+
__version__ = "0.2.2a4"
3232

3333
__all__ = [
3434
"DistanceMetricClassifier",

distclassipy/anomaly.py

Lines changed: 41 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -109,8 +109,9 @@ def fit(self, X: np.ndarray, y: np.ndarray) -> "DistanceAnomaly":
109109
self.metrics_ = self.metrics
110110

111111
# Calculate anomaly threshold based on train scores
112-
# train_scores = self.decision_function(X)
113-
# self.offset_ = np.quantile(train_scores, 1.0 - self.contamination)
112+
train_scores = self.decision_function(X)
113+
114+
self.offset_ = np.quantile(train_scores, 1.0 - self.contamination)
114115

115116
return self
116117

@@ -135,11 +136,10 @@ def decision_function(self, X: np.ndarray) -> np.ndarray:
135136
metric_scores = []
136137

137138
for metric in self.metrics_:
138-
# Get dataframe for distances to all centroids from dcpy
139139
self.clf_.predict_and_analyse(X, metric=metric)
140140
dist_df = self.clf_.centroid_dist_df_
141141

142-
# 1. Aggregate distances across clusters the current metric
142+
# Aggregate distances across clusters for the current metric
143143
if self.cluster_agg == "min":
144144
score_for_metric = dist_df.min(axis=1).values
145145
elif self.cluster_agg == "median":
@@ -153,32 +153,29 @@ def decision_function(self, X: np.ndarray) -> np.ndarray:
153153

154154
metric_scores_arr = np.array(metric_scores).T # shape (n_samples, n_metrics)
155155
# remove infinities
156-
metric_scores_arr[metric_scores_arr == np.inf] = 1e9 # A large number
157-
metric_scores_arr[metric_scores_arr == -np.inf] = -1e9 # A large negative number
158-
156+
metric_scores_arr[metric_scores_arr == np.inf] = 1e9 # A large number
157+
metric_scores_arr[metric_scores_arr == -np.inf] = (
158+
-1e9
159+
) # A large negative number
159160

160161
if self.normalize_scores:
161-
# Scale scores for each metric (column) to be between 0 and 1
162-
# Compare with Rio notebook once.
162+
col_means = np.nanmean(metric_scores_arr, axis=0)
163+
inds = np.where(np.isnan(metric_scores_arr))
164+
metric_scores_arr[inds] = np.take(col_means, inds[1])
163165
metric_scores_arr = minmax_scale(metric_scores_arr, axis=0)
164-
165-
# 2. Aggregate scores across all metrics for final anomaly score
166+
167+
# Aggregate scores across all metrics for final anomaly score
166168
if self.metric_agg == "median":
167-
scores = np.median(metric_scores_arr, axis=1)
169+
scores = np.nanmedian(metric_scores_arr, axis=1)
168170
elif self.metric_agg == "mean":
169-
scores = np.mean(metric_scores_arr, axis=1)
171+
scores = np.nanmean(metric_scores_arr, axis=1)
170172
elif self.metric_agg == "min":
171-
scores = np.min(metric_scores_arr, axis=1)
173+
scores = np.nanmin(metric_scores_arr, axis=1)
172174
elif self.metric_agg == "percentile_25":
173-
scores = np.quantile(metric_scores_arr, 0.25, axis=1)
175+
scores = np.nanquantile(metric_scores_arr, 0.25, axis=1)
174176
else:
175177
raise ValueError(f"Unknown metric_agg method: {self.metric_agg}")
176178

177-
# # Threshold for predict() as per sklearn conventions
178-
# ## NOTE: DATA LEAKAGE CONCERN
179-
# ## FIX LATER
180-
# self.offset_ = np.quantile(scores, (1 - self.contamination))
181-
182179
return scores
183180

184181
def score_samples(self, X: np.ndarray) -> np.ndarray:
@@ -190,46 +187,27 @@ def score_samples(self, X: np.ndarray) -> np.ndarray:
190187
"""
191188
return -self.decision_function(X)
192189

193-
# def predict(self, X: np.ndarray) -> np.ndarray:
194-
# """
195-
# Predict if a particular sample is an inlier (1) or outlie (-1).
196-
197-
# Parameters
198-
# ----------
199-
# X : array-like of shape (n_samples,)
200-
# The input samples.
201-
202-
# Returns
203-
# -------
204-
# is_outlier : ndarray of shape (n_samples,)
205-
# Returns -1 for outliers and 1 for inliers.
206-
# """
207-
# check_is_fitted(self)
208-
# scores = self.decision_function(X)
209-
# is_outlier = np.ones(X.shape[0], dtype=int)
210-
# is_outlier[scores >= self.offset_] = -1
211-
# return is_outlier
212-
213-
# def predict(self, X: np.ndarray) -> np.ndarray:
214-
# NOTE: UNCOMMENT AFTER FIXING ABOVE offset_ DATA LEAKAGE CONCERN
215-
# """
216-
# Predict if a particular sample is an inlier or outlier.
217-
218-
# Parameters
219-
# ----------
220-
# X : array-like of shape (n_samples,)
221-
# The input samples.
222-
223-
# Returns
224-
# -------
225-
# is_outlier : ndarray of shape (n_samples,)
226-
# Returns -1 for outliers and 1 for inliers.
227-
# """
228-
# scores = self.decision_function(X)
229-
# is_outlier = np.ones(X.shape[0], dtype=int)
230-
# is_outlier[scores >= self.offset_] = -1
231-
# return is_outlier
232-
233-
234-
# ref:
235-
# DOI: 10.2196/27172
190+
def predict(self, X: np.ndarray) -> np.ndarray:
191+
"""
192+
Predict if a particular sample is an inlier (1) or outlier (-1).
193+
194+
This method uses the threshold learned during the `fit` phase.
195+
196+
Parameters
197+
----------
198+
X : array-like of shape (n_samples, n_features)
199+
The input samples.
200+
201+
Returns
202+
-------
203+
is_outlier : ndarray of shape (n_samples,)
204+
Returns -1 for outliers and 1 for inliers.
205+
"""
206+
check_is_fitted(self)
207+
scores = self.decision_function(X)
208+
209+
# Compare scores against the pre-computed threshold
210+
is_outlier = np.full(X.shape[0], 1, dtype=int)
211+
is_outlier[scores >= self.offset_] = -1
212+
213+
return is_outlier

0 commit comments

Comments
 (0)