@@ -109,8 +109,9 @@ def fit(self, X: np.ndarray, y: np.ndarray) -> "DistanceAnomaly":
109109 self .metrics_ = self .metrics
110110
111111 # Calculate anomaly threshold based on train scores
112- # train_scores = self.decision_function(X)
113- # self.offset_ = np.quantile(train_scores, 1.0 - self.contamination)
112+ train_scores = self .decision_function (X )
113+
114+ self .offset_ = np .quantile (train_scores , 1.0 - self .contamination )
114115
115116 return self
116117
@@ -135,11 +136,10 @@ def decision_function(self, X: np.ndarray) -> np.ndarray:
135136 metric_scores = []
136137
137138 for metric in self .metrics_ :
138- # Get dataframe for distances to all centroids from dcpy
139139 self .clf_ .predict_and_analyse (X , metric = metric )
140140 dist_df = self .clf_ .centroid_dist_df_
141141
142- # 1. Aggregate distances across clusters the current metric
142+ # Aggregate distances across clusters for the current metric
143143 if self .cluster_agg == "min" :
144144 score_for_metric = dist_df .min (axis = 1 ).values
145145 elif self .cluster_agg == "median" :
@@ -153,32 +153,29 @@ def decision_function(self, X: np.ndarray) -> np.ndarray:
153153
154154 metric_scores_arr = np .array (metric_scores ).T # shape (n_samples, n_metrics)
155155 # remove infinities
156- metric_scores_arr [metric_scores_arr == np .inf ] = 1e9 # A large number
157- metric_scores_arr [metric_scores_arr == - np .inf ] = - 1e9 # A large negative number
158-
156+ metric_scores_arr [metric_scores_arr == np .inf ] = 1e9 # A large number
157+ metric_scores_arr [metric_scores_arr == - np .inf ] = (
158+ - 1e9
159+ ) # A large negative number
159160
160161 if self .normalize_scores :
161- # Scale scores for each metric (column) to be between 0 and 1
162- # Compare with Rio notebook once.
162+ col_means = np .nanmean (metric_scores_arr , axis = 0 )
163+ inds = np .where (np .isnan (metric_scores_arr ))
164+ metric_scores_arr [inds ] = np .take (col_means , inds [1 ])
163165 metric_scores_arr = minmax_scale (metric_scores_arr , axis = 0 )
164-
165- # 2. Aggregate scores across all metrics for final anomaly score
166+
167+ # Aggregate scores across all metrics for final anomaly score
166168 if self .metric_agg == "median" :
167- scores = np .median (metric_scores_arr , axis = 1 )
169+ scores = np .nanmedian (metric_scores_arr , axis = 1 )
168170 elif self .metric_agg == "mean" :
169- scores = np .mean (metric_scores_arr , axis = 1 )
171+ scores = np .nanmean (metric_scores_arr , axis = 1 )
170172 elif self .metric_agg == "min" :
171- scores = np .min (metric_scores_arr , axis = 1 )
173+ scores = np .nanmin (metric_scores_arr , axis = 1 )
172174 elif self .metric_agg == "percentile_25" :
173- scores = np .quantile (metric_scores_arr , 0.25 , axis = 1 )
175+ scores = np .nanquantile (metric_scores_arr , 0.25 , axis = 1 )
174176 else :
175177 raise ValueError (f"Unknown metric_agg method: { self .metric_agg } " )
176178
177- # # Threshold for predict() as per sklearn conventions
178- # ## NOTE: DATA LEAKAGE CONCERN
179- # ## FIX LATER
180- # self.offset_ = np.quantile(scores, (1 - self.contamination))
181-
182179 return scores
183180
184181 def score_samples (self , X : np .ndarray ) -> np .ndarray :
@@ -190,46 +187,27 @@ def score_samples(self, X: np.ndarray) -> np.ndarray:
190187 """
191188 return - self .decision_function (X )
192189
193- # def predict(self, X: np.ndarray) -> np.ndarray:
194- # """
195- # Predict if a particular sample is an inlier (1) or outlie (-1).
196-
197- # Parameters
198- # ----------
199- # X : array-like of shape (n_samples,)
200- # The input samples.
201-
202- # Returns
203- # -------
204- # is_outlier : ndarray of shape (n_samples,)
205- # Returns -1 for outliers and 1 for inliers.
206- # """
207- # check_is_fitted(self)
208- # scores = self.decision_function(X)
209- # is_outlier = np.ones(X.shape[0], dtype=int)
210- # is_outlier[scores >= self.offset_] = -1
211- # return is_outlier
212-
213- # def predict(self, X: np.ndarray) -> np.ndarray:
214- # NOTE: UNCOMMENT AFTER FIXING ABOVE offset_ DATA LEAKAGE CONCERN
215- # """
216- # Predict if a particular sample is an inlier or outlier.
217-
218- # Parameters
219- # ----------
220- # X : array-like of shape (n_samples,)
221- # The input samples.
222-
223- # Returns
224- # -------
225- # is_outlier : ndarray of shape (n_samples,)
226- # Returns -1 for outliers and 1 for inliers.
227- # """
228- # scores = self.decision_function(X)
229- # is_outlier = np.ones(X.shape[0], dtype=int)
230- # is_outlier[scores >= self.offset_] = -1
231- # return is_outlier
232-
233-
234- # ref:
235- # DOI: 10.2196/27172
190+ def predict (self , X : np .ndarray ) -> np .ndarray :
191+ """
192+ Predict if a particular sample is an inlier (1) or outlier (-1).
193+
194+ This method uses the threshold learned during the `fit` phase.
195+
196+ Parameters
197+ ----------
198+ X : array-like of shape (n_samples, n_features)
199+ The input samples.
200+
201+ Returns
202+ -------
203+ is_outlier : ndarray of shape (n_samples,)
204+ Returns -1 for outliers and 1 for inliers.
205+ """
206+ check_is_fitted (self )
207+ scores = self .decision_function (X )
208+
209+ # Compare scores against the pre-computed threshold
210+ is_outlier = np .full (X .shape [0 ], 1 , dtype = int )
211+ is_outlier [scores >= self .offset_ ] = - 1
212+
213+ return is_outlier
0 commit comments