Skip to content

Commit 3277904

Browse files
committed
Implemented offset and predict in anomaly
1 parent dbd4d98 commit 3277904

2 files changed

Lines changed: 42 additions & 64 deletions

File tree

distclassipy/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@
2828
)
2929
from .distances import _ALL_METRICS, _UNIQUE_METRICS
3030

31-
__version__ = "0.2.2a3"
31+
__version__ = "0.2.2a4"
3232

3333
__all__ = [
3434
"DistanceMetricClassifier",

distclassipy/anomaly.py

Lines changed: 41 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -109,8 +109,9 @@ def fit(self, X: np.ndarray, y: np.ndarray) -> "DistanceAnomaly":
109109
self.metrics_ = self.metrics
110110

111111
# Calculate anomaly threshold based on train scores
112-
# train_scores = self.decision_function(X)
113-
# self.offset_ = np.quantile(train_scores, 1.0 - self.contamination)
112+
train_scores = self.decision_function(X)
113+
114+
self.offset_ = np.quantile(train_scores, 1.0 - self.contamination)
114115

115116
return self
116117

@@ -135,11 +136,10 @@ def decision_function(self, X: np.ndarray) -> np.ndarray:
135136
metric_scores = []
136137

137138
for metric in self.metrics_:
138-
# Get dataframe for distances to all centroids from dcpy
139139
self.clf_.predict_and_analyse(X, metric=metric)
140140
dist_df = self.clf_.centroid_dist_df_
141141

142-
# 1. Aggregate distances across clusters the current metric
142+
# Aggregate distances across clusters for the current metric
143143
if self.cluster_agg == "min":
144144
score_for_metric = dist_df.min(axis=1).values
145145
elif self.cluster_agg == "median":
@@ -153,32 +153,29 @@ def decision_function(self, X: np.ndarray) -> np.ndarray:
153153

154154
metric_scores_arr = np.array(metric_scores).T # shape (n_samples, n_metrics)
155155
# remove infinities
156-
metric_scores_arr[metric_scores_arr == np.inf] = 1e9 # A large number
157-
metric_scores_arr[metric_scores_arr == -np.inf] = -1e9 # A large negative number
158-
156+
metric_scores_arr[metric_scores_arr == np.inf] = 1e9 # A large number
157+
metric_scores_arr[metric_scores_arr == -np.inf] = (
158+
-1e9
159+
) # A large negative number
159160

160161
if self.normalize_scores:
161-
# Scale scores for each metric (column) to be between 0 and 1
162-
# Compare with Rio notebook once.
162+
col_means = np.nanmean(metric_scores_arr, axis=0)
163+
inds = np.where(np.isnan(metric_scores_arr))
164+
metric_scores_arr[inds] = np.take(col_means, inds[1])
163165
metric_scores_arr = minmax_scale(metric_scores_arr, axis=0)
164-
165-
# 2. Aggregate scores across all metrics for final anomaly score
166+
167+
# Aggregate scores across all metrics for final anomaly score
166168
if self.metric_agg == "median":
167-
scores = np.median(metric_scores_arr, axis=1)
169+
scores = np.nanmedian(metric_scores_arr, axis=1)
168170
elif self.metric_agg == "mean":
169-
scores = np.mean(metric_scores_arr, axis=1)
171+
scores = np.nanmean(metric_scores_arr, axis=1)
170172
elif self.metric_agg == "min":
171-
scores = np.min(metric_scores_arr, axis=1)
173+
scores = np.nanmin(metric_scores_arr, axis=1)
172174
elif self.metric_agg == "percentile_25":
173-
scores = np.quantile(metric_scores_arr, 0.25, axis=1)
175+
scores = np.nanquantile(metric_scores_arr, 0.25, axis=1)
174176
else:
175177
raise ValueError(f"Unknown metric_agg method: {self.metric_agg}")
176178

177-
# # Threshold for predict() as per sklearn conventions
178-
# ## NOTE: DATA LEAKAGE CONCERN
179-
# ## FIX LATER
180-
# self.offset_ = np.quantile(scores, (1 - self.contamination))
181-
182179
return scores
183180

184181
def score_samples(self, X: np.ndarray) -> np.ndarray:
@@ -190,46 +187,27 @@ def score_samples(self, X: np.ndarray) -> np.ndarray:
190187
"""
191188
return -self.decision_function(X)
192189

193-
# def predict(self, X: np.ndarray) -> np.ndarray:
194-
# """
195-
# Predict if a particular sample is an inlier (1) or outlie (-1).
196-
197-
# Parameters
198-
# ----------
199-
# X : array-like of shape (n_samples,)
200-
# The input samples.
201-
202-
# Returns
203-
# -------
204-
# is_outlier : ndarray of shape (n_samples,)
205-
# Returns -1 for outliers and 1 for inliers.
206-
# """
207-
# check_is_fitted(self)
208-
# scores = self.decision_function(X)
209-
# is_outlier = np.ones(X.shape[0], dtype=int)
210-
# is_outlier[scores >= self.offset_] = -1
211-
# return is_outlier
212-
213-
# def predict(self, X: np.ndarray) -> np.ndarray:
214-
# NOTE: UNCOMMENT AFTER FIXING ABOVE offset_ DATA LEAKAGE CONCERN
215-
# """
216-
# Predict if a particular sample is an inlier or outlier.
217-
218-
# Parameters
219-
# ----------
220-
# X : array-like of shape (n_samples,)
221-
# The input samples.
222-
223-
# Returns
224-
# -------
225-
# is_outlier : ndarray of shape (n_samples,)
226-
# Returns -1 for outliers and 1 for inliers.
227-
# """
228-
# scores = self.decision_function(X)
229-
# is_outlier = np.ones(X.shape[0], dtype=int)
230-
# is_outlier[scores >= self.offset_] = -1
231-
# return is_outlier
232-
233-
234-
# ref:
235-
# DOI: 10.2196/27172
190+
def predict(self, X: np.ndarray) -> np.ndarray:
191+
"""
192+
Predict if a particular sample is an inlier (1) or outlier (-1).
193+
194+
This method uses the threshold learned during the `fit` phase.
195+
196+
Parameters
197+
----------
198+
X : array-like of shape (n_samples, n_features)
199+
The input samples.
200+
201+
Returns
202+
-------
203+
is_outlier : ndarray of shape (n_samples,)
204+
Returns -1 for outliers and 1 for inliers.
205+
"""
206+
check_is_fitted(self)
207+
scores = self.decision_function(X)
208+
209+
# Compare scores against the pre-computed threshold
210+
is_outlier = np.full(X.shape[0], 1, dtype=int)
211+
is_outlier[scores >= self.offset_] = -1
212+
213+
return is_outlier

0 commit comments

Comments
 (0)