diff --git a/hst/Dockerfile b/hst/Dockerfile
new file mode 100644
index 0000000..1784e8c
--- /dev/null
+++ b/hst/Dockerfile
@@ -0,0 +1,14 @@
+FROM ghcr.io/timeeval/python3-base:0.3.0
+
+LABEL maintainer="yannik.schroeder@tu-berlin.de"
+LABEL org.opencontainers.image.licenses=MIT
+
+ENV ALGORITHM_MAIN="/app/algorithm.py"
+
+# install algorithm dependencies
+COPY requirements.txt /app/
+RUN pip install -r /app/requirements.txt
+
+COPY manifest.json /app/
+COPY hst.py /app/
+COPY algorithm.py /app/
diff --git a/hst/LICENSE b/hst/LICENSE
new file mode 100644
index 0000000..e594ec8
--- /dev/null
+++ b/hst/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2020-2022 Phillip Wenig and Sebastian Schmidl
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/hst/README.md b/hst/README.md
new file mode 100644
index 0000000..07b70b1
--- /dev/null
+++ b/hst/README.md
@@ -0,0 +1,38 @@
+# Half Space Trees (HST)
+
+Half-space trees are an online variant of isolation forests.
+They work well when anomalies are spread out.
+However, they do not work well if anomalies are packed together in windows.
+
+|||
+| :--- | :--- |
+| Citekey | tan2011fast |
+| Source Code | https://github.com/online-ml/river/blob/main/river/anomaly/hst.py |
+| Learning type | unsupervised |
+| Input dimensionality | multivariate |
+|||
+
+## Parameters
+
+- `n_trees`: `int`, optional (default=10)
+  Number of trees to use.
+- `height`: `int`, optional (default=8)
+  Height of each tree. A tree of height `h` is made up of `h + 1` levels and
+  therefore contains `2 ** (h + 1) - 1` nodes.
+- `window_size`: `int`, optional (default=250)
+  Number of observations to use for calculating the mass at each node in each tree.
+
+## Citation format (for source code)
+
+> [Tan, S.C., Ting, K.M. and Liu, T.F., 2011, June. Fast anomaly detection for streaming data. In Twenty-Second International Joint Conference on Artificial Intelligence.](https://www.ijcai.org/Proceedings/11/Papers/254.pdf)
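+
+## Usage
+
+`algorithm.py` follows the TimeEval algorithm interface: it receives its whole configuration as a
+single JSON string on the command line. The input CSV must provide a `value` column, and the
+anomaly scores (one per subsequence window) are written to `dataOutput`, one score per line.
+A minimal invocation sketch with illustrative paths:
+
+```bash
+python algorithm.py '{"executionType": "execute", "dataInput": "/data/dataset.csv", "dataOutput": "/results/anomaly_scores.ts", "customParameters": {"n_trees": 10, "height": 8, "window_size": 250, "random_state": 42}}'
+```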
diff --git a/hst/algorithm.py b/hst/algorithm.py
new file mode 100755
index 0000000..c2e454c
--- /dev/null
+++ b/hst/algorithm.py
@@ -0,0 +1,90 @@
+#!/usr/bin/env python3
+import argparse
+import json
+import sys
+import numpy as np
+import pandas as pd
+
+from dataclasses import dataclass
+
+from numpy.lib.stride_tricks import sliding_window_view
+
+from hst import HalfSpaceTrees
+from river import compose, preprocessing
+
+
+@dataclass
+class CustomParameters:
+    n_trees: int = 10
+    height: int = 8
+    window_size: int = 250
+    random_state: int = 42
+
+
+class AlgorithmArgs(argparse.Namespace):
+    @staticmethod
+    def from_sys_args() -> 'AlgorithmArgs':
+        args: dict = json.loads(sys.argv[1])
+        custom_parameter_keys = dir(CustomParameters())
+        filtered_parameters = dict(filter(lambda x: x[0] in custom_parameter_keys, args.get("customParameters", {}).items()))
+        args["customParameters"] = CustomParameters(**filtered_parameters)
+        return AlgorithmArgs(**args)
+
+
+def set_random_state(config: AlgorithmArgs) -> None:
+    seed = config.customParameters.random_state
+    import random
+    random.seed(seed)
+    np.random.seed(seed)
+
+
+def read_csv_in_batches(filepath, batch_size):
+    iterator = pd.read_csv(filepath, chunksize=batch_size)
+
+    for batch in iterator:
+        yield batch["value"].values
+
+
+def main(config: AlgorithmArgs):
+    batch_size = 1024
+    subsequence_length = 20
+
+    set_random_state(config)
+
+    model = compose.Pipeline(
+        preprocessing.MinMaxScaler(),
+        HalfSpaceTrees(n_trees=config.customParameters.n_trees, height=config.customParameters.height, window_size=config.customParameters.window_size,
+                       seed=config.customParameters.random_state)
+    )
+
+    scores = []
+
+    for batch in read_csv_in_batches(config.dataInput, batch_size):
+        # score every subsequence window of the batch (windows do not span batch boundaries)
+        subsequences = sliding_window_view(batch, window_shape=subsequence_length)
+        features = {i: 0 for i in range(subsequence_length)}
+
+        for subsequence in subsequences:
+            for j, value in enumerate(subsequence):
+                features[j] = value
+            model.learn_one(features)
+            scores.append(model.score_one(features))
+
+    # persist the anomaly scores (one per subsequence window) to the output path provided by TimeEval
+    np.array(scores).tofile(config.dataOutput, sep="\n")
+
+
+if __name__ == "__main__":
+    if len(sys.argv) != 2:
+        print("Wrong number of arguments specified; expected a single json-string!")
+        exit(1)
+
+    config = AlgorithmArgs.from_sys_args()
+    print(f"Config: {config}")
+
+    if config.executionType == "train":
+        print("Nothing to train, finished!")
+    elif config.executionType == "execute":
+        main(config)
+    else:
+        raise ValueError(f"Unknown execution type '{config.executionType}'; expected either 'train' or 'execute'!")
diff --git a/hst/hst.py b/hst/hst.py
new file mode 100644
index 0000000..425f38b
--- /dev/null
+++ b/hst/hst.py
@@ -0,0 +1,287 @@
+from __future__ import annotations
+
+import collections
+import functools
+import random
+
+from river import anomaly, base, tree
+
+__all__ = ["HalfSpaceTrees"]
+
+
+class HSTBranch(tree.base.Branch):
+    def __init__(self, left, right, feature, threshold, l_mass, r_mass):
+        super().__init__(left, right)
+        self.feature = feature
+        self.threshold = threshold
+        self.l_mass = l_mass
+        self.r_mass = r_mass
+
+    @property
+    def left(self):
+        return self.children[0]
+
+    @property
+    def right(self):
+        return self.children[1]
+
+    def next(self, x):
+        """
+
+        We want to handle the case where a split feature is missing. In that case, we go down the
+        child that has been the most visited in the past.
+ + """ + left, right = self.children + try: + value = x[self.feature] + except KeyError: + if left.l_mass < right.l_mass: + return right + return left + if value < self.threshold: + return left + return right + + def most_common_path(self): + raise NotImplementedError + + @property + def repr_split(self): + return f"{self.feature} < {self.threshold:.5f}" + + +class HSTLeaf(tree.base.Leaf): + def __repr__(self): + return str(self.r_mass) + + +def make_padded_tree(limits, height, padding, rng=random, **node_params): + if height == 0: + return HSTLeaf(**node_params) + + # Randomly pick a feature + # We weight each feature by the gap between each feature's limits + on = rng.choices( + population=list(limits.keys()), + weights=[limits[i][1] - limits[i][0] for i in limits], + )[0] + + # Pick a split point; use padding to avoid too narrow a split + a = limits[on][0] + b = limits[on][1] + at = rng.uniform(a + padding * (b - a), b - padding * (b - a)) + + # Build the left node + tmp = limits[on] + limits[on] = (tmp[0], at) + left = make_padded_tree( + limits=limits, height=height - 1, padding=padding, rng=rng, **node_params + ) + limits[on] = tmp + + # Build the right node + tmp = limits[on] + limits[on] = (at, tmp[1]) + right = make_padded_tree( + limits=limits, height=height - 1, padding=padding, rng=rng, **node_params + ) + limits[on] = tmp + + return HSTBranch(left=left, right=right, feature=on, threshold=at, **node_params) + + +class HalfSpaceTrees(anomaly.base.AnomalyDetector): + """Half-Space Trees (HST). + + Half-space trees are an online variant of isolation forests. They work well when anomalies are + spread out. However, they do not work well if anomalies are packed together in windows. + + By default, this implementation assumes that each feature has values that are comprised + between 0 and 1. If this isn't the case, then you can manually specify the limits via the + `limits` argument. If you do not know the limits in advance, then you can use a + `preprocessing.MinMaxScaler` as an initial preprocessing step. + + The current implementation builds the trees the first time the `learn_one` method is called. + Therefore, the first `learn_one` call might be slow, whereas subsequent calls will be very fast + in comparison. In general, the computation time of both `learn_one` and `score_one` scales + linearly with the number of trees, and exponentially with the height of each tree. + + Note that high scores indicate anomalies, whereas low scores indicate normal observations. + + Parameters + ---------- + n_trees + Number of trees to use. + height + Height of each tree. Note that a tree of height `h` is made up of `h + 1` levels and + therefore contains `2 ** (h + 1) - 1` nodes. + window_size + Number of observations to use for calculating the mass at each node in each tree. + limits + Specifies the range of each feature. By default each feature is assumed to be in + range `[0, 1]`. + seed + Random number seed. + + Examples + -------- + + >>> from river import anomaly + + >>> X = [0.5, 0.45, 0.43, 0.44, 0.445, 0.45, 0.0] + >>> hst = anomaly.HalfSpaceTrees( + ... n_trees=5, + ... height=3, + ... window_size=3, + ... seed=42 + ... ) + + >>> for x in X[:3]: + ... hst.learn_one({'x': x}) # Warming up + + >>> for x in X: + ... features = {'x': x} + ... hst.learn_one(features) + ... 
+    ...     print(f'Anomaly score for x={x:.3f}: {hst.score_one(features):.3f}')
+    Anomaly score for x=0.500: 0.107
+    Anomaly score for x=0.450: 0.071
+    Anomaly score for x=0.430: 0.107
+    Anomaly score for x=0.440: 0.107
+    Anomaly score for x=0.445: 0.107
+    Anomaly score for x=0.450: 0.071
+    Anomaly score for x=0.000: 0.853
+
+    The feature values are all comprised between 0 and 1. This is what is assumed by the model
+    by default. In the following example, we construct a pipeline that scales the data online
+    and ensures that the values of each feature are comprised between 0 and 1.
+
+    >>> from river import compose
+    >>> from river import datasets
+    >>> from river import metrics
+    >>> from river import preprocessing
+
+    >>> model = compose.Pipeline(
+    ...     preprocessing.MinMaxScaler(),
+    ...     anomaly.HalfSpaceTrees(seed=42)
+    ... )
+
+    >>> auc = metrics.ROCAUC()
+
+    >>> for x, y in datasets.CreditCard().take(2500):
+    ...     score = model.score_one(x)
+    ...     model.learn_one(x)
+    ...     auc.update(y, score)
+
+    >>> auc
+    ROCAUC: 91.15%
+
+    You can also use the `evaluate.progressive_val_score` function to evaluate the model on a
+    data stream.
+
+    >>> from river import evaluate
+
+    >>> model = model.clone()
+
+    >>> evaluate.progressive_val_score(
+    ...     dataset=datasets.CreditCard().take(2500),
+    ...     model=model,
+    ...     metric=metrics.ROCAUC(),
+    ...     print_every=1000
+    ... )
+    [1,000] ROCAUC: 88.43%
+    [2,000] ROCAUC: 89.28%
+    [2,500] ROCAUC: 91.15%
+    ROCAUC: 91.15%
+
+    References
+    ----------
+    [^1]: [Tan, S.C., Ting, K.M. and Liu, T.F., 2011, June. Fast anomaly detection for streaming data. In Twenty-Second International Joint Conference on Artificial Intelligence.](https://www.ijcai.org/Proceedings/11/Papers/254.pdf)
+
+    """
+
+    def __init__(
+        self,
+        n_trees=10,
+        height=8,
+        window_size=250,
+        limits: dict[base.typing.FeatureName, tuple[float, float]] | None = None,
+        seed: int | None = None,
+    ):
+        self.n_trees = n_trees
+        self.window_size = window_size
+        self.height = height
+        self.limits: collections.defaultdict = collections.defaultdict(
+            functools.partial(tuple, (0.0, 1.0))
+        )
+        if limits is not None:
+            self.limits.update(limits)
+        self.seed = seed
+        self.rng = random.Random(seed)
+
+        self.trees: list[HSTBranch] = []
+        self.counter = 0
+        self._first_window = True
+
+    @property
+    def size_limit(self):
+        """This is the threshold under which the node search stops during the scoring phase.
+
+        The value .1 is a magic constant indicated in the original paper.
+ + """ + return 0.1 * self.window_size + + @property + def _max_score(self): + """The largest potential anomaly score.""" + return self.n_trees * self.window_size * (2 ** (self.height + 1) - 1) + + def learn_one(self, x): + # The trees are built when the first observation comes in + + if not self.trees: + self.trees = [ + make_padded_tree( + limits={i: self.limits[i] for i in sorted(x)}, + height=self.height, + padding=0.15, + rng=self.rng, + # kwargs + r_mass=0, + l_mass=0, + ) + for _ in range(self.n_trees) + ] + + # Update each tree + for t in self.trees: + for node in t.walk(x): + node.l_mass += 1 + + # Pivot the masses if necessary + self.counter += 1 + if self.counter == self.window_size: + for t in self.trees: + for node in t.iter_dfs(): + node.r_mass = node.l_mass + node.l_mass = 0 + self._first_window = False + self.counter = 0 + + def score_one(self, x): + if self._first_window: + return 0 + + score = 0.0 + for t in self.trees: + for depth, node in enumerate(t.walk(x)): + score += node.r_mass * 2**depth + if node.r_mass < self.size_limit: + break + + # Normalize the score between 0 and 1 + score /= self._max_score + + # We want high score -> anomaly, but we have high score -> normal + return 1 - score diff --git a/hst/manifest.json b/hst/manifest.json new file mode 100644 index 0000000..f68f1c8 --- /dev/null +++ b/hst/manifest.json @@ -0,0 +1,43 @@ +{ + "title": "HST", + "description": "Implementation of https://citeseerx.ist.psu.edu/document?repid=rep1&type=pdf&doi=73b6b7d9e7e225719ad86234927a3b60a4a873c0.", + "inputDimensionality": "multivariate", + "version": "0.3.1", + "authors": "Swee Chuan Tan, Kai Ming Ting, Tony Fei Liu", + "language": "Python", + "type": "Detector", + "mainFile": "algorithm.py", + "learningType": "unsupervised", + "executionStep": { + "parameters": [ + { + "name": "n_trees", + "type": "int", + "defaultValue": 10, + "optional": "true", + "description": "Number of trees to build." + },{ + "name": "height", + "type": "int", + "defaultValue": 8, + "optional": "true", + "description": "Height of each tree. Note that a tree of height `h` is made up of `h + 1` levels and therefore contains `2 ** (h + 1) - 1` nodes." + }, + { + "name": "window_size", + "type": "int", + "defaultValue": 250, + "optional": "true", + "description": "Number of observations to use for calculating the mass at each node in each tree." + }, + { + "name": "random_state", + "type": "int", + "defaultValue": 42, + "optional": "true", + "description": "Seed for random number generation." + } + ], + "modelInput": "none" + } +} diff --git a/hst/requirements.txt b/hst/requirements.txt new file mode 100644 index 0000000..6e46cf3 --- /dev/null +++ b/hst/requirements.txt @@ -0,0 +1,3 @@ +numpy>=1.19.5 +pandas>=1.2.1 +river