TimeEval · yschroeder97 · Jan 6, 2025
diff --git a/hst/Dockerfile b/hst/Dockerfile
@@ -0,0 +1,14 @@
+# Container image for the Half Space Trees (HST) algorithm, built on top of
+# the TimeEval Python 3 base image.
+FROM ghcr.io/timeeval/python3-base:0.3.0
+
+LABEL maintainer="[email protected]"
+LABEL org.opencontainers.image.licenses=MIT
+
+# Location of the algorithm script inside the image.
+# NOTE(review): presumably read by the base image's entrypoint to start the
+# algorithm - confirm against python3-base.
+ENV ALGORITHM_MAIN="/app/algorithm.py"
+
+# install algorithm dependencies
+# (done before copying the sources so that this layer can be reused from the
+# build cache when only the code changes)
+COPY requirements.txt /app/
+RUN pip install -r /app/requirements.txt
+
+# algorithm manifest and sources
+COPY manifest.json /app/
+COPY hst.py /app/
+COPY algorithm.py /app/
diff --git a/hst/LICENSE b/hst/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2020-2022 Phillip Wenig and Sebastian Schmidl
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/hst/README.md b/hst/README.md
@@ -0,0 +1,27 @@
+# Half Space Trees (HST)
+
+Half-space trees are an online variant of isolation forests. 
+They work well when anomalies are spread out.
+However, they do not work well if anomalies are packed together in windows.
+
+|||
+| :--- | :--- |
+| Citekey | tan2011fast |
+| Source Code | https://github.com/online-ml/river/blob/main/river/anomaly/hst.py |
+| Learning type | unsupervised |
+| Input dimensionality | multivariate |
+|||
+
+## Parameters
+
+- `n_trees`: `int`, optional (default=10)  
+  Number of trees to use.
+- `height`: `int`, optional (default=8)  
+  Height of each tree. A tree of height `h` is made up of `h + 1` levels and
+  therefore contains `2 ** (h + 1) - 1` nodes.
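+- `window_size`: `int`, optional (default=250)  
+  Number of observations to use for calculating the mass at each node in each tree.
+- `random_state`: `int`, optional (default=42)  
+  Seed for the random number generators, used to make results reproducible.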
+
+## Citation format (for source code)
+
+ > [Tan, S.C., Ting, K.M. and Liu, T.F., 2011, June. Fast anomaly detection for streaming data. In Twenty-Second International Joint Conference on Artificial Intelligence.](https://www.ijcai.org/Proceedings/11/Papers/254.pdf)
diff --git a/hst/algorithm.py b/hst/algorithm.py
@@ -0,0 +1,87 @@
+#!/usr/bin/env python3
+import argparse
+import json
+import sys
+import numpy as np
+import pandas as pd
+
+from dataclasses import dataclass
+
+from numpy.lib.stride_tricks import sliding_window_view
+
+from hst import HalfSpaceTrees
+from river import compose, preprocessing
+
+
+
+@dataclass
+class CustomParameters:
+    """Tunable parameters of the algorithm together with their defaults.
+
+    Values supplied under the ``customParameters`` key of the JSON arguments
+    override these defaults.
+    """
+
+    # Number of trees to use.
+    n_trees: int = 10
+    # Height of each tree.
+    height: int = 8 
+    # Number of observations used for calculating the mass at each node.
+    window_size: int = 250
+    # Seed for Python's `random`, NumPy's global RNG and the model itself.
+    random_state: int = 42
+
+
+class AlgorithmArgs(argparse.Namespace):
+    """Namespace holding the arguments handed to the script as a JSON string."""
+
+    @staticmethod
+    def from_sys_args() -> 'AlgorithmArgs':
+        """Build the arguments from the JSON string in ``sys.argv[1]``.
+
+        Entries of ``customParameters`` that are not fields of
+        ``CustomParameters`` are dropped; the remaining ones override the
+        dataclass defaults. A missing or ``null`` ``customParameters`` entry
+        yields the defaults.
+
+        Returns:
+            The parsed arguments, with ``customParameters`` replaced by a
+            ``CustomParameters`` instance.
+        """
+        # Local import: only ``dataclass`` is imported at module level.
+        from dataclasses import fields
+
+        args: dict = json.loads(sys.argv[1])
+        # Only real dataclass fields are valid constructor keywords; ``dir()``
+        # would also admit methods and dunder attributes such as ``__eq__``,
+        # which then make the constructor call fail with a TypeError.
+        known_keys = {field.name for field in fields(CustomParameters)}
+        supplied = args.get("customParameters") or {}
+        filtered_parameters = {
+            key: value for key, value in supplied.items() if key in known_keys
+        }
+        args["customParameters"] = CustomParameters(**filtered_parameters)
+        return AlgorithmArgs(**args)
+
+
+def set_random_state(config: AlgorithmArgs) -> None:
+    """Seed Python's ``random`` module and NumPy's global random generator.
+
+    The seed is read from ``config.customParameters.random_state``.
+    """
+    seed = config.customParameters.random_state
+    import random
+
+    # Apply the same seed to both global generators.
+    for seed_generator in (random.seed, np.random.seed):
+        seed_generator(seed)
+
+def read_csv_in_batches(filepath, batch_size):
+    """Lazily yield the ``value`` column of a CSV file chunk by chunk.
+
+    Args:
+        filepath: Location of the CSV file; it must contain a ``value`` column.
+        batch_size: Maximum number of rows read per chunk.
+
+    Yields:
+        The ``value`` entries of each consecutive chunk of at most
+        ``batch_size`` rows.
+    """
+    chunk_reader = pd.read_csv(filepath, chunksize=batch_size)
+    yield from (chunk["value"].values for chunk in chunk_reader)
+
+
+def main(config: AlgorithmArgs):
+    """Score the input series with Half-Space Trees and write the scores.
+
+    The ``value`` column of ``config.dataInput`` is read in batches and cut
+    into overlapping subsequences of 20 consecutive values. Each subsequence
+    is fed to a min-max-scaled Half-Space-Trees model as one observation:
+    the model first learns from it and is then asked to score it.
+
+    The last 19 values of every batch are carried over to the next one, so
+    subsequences also span batch boundaries and a short final batch does not
+    raise. One score per subsequence is produced, i.e.
+    ``len(series) - 20 + 1`` scores in total (none if the series is shorter
+    than one subsequence).
+
+    Args:
+        config: Parsed algorithm arguments. Reads ``dataInput``,
+            ``dataOutput`` and ``customParameters``.
+    """
+    batch_size = 1024
+    subsequence_length = 20
+
+    set_random_state(config)
+
+    params = config.customParameters
+    model = compose.Pipeline(
+        preprocessing.MinMaxScaler(),
+        HalfSpaceTrees(
+            n_trees=params.n_trees,
+            height=params.height,
+            window_size=params.window_size,
+            seed=params.random_state,
+        ),
+    )
+
+    scores = []
+    carry = None  # trailing values of the previous batch, not yet a full window start
+
+    for batch in read_csv_in_batches(config.dataInput, batch_size):
+        values = batch if carry is None else np.concatenate((carry, batch))
+
+        if len(values) < subsequence_length:
+            # Not enough data for a single window yet; wait for more rows.
+            carry = values
+            continue
+
+        for subsequence in sliding_window_view(values, window_shape=subsequence_length):
+            # A fresh dict per observation, keyed by the position in the window.
+            features = dict(enumerate(subsequence))
+            model.learn_one(features)
+            scores.append(model.score_one(features))
+
+        # Keep the values that still start a window once the next batch arrives.
+        carry = values[len(values) - (subsequence_length - 1):]
+
+    # NOTE(review): assumes the JSON arguments carry a `dataOutput` path as in
+    # the TimeEval algorithm interface, and that window-level scores are
+    # mapped back to points by post-processing - confirm.
+    np.savetxt(config.dataOutput, np.asarray(scores, dtype=float), delimiter=",")
+
+
+if __name__ == "__main__":
+    # Entry point: expects exactly one argument, a JSON string with the
+    # execution type, the data paths and the custom parameters.
+    if len(sys.argv) != 2:
+        print("Wrong number of arguments specified; expected a single json-string!")
+        # `sys.exit` instead of the `site`-provided `exit`, which is meant for
+        # interactive use and is not guaranteed to exist (e.g. `python -S`).
+        sys.exit(1)
+
+    config = AlgorithmArgs.from_sys_args()
+    print(f"Config: {config}")
+
+    if config.executionType == "train":
+        # The model learns online while scoring, so there is no separate
+        # training step.
+        print("Nothing to train, finished!")
+    elif config.executionType == "execute":
+        main(config)
+    else:
+        raise ValueError(f"Unknown execution type '{config.executionType}'; expected either 'train' or 'execute'!")