From 424d3cdee59b5349553e72868d14ad6cda63680e Mon Sep 17 00:00:00 2001 From: Lorenzo Guerra Date: Tue, 1 Jul 2025 11:23:39 +0200 Subject: [PATCH 01/33] added conda environment, adapted code to pytorch 2.6 and added Singularity setup scripts for postgres as an alternative to docker containers for cluster environments --- .gitignore | 10 +++ Makefile | 64 ++++++++++++++++ environment.yaml | 184 +++++++++++++++++++++++++++++++++++++++++++++ pidsmaker/main.py | 29 ++++++- postgres-start.sh | 121 +++++++++++++++++++++++++++++ postgres-status.sh | 80 ++++++++++++++++++++ postgres-stop.sh | 57 ++++++++++++++ 7 files changed, 541 insertions(+), 4 deletions(-) create mode 100644 Makefile create mode 100644 environment.yaml create mode 100755 postgres-start.sh create mode 100755 postgres-status.sh create mode 100755 postgres-stop.sh diff --git a/.gitignore b/.gitignore index 3ab4b839..a84235d0 100644 --- a/.gitignore +++ b/.gitignore @@ -169,3 +169,13 @@ nohup.out artifacts/ data/ docs/site/ +postgres_data/ +postgres_lock/ + +# Singularity files +*.sif +*.def + +# Postgres directories +postgres_config/ +postgres_run/ diff --git a/Makefile b/Makefile new file mode 100644 index 00000000..cc32c1b5 --- /dev/null +++ b/Makefile @@ -0,0 +1,64 @@ +# Makefile for Singularity PostgreSQL management + +.PHONY: up down status load-dumps full-setup logs clean help + +# PostgreSQL management +up: + @./postgres-start.sh + +down: + @./postgres-stop.sh + +status: + @./postgres-status.sh + +logs: + @echo "PostgreSQL logs:" + @tail -f postgres_log/postgresql*.log 2>/dev/null || echo "No logs found" + +clean: down + @echo "Cleaning up PostgreSQL data..." + @rm -rf postgres_data postgres_run postgres_log + @echo "PostgreSQL data cleaned" + +reset: clean up + +app-build: + @echo "Building PIDSMaker container..." + @singularity build pidsmaker.sif pidsmaker.def || echo "Build failed - check if you have fakeroot access" + +app-run: up + @echo "Running PIDSMaker application..." + @singularity run --nv \ + --env DB_HOST=localhost \ + --env DOCKER_PORT=5432 \ + --env DB_USER=postgres \ + --env DB_PASSWORD=postgres \ + --bind ${PWD}:/workspace \ + pidsmaker.sif + +load-dumps: up + @echo "Loading database dumps from inside container..." 
+ @if [ -f "./settings/scripts/load_dumps.sh" ]; then \ + echo "Found load_dumps.sh, executing inside container..."; \ + singularity exec instance://postgres_instance /scripts/load_dumps.sh; \ + else \ + echo "Error: ./settings/scripts/load_dumps.sh not found"; \ + exit 1; \ + fi + +full-setup: up load-dumps + @echo "PostgreSQL setup complete with dumps loaded" + +help: + @echo "Available commands:" + @echo " postgres-up - Start PostgreSQL" + @echo " postgres-down - Stop PostgreSQL" + @echo " postgres-status - Check PostgreSQL status" + @echo " postgres-logs - Show PostgreSQL logs" + @echo " postgres-load-dumps - Load database dumps" + @echo " postgres-full-setup - Start PostgreSQL and load dumps" + @echo " postgres-clean - Stop and remove all data" + @echo " postgres-reset - Clean and restart PostgreSQL" + @echo " app-build - Build PIDSMaker container" + @echo " app-run - Run PIDSMaker with PostgreSQL" diff --git a/environment.yaml b/environment.yaml new file mode 100644 index 00000000..7d0678d1 --- /dev/null +++ b/environment.yaml @@ -0,0 +1,184 @@ +name: pids +channels: + - conda-forge +dependencies: + - _libgcc_mutex=0.1=conda_forge + - _openmp_mutex=4.5=3_kmp_llvm + - bzip2=1.0.8=h4bc722e_7 + - ca-certificates=2025.6.15=hbd8a1cb_0 + - certifi=2025.6.15=pyhd8ed1ab_0 + - colorama=0.4.6=pyhd8ed1ab_1 + - cyrus-sasl=2.1.28=hd9c7081_0 + - icu=75.1=he02047a_0 + - keyutils=1.6.1=h166bdaf_0 + - krb5=1.21.3=h659f571_0 + - lcms2=2.17=h717163a_0 + - ld_impl_linux-64=2.43=h1423503_5 + - lerc=4.0.0=h0aef613_1 + - libdeflate=1.24=h86f0d12_0 + - libedit=3.1.20250104=pl5321h7949ede_0 + - libexpat=2.7.0=h5888daf_0 + - libffi=3.4.6=h2dba641_1 + - libfreetype=2.13.3=ha770c72_1 + - libfreetype6=2.13.3=h48d6fc4_1 + - libgcc=15.1.0=h767d61c_3 + - libgcc-ng=15.1.0=h69a702a_3 + - libgomp=15.1.0=h767d61c_3 + - libjpeg-turbo=3.1.0=hb9d3cd8_0 + - liblzma=5.8.1=hb9d3cd8_2 + - libnsl=2.0.1=hb9d3cd8_1 + - libntlm=1.8=hb9d3cd8_0 + - libpng=1.6.49=h943b412_0 + - libpq=17.5=h27ae623_0 + - libsqlite=3.50.1=h6cd9bfd_7 + - libstdcxx=15.1.0=h8f9b012_3 + - libstdcxx-ng=15.1.0=h4852527_3 + - libtiff=4.7.0=hf01ce69_5 + - libuuid=2.38.1=h0b41bf4_0 + - libwebp-base=1.5.0=h851e524_0 + - libxcb=1.17.0=h8a09558_0 + - libxcrypt=4.4.36=hd590300_1 + - libzlib=1.3.1=hb9d3cd8_2 + - llvm-openmp=20.1.7=h024ca30_0 + - ncurses=6.5=h2d0b736_3 + - openjpeg=2.5.3=h5fbd93e_0 + - openldap=2.6.10=he970967_0 + - openssl=3.5.0=h7b32b05_1 + - pillow=11.2.1=py39h15c0740_0 + - pip=25.1.1=pyh8b19718_0 + - psycopg2=2.9.10=py39h2bc273e_1 + - pthread-stubs=0.4=hb9d3cd8_1002 + - python=3.9.23=hc30ae73_0_cpython + - python_abi=3.9=7_cp39 + - readline=8.2=h8c095d6_2 + - tk=8.6.13=noxft_hd72426e_102 + - tqdm=4.67.1=pyhd8ed1ab_1 + - wheel=0.45.1=pyhd8ed1ab_1 + - xorg-libxau=1.0.12=hb9d3cd8_0 + - xorg-libxdmcp=1.1.5=hb9d3cd8_0 + - zstd=1.5.7=hb8e6e7a_2 + - pip: + - aiohappyeyeballs==2.6.1 + - aiohttp==3.12.13 + - aiosignal==1.3.2 + - appdirs==1.4.4 + - async-timeout==5.0.1 + - attrs==25.3.0 + - babel==2.17.0 + - backrefs==5.9 + - beautifulsoup4==4.13.4 + - cairocffi==1.7.0 + - cfgv==3.4.0 + - chardet==5.2.0 + - charset-normalizer==3.4.2 + - click==8.1.8 + - contourpy==1.3.0 + - coverage==7.9.1 + - cycler==0.12.1 + - distlib==0.3.9 + - docker-pycreds==0.4.0 + - exceptiongroup==1.3.0 + - filelock==3.18.0 + - fonttools==4.58.4 + - frozenlist==1.7.0 + - fsspec==2025.5.1 + - gdown==5.2.0 + - gensim==4.3.1 + - ghp-import==2.1.0 + - gitdb==4.0.12 + - gitpython==3.1.44 + - graphviz==0.20.1 + - identify==2.6.12 + - idna==3.10 + - igraph==0.11.5 + - 
importlib-metadata==8.7.0 + - importlib-resources==6.5.2 + - iniconfig==2.1.0 + - jinja2==3.1.6 + - joblib==1.5.1 + - kiwisolver==1.4.7 + - markdown==3.8.2 + - markupsafe==3.0.2 + - matplotlib==3.8.4 + - mergedeep==1.3.4 + - mkdocs==1.6.1 + - mkdocs-get-deps==0.2.0 + - mkdocs-glightbox==0.4.0 + - mkdocs-material==9.6.12 + - mkdocs-material-extensions==1.3.1 + - mpmath==1.3.0 + - multidict==6.5.1 + - networkx==2.8.7 + - nltk==3.8.1 + - nodeenv==1.9.1 + - numpy==1.26.4 + - nvidia-cublas-cu12==12.6.4.1 + - nvidia-cuda-cupti-cu12==12.6.80 + - nvidia-cuda-nvrtc-cu12==12.6.77 + - nvidia-cuda-runtime-cu12==12.6.77 + - nvidia-cudnn-cu12==9.5.1.17 + - nvidia-cufft-cu12==11.3.0.4 + - nvidia-cufile-cu12==1.11.1.6 + - nvidia-curand-cu12==10.3.7.77 + - nvidia-cusolver-cu12==11.7.1.2 + - nvidia-cusparse-cu12==12.5.4.2 + - nvidia-cusparselt-cu12==0.6.3 + - nvidia-nccl-cu12==2.26.2 + - nvidia-nvjitlink-cu12==12.6.85 + - nvidia-nvtx-cu12==12.6.77 + - packaging==25.0 + - paginate==0.5.7 + - pandas==2.2.2 + - pathspec==0.12.1 + - platformdirs==4.3.8 + - pluggy==1.6.0 + - pre-commit==4.2.0 + - propcache==0.3.2 + - protobuf==4.25.8 + - psutil==7.0.0 + - pyg-lib==0.4.0+pt27cu126 + - pygments==2.19.2 + - pymdown-extensions==10.16 + - pyparsing==3.2.3 + - pytest==8.3.5 + - pytest-cov==6.1.1 + - python-dateutil==2.9.0.post0 + - pytz==2024.1 + - pyyaml==6.0.2 + - pyyaml-env-tag==1.1 + - regex==2024.11.6 + - requests==2.32.4 + - scikit-learn==1.2.0 + - scipy==1.10.1 + - sentry-sdk==2.31.0 + - setproctitle==1.3.6 + - setuptools==61.0.0 + - six==1.17.0 + - smart-open==7.1.0 + - smmap==5.0.2 + - soupsieve==2.7 + - sympy==1.14.0 + - texttable==1.7.0 + - threadpoolctl==3.6.0 + - tomli==2.2.1 + - torch==2.7.1 + - torch-cluster==1.6.3+pt27cu126 + - torch-geometric==2.5.3 + - torch-scatter==2.1.2+pt27cu126 + - torch-sparse==0.6.18+pt27cu126 + - torch-spline-conv==1.2.2+pt27cu126 + - torchaudio==2.7.1 + - torchvision==0.22.1 + - triton==3.3.1 + - typing-extensions==4.14.0 + - tzdata==2025.2 + - urllib3==2.5.0 + - virtualenv==20.31.2 + - wandb==0.16.6 + - watchdog==6.0.0 + - wget==3.2 + - wrapt==1.17.2 + - xxhash==3.2.0 + - yacs==0.1.8 + - yarl==1.20.1 + - zipp==3.23.0 diff --git a/pidsmaker/main.py b/pidsmaker/main.py index 9fcf694e..561c59c3 100644 --- a/pidsmaker/main.py +++ b/pidsmaker/main.py @@ -5,7 +5,9 @@ import time from collections import defaultdict +import networkx as nx import torch +import torch_geometric import wandb from pidsmaker.config import ( @@ -20,7 +22,10 @@ gnn_training, graph_preprocessing, ) -from pidsmaker.experiments.tuning import fuse_cfg_with_sweep_cfg, get_tuning_sweep_cfg +from pidsmaker.experiments.tuning import ( + fuse_cfg_with_sweep_cfg, + get_tuning_sweep_cfg, +) from pidsmaker.experiments.uncertainty import ( avg_std_metrics, fuse_hyperparameter_metrics, @@ -40,8 +45,17 @@ from pidsmaker.triage import ( tracing, ) +from pidsmaker.utils.data_utils import CollatableTemporalData from pidsmaker.utils.utils import log, remove_underscore_keys, set_seed +torch.serialization.add_safe_globals( + [ + nx.classes.multidigraph.MultiDiGraph, + CollatableTemporalData, + torch_geometric.data.storage.GlobalStorage, + ] +) + def get_task_to_module(cfg): return { @@ -159,7 +173,10 @@ def run_pipeline_with_experiments(cfg): hyper_to_metrics = defaultdict(list) for hyper in hyperparameters: - log(f"[@hyperparameter {hyper}] - Started", pre_return_line=True) + log( + f"[@hyperparameter {hyper}] - Started", + pre_return_line=True, + ) for i in range(iterations): log(f"[@iteration {i}]", pre_return_line=True) @@ 
-184,7 +201,11 @@ def run_pipeline_with_experiments(cfg): for i in range(iterations): log(f"[@iteration {i}]", pre_return_line=True) cfg = update_cfg_for_uncertainty_exp( - method, i, iterations, copy.deepcopy(original_cfg), hyperparameter=None + method, + i, + iterations, + copy.deepcopy(original_cfg), + hyperparameter=None, ) metrics, times = run_pipeline(cfg, method=method, iteration=i) method_to_metrics[method].append({**metrics, **times}) @@ -261,7 +282,7 @@ def run_pipeline_from_sweep(cfg): project = "PIDSMaker" wandb.init( - mode="online" if (args.wandb and args.tuning_mode == "none") else "disabled", + mode=("online" if (args.wandb and args.tuning_mode == "none") else "disabled"), project=project, name=exp_name, tags=tags, diff --git a/postgres-start.sh b/postgres-start.sh new file mode 100755 index 00000000..1a5eb78b --- /dev/null +++ b/postgres-start.sh @@ -0,0 +1,121 @@ +#!/bin/bash + +# PostgreSQL startup script for Singularity + +set -e + +# Configuration +POSTGRES_IMAGE="postgres.sif" +POSTGRES_INSTANCE="postgres_instance" +DATA_DIR="postgres_data" +RUN_DIR="postgres_run" +LOG_DIR="postgres_log" + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +NC='\033[0m' # No Color + +echo -e "${YELLOW}Starting PostgreSQL with Singularity...${NC}" + +# Check if postgres.sif exists +if [ ! -f "$POSTGRES_IMAGE" ]; then + echo -e "${YELLOW}PostgreSQL image not found. Pulling from Docker Hub...${NC}" + singularity pull $POSTGRES_IMAGE docker://postgres:17 +fi + +# Create necessary directories +echo -e "${YELLOW}Creating directories...${NC}" +mkdir -p $DATA_DIR $RUN_DIR $LOG_DIR + +# Create INPUT_DIR if it doesn't exist +INPUT_DIR=${INPUT_DIR:-$(pwd)/data} +if [ ! -d "$INPUT_DIR" ]; then + echo -e "${YELLOW}Creating INPUT_DIR: $INPUT_DIR${NC}" + mkdir -p "$INPUT_DIR" +fi + +# Check if instance already exists +if singularity instance list | grep -q "$POSTGRES_INSTANCE"; then + echo -e "${YELLOW}PostgreSQL instance $POSTGRES_INSTANCE already exists${NC}" + # Check if it's responsive + if singularity exec instance://$POSTGRES_INSTANCE pg_isready -h localhost -U postgres > /dev/null 2>&1; then + echo -e "${GREEN}PostgreSQL instance is already running and responsive${NC}" + exit 0 + else + echo -e "${YELLOW}Instance exists but not responsive, stopping it...${NC}" + singularity instance stop $POSTGRES_INSTANCE + sleep 2 + fi +fi + +# Check if any other postgres processes are running +if pgrep -f "singularity.*postgres" > /dev/null; then + echo -e "${YELLOW}Other PostgreSQL processes detected, cleaning up...${NC}" + pkill -f "singularity.*postgres" || true + sleep 2 +fi + +# Set environment variables +export SINGULARITYENV_POSTGRES_PASSWORD=postgres +export SINGULARITYENV_POSTGRES_USER=postgres +export SINGULARITYENV_POSTGRES_DB=postgres + +# Prepare bind mounts - only bind if files/directories exist +BIND_MOUNTS="--bind $DATA_DIR:/var/lib/postgresql/data" +BIND_MOUNTS="$BIND_MOUNTS --bind $RUN_DIR:/var/run/postgresql" +BIND_MOUNTS="$BIND_MOUNTS --bind $LOG_DIR:/var/log" + +# Add optional bind mounts if they exist +if [ -f "./postgres/init-create-empty-databases.sh" ]; then + BIND_MOUNTS="$BIND_MOUNTS --bind ./postgres/init-create-empty-databases.sh:/docker-entrypoint-initdb.d/init-create-empty-databases.sh" +else + echo -e "${YELLOW}Warning: ./postgres/init-create-empty-databases.sh not found, skipping${NC}" +fi + +if [ -d "./settings/scripts" ]; then + BIND_MOUNTS="$BIND_MOUNTS --bind ./settings/scripts:/scripts" +else + echo -e "${YELLOW}Warning: ./settings/scripts 
directory not found, skipping${NC}" +fi + +# Always bind INPUT_DIR +BIND_MOUNTS="$BIND_MOUNTS --bind $INPUT_DIR:/data" + +if [ -f "./postgres_config/postgresql.conf" ]; then + BIND_MOUNTS="$BIND_MOUNTS --bind ./postgres_config/postgresql.conf:/etc/postgresql/postgresql.conf" +fi + +# Start PostgreSQL instance +echo -e "${YELLOW}Starting PostgreSQL instance...${NC}" +echo -e "${YELLOW}Using INPUT_DIR: $INPUT_DIR${NC}" + +singularity instance start $BIND_MOUNTS $POSTGRES_IMAGE $POSTGRES_INSTANCE + +# Start PostgreSQL inside the instance +echo -e "${YELLOW}Starting PostgreSQL server inside instance...${NC}" +singularity exec instance://$POSTGRES_INSTANCE bash -c "docker-entrypoint.sh postgres &" + +# Get the PID of the instance (optional, for compatibility) +INSTANCE_PID=$(pgrep -f "singularity.*$POSTGRES_INSTANCE" | head -1) +if [ -n "$INSTANCE_PID" ]; then + echo $INSTANCE_PID > postgres.pid +fi + +# Wait for PostgreSQL to be ready +echo -e "${YELLOW}Waiting for PostgreSQL to start...${NC}" +for i in {1..30}; do + if singularity exec instance://$POSTGRES_INSTANCE pg_isready -h localhost -U postgres > /dev/null 2>&1; then + echo -e "${GREEN}PostgreSQL is ready!${NC}" + echo -e "${GREEN}Connection: singularity exec instance://$POSTGRES_INSTANCE psql -h localhost -U postgres${NC}" + echo -e "${GREEN}Instance: $POSTGRES_INSTANCE${NC}" + exit 0 + fi + echo -n "." + sleep 2 +done + +echo -e "${RED}PostgreSQL failed to start within 60 seconds${NC}" +singularity instance stop $POSTGRES_INSTANCE 2>/dev/null || true +exit 1 \ No newline at end of file diff --git a/postgres-status.sh b/postgres-status.sh new file mode 100755 index 00000000..e3669616 --- /dev/null +++ b/postgres-status.sh @@ -0,0 +1,80 @@ +#!/bin/bash + +# PostgreSQL status script for Singularity + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +NC='\033[0m' # No Color + +echo -e "${YELLOW}PostgreSQL Status:${NC}" + +# Check multiple ways to detect if PostgreSQL is running +POSTGRES_RUNNING=false + +# Method 1: Check PID file +if [ -f postgres.pid ]; then + PID=$(cat postgres.pid) + if kill -0 $PID 2>/dev/null; then + echo -e "${GREEN}✓ PostgreSQL process is running (PID: $PID)${NC}" + POSTGRES_RUNNING=true + else + echo -e "${YELLOW}! PID file exists but process is not running${NC}" + rm postgres.pid + fi +fi + +# Method 2: Check if we can connect (most reliable test) +if [ ! -f postgres.sif ]; then + echo -e "${RED}✗ postgres.sif not found${NC}" + exit 1 +fi + +if singularity exec postgres.sif pg_isready -h localhost -U postgres > /dev/null 2>&1; then + echo -e "${GREEN}✓ PostgreSQL is accepting connections${NC}" + echo -e "${GREEN} Connection: singularity exec postgres.sif psql -h localhost -U postgres${NC}" + POSTGRES_RUNNING=true + + # Show database list + echo -e "${YELLOW}Databases:${NC}" + singularity exec postgres.sif psql -h localhost -U postgres -c "\l" 2>/dev/null | \ + grep -v template | grep -v "^-" | grep -v "^(" | grep -v "Name.*Owner" | \ + grep -v "^\s*$" | head -10 + + # Show PostgreSQL version + echo -e "${YELLOW}Version:${NC}" + singularity exec postgres.sif psql -h localhost -U postgres -c "SELECT version();" -t 2>/dev/null | head -1 + +else + echo -e "${RED}✗ PostgreSQL is not accepting connections${NC}" +fi + +# Method 3: Check process list as fallback +if [ "$POSTGRES_RUNNING" = false ]; then + # Check for any postgres-related processes with more flexible patterns + if pgrep -f "postgres" > /dev/null || pgrep -f "singularity.*postgres" > /dev/null; then + echo -e "${YELLOW}! 
Found postgres-related process but cannot connect${NC}" + echo -e "${YELLOW} Process list:${NC}" + ps aux | grep -E "(postgres|singularity)" | grep -v grep | head -5 + else + echo -e "${RED}✗ No PostgreSQL processes found${NC}" + fi +fi + +# Method 4: Check if port 5432 is listening +if ss -tlnp 2>/dev/null | grep -q ":5432 " || netstat -tlnp 2>/dev/null | grep -q ":5432 "; then + echo -e "${GREEN}✓ Port 5432 is listening${NC}" + POSTGRES_RUNNING=true +else + echo -e "${RED}✗ Port 5432 is not listening${NC}" +fi + +# Final status +if [ "$POSTGRES_RUNNING" = true ]; then + echo -e "${GREEN}Overall Status: PostgreSQL is running and accessible${NC}" + exit 0 +else + echo -e "${RED}Overall Status: PostgreSQL is not running or not accessible${NC}" + exit 1 +fi \ No newline at end of file diff --git a/postgres-stop.sh b/postgres-stop.sh new file mode 100755 index 00000000..729e8e25 --- /dev/null +++ b/postgres-stop.sh @@ -0,0 +1,57 @@ +#!/bin/bash + +# PostgreSQL shutdown script for Singularity + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +NC='\033[0m' # No Color + +echo -e "${YELLOW}Stopping PostgreSQL...${NC}" + +STOPPED=false + +# Method 1: Stop Singularity instance if it exists +if singularity instance list | grep -q "postgres_instance"; then + echo -e "${YELLOW}Stopping Singularity instance: postgres_instance${NC}" + singularity instance stop postgres_instance + STOPPED=true +fi + +# Method 2: Stop using PID file if it exists +if [ -f postgres.pid ]; then + PID=$(cat postgres.pid) + if kill -0 $PID 2>/dev/null; then + echo -e "${YELLOW}Stopping PostgreSQL process (PID: $PID)${NC}" + kill $PID + # Wait for graceful shutdown + for i in {1..10}; do + if ! kill -0 $PID 2>/dev/null; then + echo -e "${GREEN}PostgreSQL stopped gracefully${NC}" + STOPPED=true + break + fi + sleep 1 + done + # Force kill if still running + if kill -0 $PID 2>/dev/null; then + echo -e "${YELLOW}Force killing PostgreSQL${NC}" + kill -9 $PID + STOPPED=true + fi + fi + rm postgres.pid +fi + +# Method 3: Fallback - kill any singularity postgres processes +if pkill -f "singularity.*postgres"; then + echo -e "${YELLOW}Killed remaining PostgreSQL processes${NC}" + STOPPED=true +fi + +if [ "$STOPPED" = true ]; then + echo -e "${GREEN}PostgreSQL stopped${NC}" +else + echo -e "${YELLOW}No PostgreSQL instances were found running${NC}" +fi \ No newline at end of file From 96d300000e0d18f0f3901e6a2d1102f0377030e0 Mon Sep 17 00:00:00 2001 From: Lorenzo Guerra Date: Tue, 1 Jul 2025 11:24:13 +0200 Subject: [PATCH 02/33] made load_dumps.sh idempotent --- settings/scripts/load_dumps.sh | 56 ++++++++++++++++++++++++++++++++-- 1 file changed, 53 insertions(+), 3 deletions(-) diff --git a/settings/scripts/load_dumps.sh b/settings/scripts/load_dumps.sh index 7c56e036..b476e228 100755 --- a/settings/scripts/load_dumps.sh +++ b/settings/scripts/load_dumps.sh @@ -1,9 +1,59 @@ #!/bin/bash +set -e # Exit on any error + +echo "Starting database dump restoration..." + for dump_file in /data/*.dump; do + # Check if any dump files exist + if [ ! -f "$dump_file" ]; then + echo "No .dump files found in /data/ directory" + break + fi + db_name=$(basename "$dump_file" .dump) - + + echo "Processing $dump_file -> database '$db_name'" + + # Check if database already exists and has data + if psql -U postgres -h localhost -p 5432 -lqt | cut -d \| -f 1 | grep -qw "$db_name"; then + echo "Database '$db_name' already exists. Checking if it has data..." 
+ + # Count tables in the database + table_count=$(psql -U postgres -h localhost -p 5432 -d "$db_name" -t -c "SELECT COUNT(*) FROM information_schema.tables WHERE table_schema = 'public';" 2>/dev/null || echo "0") + + if [ "$table_count" -gt 0 ]; then + echo "✓ Database '$db_name' already has $table_count tables. Skipping restoration." + continue + else + echo "Database '$db_name' exists but is empty. Proceeding with restoration..." + fi + else + # Create database if it doesn't exist + echo "Creating database '$db_name'..." + psql -U postgres -h localhost -p 5432 -c "CREATE DATABASE \"$db_name\";" 2>/dev/null || { + echo "Warning: Could not create database '$db_name' (may already exist)" + } + fi + echo "Restoring $dump_file into database '$db_name'..." - - pg_restore -U postgres -h localhost -p 5432 -d "$db_name" "$dump_file" + + # Use --clean --if-exists to handle existing objects gracefully + if pg_restore -U postgres -h localhost -p 5432 --clean --if-exists --no-owner --no-privileges -d "$db_name" "$dump_file" 2>/dev/null; then + echo "✓ Successfully restored $dump_file" + + # Verify restoration + final_table_count=$(psql -U postgres -h localhost -p 5432 -d "$db_name" -t -c "SELECT COUNT(*) FROM information_schema.tables WHERE table_schema = 'public';" 2>/dev/null || echo "0") + echo " Database '$db_name' now has $final_table_count tables" + else + echo "✗ Warning: pg_restore reported errors for $dump_file (this may be normal for some dump formats)" + fi + + echo "" done + +echo "Database dump restoration completed!" + +# Show summary of all databases +echo "Summary of available databases:" +psql -U postgres -h localhost -p 5432 -c "\l" | grep -E "^\s+[a-zA-Z]" | head -20 From bf55281058ebd31a7841997db06cb792908baa5c Mon Sep 17 00:00:00 2001 From: Lorenzo Guerra Date: Tue, 1 Jul 2025 11:25:15 +0200 Subject: [PATCH 03/33] changed default artifact dir and postgres host to localhost (for cluster setup with conda and singularity) --- pidsmaker/config/pipeline.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pidsmaker/config/pipeline.py b/pidsmaker/config/pipeline.py index 7112751e..f51aafa3 100644 --- a/pidsmaker/config/pipeline.py +++ b/pidsmaker/config/pipeline.py @@ -24,13 +24,13 @@ Arg, ) -DEFAULT_ROOT_ARTIFACT_DIR = "/home/artifacts/" # Destination folder (in the container) for generated files. Will be created if doesn't exist. +DEFAULT_ROOT_ARTIFACT_DIR = "./artifacts/" # Destination folder (in the container) for generated files. Will be created if doesn't exist. 
ROOT_PROJECT_PATH = pathlib.Path(__file__).parent.parent.parent.resolve() ROOT_GROUND_TRUTH_DIR = os.path.join(ROOT_PROJECT_PATH, "Ground_Truth/") DATABASE_DEFAULT_CONFIG = { - "host": "postgres", # Host machine where the db is located + "host": "localhost", # Host machine where the db is located "user": "postgres", # Database user "password": "postgres", # The password to the database user "port": "5432", # The port number for Postgres From 8481ed8a20206cb2e10251e360f498bff1a095a7 Mon Sep 17 00:00:00 2001 From: Lorenzo Guerra Date: Tue, 1 Jul 2025 11:27:22 +0200 Subject: [PATCH 04/33] fixed error in config vals format --- pidsmaker/config/config.py | 170 ++++++++++++++++++++++++++++--------- 1 file changed, 130 insertions(+), 40 deletions(-) diff --git a/pidsmaker/config/config.py b/pidsmaker/config/config.py index 923cbeff..8c356c87 100644 --- a/pidsmaker/config/config.py +++ b/pidsmaker/config/config.py @@ -81,7 +81,11 @@ "E5-CADETS/node_Nginx_Drakon_APT_17.csv", ], "attack_to_time_window": [ - ["E5-CADETS/node_Nginx_Drakon_APT.csv", "2019-05-16 09:31:00", "2019-05-16 10:12:00"], + [ + "E5-CADETS/node_Nginx_Drakon_APT.csv", + "2019-05-16 09:31:00", + "2019-05-16 10:12:00", + ], [ "E5-CADETS/node_Nginx_Drakon_APT_17.csv", "2019-05-17 10:15:00", @@ -117,10 +121,22 @@ "E3-CADETS/node_Nginx_Backdoor_13.csv", ], "attack_to_time_window": [ - ["E3-CADETS/node_Nginx_Backdoor_06.csv", "2018-04-06 11:20:00", "2018-04-06 12:09:00"], + [ + "E3-CADETS/node_Nginx_Backdoor_06.csv", + "2018-04-06 11:20:00", + "2018-04-06 12:09:00", + ], # ["E3-CADETS/node_Nginx_Backdoor_11.csv" , '2018-04-11 15:07:00', '2018-04-11 15:16:00'], - ["E3-CADETS/node_Nginx_Backdoor_12.csv", "2018-04-12 13:59:00", "2018-04-12 14:39:00"], - ["E3-CADETS/node_Nginx_Backdoor_13.csv", "2018-04-13 09:03:00", "2018-04-13 09:16:00"], + [ + "E3-CADETS/node_Nginx_Backdoor_12.csv", + "2018-04-12 13:59:00", + "2018-04-12 14:39:00", + ], + [ + "E3-CADETS/node_Nginx_Backdoor_13.csv", + "2018-04-13 09:03:00", + "2018-04-13 09:16:00", + ], ], }, "CLEARSCOPE_E5": { @@ -131,7 +147,13 @@ "num_edge_types": 10, "year_month": "2019-05", "start_end_day_range": (8, 18), - "train_files": ["graph_8", "graph_9", "graph_10", "graph_11", "graph_12"], + "train_files": [ + "graph_8", + "graph_9", + "graph_10", + "graph_11", + "graph_12", + ], "val_files": ["graph_13"], "test_files": ["graph_14", "graph_15", "graph_17"], "unused_files": ["graph_16"], @@ -209,7 +231,11 @@ "h201/node_h201_0923.csv", ], "attack_to_time_window": [ - ["h201/node_h201_0923.csv", "2019-09-23 11:23:00", "2019-09-23 13:25:00"], + [ + "h201/node_h201_0923.csv", + "2019-09-23 11:23:00", + "2019-09-23 13:25:00", + ], ], }, "optc_h501": { @@ -228,7 +254,11 @@ "h501/node_h501_0924.csv", ], "attack_to_time_window": [ - ["h501/node_h501_0924.csv", "2019-09-24 10:28:00", "2019-09-24 15:29:00"], + [ + "h501/node_h501_0924.csv", + "2019-09-24 10:28:00", + "2019-09-24 15:29:00", + ], ], }, "optc_h051": { @@ -247,7 +277,11 @@ "h051/node_h051_0925.csv", ], "attack_to_time_window": [ - ["h051/node_h051_0925.csv", "2019-09-25 10:29:00", "2019-09-25 14:25:00"], + [ + "h051/node_h051_0925.csv", + "2019-09-25 10:29:00", + "2019-09-25 14:25:00", + ], ], }, } @@ -401,7 +435,8 @@ def __init__(self, type, vals: list = None, desc: str = None): output size matching the downstream objective (e.g. edge type prediction involves predicting 10 edge types, so the output of the decoder should be 10).", ), "src_dst_projection_coef": Arg( - int, desc="Multiplier of input neurons to project src and dst nodes." 
+ int, + desc="Multiplier of input neurons to project src and dst nodes.", ), }, "node_mlp": { @@ -437,7 +472,9 @@ def __init__(self, type, vals: list = None, desc: str = None): # Prediction-based "predict_edge_type": { "decoder": Arg( - str, vals=OR(list(DECODERS_CFG.keys())), desc="Decoder used before computing loss." + str, + vals=OR(list(DECODERS_CFG.keys())), + desc="Decoder used before computing loss.", ), **DECODERS_CFG, "balanced_loss": Arg(bool), @@ -445,7 +482,9 @@ def __init__(self, type, vals: list = None, desc: str = None): }, "predict_node_type": { "decoder": Arg( - str, vals=OR(list(DECODERS_CFG.keys())), desc="Decoder used before computing loss." + str, + vals=OR(list(DECODERS_CFG.keys())), + desc="Decoder used before computing loss.", ), **DECODERS_CFG, "balanced_loss": Arg(bool), @@ -453,20 +492,26 @@ def __init__(self, type, vals: list = None, desc: str = None): "predict_masked_struct": { "loss": Arg(str, vals=OR(PRED_LOSSES)), "decoder": Arg( - str, vals=OR(list(DECODERS_CFG.keys())), desc="Decoder used before computing loss." + str, + vals=OR(list(DECODERS_CFG.keys())), + desc="Decoder used before computing loss.", ), **DECODERS_CFG, "balanced_loss": Arg(bool), }, "detect_edge_few_shot": { "decoder": Arg( - str, vals=OR(list(DECODERS_CFG.keys())), desc="Decoder used before computing loss." + str, + vals=OR(list(DECODERS_CFG.keys())), + desc="Decoder used before computing loss.", ), **DECODERS_CFG, }, "predict_edge_contrastive": { "decoder": Arg( - str, vals=OR(list(DECODERS_CFG.keys())), desc="Decoder used before computing loss." + str, + vals=OR(list(DECODERS_CFG.keys())), + desc="Decoder used before computing loss.", ), **DECODERS_CFG, "inner_product": { @@ -477,21 +522,27 @@ def __init__(self, type, vals: list = None, desc: str = None): "reconstruct_node_features": { "loss": Arg(str, vals=OR(RECON_LOSSES)), "decoder": Arg( - str, vals=OR(list(DECODERS_CFG.keys())), desc="Decoder used before computing loss." + str, + vals=OR(list(DECODERS_CFG.keys())), + desc="Decoder used before computing loss.", ), **DECODERS_CFG, }, "reconstruct_node_embeddings": { "loss": Arg(str, vals=OR(RECON_LOSSES)), "decoder": Arg( - str, vals=OR(list(DECODERS_CFG.keys())), desc="Decoder used before computing loss." + str, + vals=OR(list(DECODERS_CFG.keys())), + desc="Decoder used before computing loss.", ), **DECODERS_CFG, }, "reconstruct_edge_embeddings": { "loss": Arg(str, vals=OR(RECON_LOSSES)), "decoder": Arg( - str, vals=OR(list(DECODERS_CFG.keys())), desc="Decoder used before computing loss." + str, + vals=OR(list(DECODERS_CFG.keys())), + desc="Decoder used before computing loss.", ), **DECODERS_CFG, }, @@ -499,7 +550,9 @@ def __init__(self, type, vals: list = None, desc: str = None): "loss": Arg(str, vals=OR(RECON_LOSSES)), "mask_rate": Arg(float), "decoder": Arg( - str, vals=OR(list(DECODERS_CFG.keys())), desc="Decoder used before computing loss." + str, + vals=OR(list(DECODERS_CFG.keys())), + desc="Decoder used before computing loss.", ), **DECODERS_CFG, }, @@ -514,14 +567,23 @@ def __init__(self, type, vals: list = None, desc: str = None): }, } -THRESHOLD_METHODS = ["max_val_loss", "mean_val_loss", "threatrace", "magic", "flash", "nodlink"] +THRESHOLD_METHODS = [ + "max_val_loss", + "mean_val_loss", + "threatrace", + "magic", + "flash", + "nodlink", +] # --- Tasks, subtasks, and argument configurations --- TASK_ARGS = { "preprocessing": { "build_graphs": { "used_method": Arg( - str, vals=OR(["default", "magic"]), desc="The method to build time window graphs." 
+ str, + vals=OR(["default", "magic"]), + desc="The method to build time window graphs.", ), "use_all_files": Arg(bool), "mimicry_edge_num": Arg(int), @@ -531,7 +593,8 @@ def __init__(self, type, vals: list = None, desc: str = None): ), "use_hashed_label": Arg(bool, desc="Whether to hash the textual features."), "fuse_edge": Arg( - bool, desc="Whether to fuse duplicate sequential edges into a single edge." + bool, + desc="Whether to fuse duplicate sequential edges into a single edge.", ), "node_label_features": { "subject": Arg( @@ -541,17 +604,15 @@ def __init__(self, type, vals: list = None, desc: str = None): ), "file": Arg( str, - vals=AND( - ["type", "path"], - desc="Which features use for file nodes. Features will be concatenated.", - ), + vals=AND(["type", "path"]), + desc="Which features use for file nodes. Features will be concatenated.", ), "netflow": Arg( str, vals=AND( ["type", "remote_ip", "remote_port"], - desc="Which features use for netflow nodes. Features will be concatenated.", ), + desc="Which features use for netflow nodes. Features will be concatenated.", ), }, "multi_dataset": Arg( @@ -582,7 +643,8 @@ def __init__(self, type, vals: list = None, desc: str = None): desc="Size of the text embedding. Arg not used by some featurization methods that do not build embeddings.", ), "epochs": Arg( - int, desc="Epochs to train the embedding method. Arg not used by some methods." + int, + desc="Epochs to train the embedding method. Arg not used by some methods.", ), "use_seed": Arg(bool), "training_split": Arg( @@ -621,13 +683,22 @@ def __init__(self, type, vals: list = None, desc: str = None): ), "edge_features": Arg( str, - vals=AND(["edge_type", "edge_type_triplet", "msg", "time_encoding", "none"]), + vals=AND( + [ + "edge_type", + "edge_type_triplet", + "msg", + "time_encoding", + "none", + ] + ), desc="Edge features to used during GNN training. `edge_type` refers to the system call type, `edge_type_triplet` \ considers a same edge type as a new type if source or destination node types are different, `msg` is the message vector \ used in the TGN, `time_encoding` encodes temporal order of events with their timestamps in the TGN, `none` uses no features.", ), "multi_dataset_training": Arg( - bool, desc="Whether the GNN should be trained on all datasets in `multi_dataset`." + bool, + desc="Whether the GNN should be trained on all datasets in `multi_dataset`.", ), "fix_buggy_graph_reindexer": Arg( bool, @@ -670,7 +741,8 @@ def __init__(self, type, vals: list = None, desc: str = None): }, "tgn_last_neighbor": { "tgn_neighbor_size": Arg( - int, desc="Number of last neighbors to store for each node." + int, + desc="Number of last neighbors to store for each node.", ), "tgn_neighbor_n_hop": Arg( int, @@ -711,22 +783,29 @@ def __init__(self, type, vals: list = None, desc: str = None): }, "gnn_training": { "use_seed": Arg(bool), - "deterministic": Arg(bool, desc="Whether to force PyTorch to use deterministic algorithms."), + "deterministic": Arg( + bool, + desc="Whether to force PyTorch to use deterministic algorithms.", + ), "num_epochs": Arg(int), "patience": Arg(int), "lr": Arg(float), "weight_decay": Arg(float), - "node_hid_dim": Arg(int, desc="Number of neurons in the middle layers of the encoder."), + "node_hid_dim": Arg( + int, + desc="Number of neurons in the middle layers of the encoder.", + ), "node_out_dim": Arg(int, desc="Number of neurons in the last layer of the encoder."), "grad_accumulation": Arg( - int, desc="Number of epochs to gather gradients before backprop." 
+ int, + desc="Number of epochs to gather gradients before backprop.", ), "inference_device": Arg( - str, vals=OR(["cpu", "cuda"]), desc="Device used during testing." - ), - "used_method": Arg( - str, vals=OR(["default"]), desc="Which training pipeline use." + str, + vals=OR(["cpu", "cuda"]), + desc="Device used during testing.", ), + "used_method": Arg(str, vals=OR(["default"]), desc="Which training pipeline use."), "encoder": { "dropout": Arg(float), "used_methods": Arg( @@ -782,10 +861,12 @@ def __init__(self, type, vals: list = None, desc: str = None): desc="Whether to consider the loss of destination nodes when computing the node-level scores (maximum loss of a node).", ), "use_kmeans": Arg( - bool, desc="Whether to cluster nodes after thresholding as done in Orthrus" + bool, + desc="Whether to cluster nodes after thresholding as done in Orthrus", ), "kmeans_top_K": Arg( - int, desc="Number of top-score nodes selected before clustering." + int, + desc="Number of top-score nodes selected before clustering.", ), }, "tw_evaluation": { @@ -841,7 +922,16 @@ def __init__(self, type, vals: list = None, desc: str = None): ), "depimpact": { "used_method": Arg( - str, vals=OR(["component", "shortest_path", "1-hop", "2-hop", "3-hop"]) + str, + vals=OR( + [ + "component", + "shortest_path", + "1-hop", + "2-hop", + "3-hop", + ] + ), ), "score_method": Arg(str, vals=OR(["degree", "recon_loss", "degree_recon"])), "workers": Arg(int), From e25789f85dace9337fd07e097fe18edbceb0ce3f Mon Sep 17 00:00:00 2001 From: Lorenzo Guerra Date: Tue, 1 Jul 2025 15:30:50 +0200 Subject: [PATCH 05/33] minor formatting changes --- Makefile | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/Makefile b/Makefile index cc32c1b5..bb8ab71c 100644 --- a/Makefile +++ b/Makefile @@ -52,13 +52,13 @@ full-setup: up load-dumps help: @echo "Available commands:" - @echo " postgres-up - Start PostgreSQL" - @echo " postgres-down - Stop PostgreSQL" - @echo " postgres-status - Check PostgreSQL status" - @echo " postgres-logs - Show PostgreSQL logs" - @echo " postgres-load-dumps - Load database dumps" - @echo " postgres-full-setup - Start PostgreSQL and load dumps" - @echo " postgres-clean - Stop and remove all data" - @echo " postgres-reset - Clean and restart PostgreSQL" - @echo " app-build - Build PIDSMaker container" - @echo " app-run - Run PIDSMaker with PostgreSQL" + @echo " up - Start PostgreSQL" + @echo " down - Stop PostgreSQL" + @echo " status - Check PostgreSQL status" + @echo " logs - Show PostgreSQL logs" + @echo " load-dumps - Load database dumps" + @echo " full-setup - Start PostgreSQL and load dumps" + @echo " clean - Stop and remove all data" + @echo " reset - Clean and restart PostgreSQL" + @echo " app-build - Build PIDSMaker container" + @echo " app-run - Run PIDSMaker with PostgreSQL" From e6e07ff7af07c3c06a4e8099552607c1b39c0dab Mon Sep 17 00:00:00 2001 From: Lorenzo Guerra Date: Wed, 23 Jul 2025 10:38:10 +0200 Subject: [PATCH 06/33] throw errors instead of continuing without any warning --- .../evaluation_methods/evaluation_utils.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/pidsmaker/detection/evaluation_methods/evaluation_utils.py b/pidsmaker/detection/evaluation_methods/evaluation_utils.py index af055268..a83522ef 100644 --- a/pidsmaker/detection/evaluation_methods/evaluation_utils.py +++ b/pidsmaker/detection/evaluation_methods/evaluation_utils.py @@ -36,11 +36,9 @@ def classifier_evaluation(y_test, y_test_pred, scores): - 
labels_exist = sum(y_test) > 0 - if labels_exist: - tn, fp, fn, tp = confusion_matrix(y_test, y_test_pred).ravel() - else: - tn, fp, fn, tp = 1, 1, 1, 1 # only to not break tests + if not sum(y_test) > 0: + raise ValueError("Cannot evaluate: no positive labels in test set") + tn, fp, fn, tp = confusion_matrix(y_test, y_test_pred).ravel() eps = 1e-12 fpr = fp / (fp + tn + eps) @@ -51,15 +49,18 @@ def classifier_evaluation(y_test, y_test_pred, scores): try: auc_val = roc_auc_score(y_test, scores) - except: + except ValueError as e: + log(f"WARNING: AUC calculation failed: {e}") auc_val = float("nan") try: ap = ap_score(y_test, scores) - except: + except ValueError as e: + log(f"WARNING: AP calculation failed: {e}") ap = float("nan") try: balanced_acc = balanced_accuracy_score(y_test, y_test_pred) - except: + except ValueError as e: + log(f"WARNING: Balanced ACC calculation failed: {e}") balanced_acc = float("nan") sensitivity = tp / (tp + fn + eps) From fd11e5a4bd68f69ae3b8901da4b1d79da720ae88 Mon Sep 17 00:00:00 2001 From: Lorenzo Guerra Date: Wed, 23 Jul 2025 10:39:26 +0200 Subject: [PATCH 07/33] fixed 'stats["percent_detected_attacks"]' calculation --- .../evaluation_methods/node_evaluation.py | 135 +++++++++++++----- 1 file changed, 103 insertions(+), 32 deletions(-) diff --git a/pidsmaker/detection/evaluation_methods/node_evaluation.py b/pidsmaker/detection/evaluation_methods/node_evaluation.py index 665d5e03..a8b85702 100644 --- a/pidsmaker/detection/evaluation_methods/node_evaluation.py +++ b/pidsmaker/detection/evaluation_methods/node_evaluation.py @@ -37,7 +37,7 @@ def get_node_predictions(val_tw_path, test_tw_path, cfg, **kwargs): log(f"Loading data from {test_tw_path}...") threshold_method = cfg.detection.evaluation.node_evaluation.threshold_method - if threshold_method == "magic": + if threshold_method == "magic": # data leaking by using test data thr = get_threshold(test_tw_path, threshold_method) else: thr = get_threshold(val_tw_path, threshold_method) @@ -48,7 +48,9 @@ def get_node_predictions(val_tw_path, test_tw_path, cfg, **kwargs): node_to_max_loss = defaultdict(int) filelist = listdir_sorted(test_tw_path) - for tw, file in enumerate(log_tqdm(sorted(filelist), desc="Compute labels")): + for tw, file in enumerate( + log_tqdm(sorted(filelist), desc="Compute labels") + ): file = os.path.join(test_tw_path, file) df = pd.read_csv(file).to_dict(orient="records") for line in df: @@ -72,7 +74,9 @@ def get_node_predictions(val_tw_path, test_tw_path, cfg, **kwargs): # For plotting the scores of seen and unseen nodes graph_dir = cfg.preprocessing.transformation._graphs_dir - train_set_paths = get_all_files_from_folders(graph_dir, cfg.dataset.train_files) + train_set_paths = get_all_files_from_folders( + graph_dir, cfg.dataset.train_files + ) train_node_set = set() for train_path in train_set_paths: @@ -87,7 +91,9 @@ def get_node_predictions(val_tw_path, test_tw_path, cfg, **kwargs): ) results[node_id]["score"] = pred_score - results[node_id]["tw_with_max_loss"] = node_to_max_loss_tw.get(node_id, -1) + results[node_id]["tw_with_max_loss"] = node_to_max_loss_tw.get( + node_id, -1 + ) results[node_id]["y_true"] = int(node_id in ground_truth_nids) results[node_id]["is_seen"] = int(str(node_id) in train_node_set) @@ -98,7 +104,8 @@ def get_node_predictions(val_tw_path, test_tw_path, cfg, **kwargs): if use_kmeans: results = compute_kmeans_labels( - results, topk_K=cfg.detection.evaluation.node_evaluation.kmeans_top_K + results, + 
topk_K=cfg.detection.evaluation.node_evaluation.kmeans_top_K, ) return results, thr @@ -119,7 +126,9 @@ def get_node_predictions_node_level(val_tw_path, test_tw_path, cfg, **kwargs): node_to_max_loss = defaultdict(int) filelist = listdir_sorted(test_tw_path) - for tw, file in enumerate(log_tqdm(sorted(filelist), desc="Compute labels")): + for tw, file in enumerate( + log_tqdm(sorted(filelist), desc="Compute labels") + ): file = os.path.join(test_tw_path, file) df = pd.read_csv(file).to_dict(orient="records") for line in df: @@ -130,9 +139,13 @@ def get_node_predictions_node_level(val_tw_path, test_tw_path, cfg, **kwargs): node_to_values[node]["tw"].append(tw) if "threatrace_score" in line: - node_to_values[node]["threatrace_score"].append(line["threatrace_score"]) + node_to_values[node]["threatrace_score"].append( + line["threatrace_score"] + ) if "correct_pred" in line: - node_to_values[node]["correct_pred"].append(line["correct_pred"]) + node_to_values[node]["correct_pred"].append( + line["correct_pred"] + ) if "flash_score" in line: node_to_values[node]["flash_score"].append(line["flash_score"]) if "magic_score" in line: @@ -144,7 +157,9 @@ def get_node_predictions_node_level(val_tw_path, test_tw_path, cfg, **kwargs): # For plotting the scores of seen and unseen nodes graph_dir = cfg.preprocessing.transformation._graphs_dir - train_set_paths = get_all_files_from_folders(graph_dir, cfg.dataset.train_files) + train_set_paths = get_all_files_from_folders( + graph_dir, cfg.dataset.train_files + ) train_node_set = set() for train_path in train_set_paths: @@ -157,7 +172,10 @@ def get_node_predictions_node_level(val_tw_path, test_tw_path, cfg, **kwargs): threatrace_label = 0 flash_label = 0 detected_tw = None - if cfg.detection.evaluation.node_evaluation.threshold_method == "threatrace": + if ( + cfg.detection.evaluation.node_evaluation.threshold_method + == "threatrace" + ): max_score = 0 pred_score = max(losses["threatrace_score"]) @@ -169,7 +187,9 @@ def get_node_predictions_node_level(val_tw_path, test_tw_path, cfg, **kwargs): max_score = score detected_tw = tw - elif cfg.detection.evaluation.node_evaluation.threshold_method == "flash": + elif ( + cfg.detection.evaluation.node_evaluation.threshold_method == "flash" + ): max_score = 0 pred_score = max(losses["flash_score"]) @@ -181,7 +201,9 @@ def get_node_predictions_node_level(val_tw_path, test_tw_path, cfg, **kwargs): max_score = score detected_tw = tw - elif cfg.detection.evaluation.node_evaluation.threshold_method == "magic": + elif ( + cfg.detection.evaluation.node_evaluation.threshold_method == "magic" + ): max_score = 0 pred_score = max(losses["magic_score"]) @@ -193,11 +215,14 @@ def get_node_predictions_node_level(val_tw_path, test_tw_path, cfg, **kwargs): else: pred_score = reduce_losses_to_score( - losses["loss"], cfg.detection.evaluation.node_evaluation.threshold_method + losses["loss"], + cfg.detection.evaluation.node_evaluation.threshold_method, ) results[node_id]["score"] = pred_score - results[node_id]["tw_with_max_loss"] = node_to_max_loss_tw.get(node_id, -1) + results[node_id]["tw_with_max_loss"] = node_to_max_loss_tw.get( + node_id, -1 + ) results[node_id]["y_true"] = int(node_id in ground_truth_nids) results[node_id]["is_seen"] = int(str(node_id) in train_node_set) @@ -205,7 +230,8 @@ def get_node_predictions_node_level(val_tw_path, test_tw_path, cfg, **kwargs): detected_tw = detected_tw or node_to_max_loss_tw.get(node_id, None) if detected_tw is not None: results[node_id]["time_range"] = [ - 
datetime_to_ns_time_US_handle_nano(tw) for tw in filelist[detected_tw].split("~") + datetime_to_ns_time_US_handle_nano(tw) + for tw in filelist[detected_tw].split("~") ] else: results[node_id]["time_range"] = None @@ -213,16 +239,23 @@ def get_node_predictions_node_level(val_tw_path, test_tw_path, cfg, **kwargs): if use_kmeans: # in this mode, we add the label after results[node_id]["y_hat"] = 0 else: - if cfg.detection.evaluation.node_evaluation.threshold_method == "threatrace": + if ( + cfg.detection.evaluation.node_evaluation.threshold_method + == "threatrace" + ): results[node_id]["y_hat"] = threatrace_label - elif cfg.detection.evaluation.node_evaluation.threshold_method == "flash": + elif ( + cfg.detection.evaluation.node_evaluation.threshold_method + == "flash" + ): results[node_id]["y_hat"] = flash_label else: results[node_id]["y_hat"] = int(pred_score > thr) if use_kmeans: results = compute_kmeans_labels( - results, topk_K=cfg.detection.evaluation.node_evaluation.kmeans_top_K + results, + topk_K=cfg.detection.evaluation.node_evaluation.kmeans_top_K, ) return results, thr @@ -230,7 +263,11 @@ def get_node_predictions_node_level(val_tw_path, test_tw_path, cfg, **kwargs): def analyze_false_positives( y_truth, y_preds, pred_scores, max_val_loss_tw, nodes, tw_to_malicious_nodes ): - fp_indices = [i for i, (true, pred) in enumerate(zip(y_truth, y_preds)) if pred and not true] + fp_indices = [ + i + for i, (true, pred) in enumerate(zip(y_truth, y_preds)) + if pred and not true + ] malicious_tws = set(tw_to_malicious_nodes.keys()) num_fps_in_malicious_tw = 0 @@ -239,22 +276,35 @@ def analyze_false_positives( num_fps_in_malicious_tw += int(is_in_malicious_tw) fp_in_malicious_tw_ratio = ( - num_fps_in_malicious_tw / len(fp_indices) if len(fp_indices) > 0 else float("nan") + num_fps_in_malicious_tw / len(fp_indices) + if len(fp_indices) > 0 + else float("nan") ) return fp_in_malicious_tw_ratio -def main(val_tw_path, test_tw_path, model_epoch_dir, cfg, tw_to_malicious_nodes, **kwargs): +def main( + val_tw_path, + test_tw_path, + model_epoch_dir, + cfg, + tw_to_malicious_nodes, + **kwargs, +): if cfg._is_node_level: get_preds_fn = get_node_predictions_node_level else: get_preds_fn = get_node_predictions - results, thr = get_preds_fn(cfg=cfg, val_tw_path=val_tw_path, test_tw_path=test_tw_path) + results, thr = get_preds_fn( + cfg=cfg, val_tw_path=val_tw_path, test_tw_path=test_tw_path + ) # save results for future checking os.makedirs(cfg.detection.evaluation._results_dir, exist_ok=True) - results_save_dir = os.path.join(cfg.detection.evaluation._results_dir, "results.pth") + results_save_dir = os.path.join( + cfg.detection.evaluation._results_dir, "results.pth" + ) torch.save(results, results_save_dir) log(f"Resutls saved to {results_save_dir}") @@ -268,9 +318,15 @@ def main(val_tw_path, test_tw_path, model_epoch_dir, cfg, tw_to_malicious_nodes, ) # average detection precision scores_img_file = os.path.join(out_dir, f"scores_{model_epoch_dir}.png") # simple_scores_img_file = os.path.join(out_dir, f"simple_scores_{model_epoch_dir}.png") - neat_scores_img_file = os.path.join(out_dir, f"neat_scores_{model_epoch_dir}.svg") - seen_score_img_file = os.path.join(out_dir, f"seen_score_{model_epoch_dir}.png") - discrim_img_file = os.path.join(out_dir, f"discrim_curve_{model_epoch_dir}.png") + neat_scores_img_file = os.path.join( + out_dir, f"neat_scores_{model_epoch_dir}.svg" + ) + seen_score_img_file = os.path.join( + out_dir, f"seen_score_{model_epoch_dir}.png" + ) + discrim_img_file = os.path.join( 
+ out_dir, f"discrim_curve_{model_epoch_dir}.png" + ) attack_to_GPs = get_GP_of_each_attack(cfg) attack_to_TPs = defaultdict(int) @@ -314,9 +370,13 @@ def main(val_tw_path, test_tw_path, model_epoch_dir, cfg, tw_to_malicious_nodes, adp_score = plot_detected_attacks_vs_precision( pred_scores, nodes, node2attacks, y_truth, adp_img_file ) - discrim_scores = compute_discrimination_score(pred_scores, nodes, node2attacks, y_truth) + discrim_scores = compute_discrimination_score( + pred_scores, nodes, node2attacks, y_truth + ) plot_discrimination_metric(pred_scores, y_truth, discrim_img_file) - discrim_tp = compute_discrimination_tp(pred_scores, nodes, node2attacks, y_truth) + discrim_tp = compute_discrimination_tp( + pred_scores, nodes, node2attacks, y_truth + ) # plot_simple_scores(pred_scores, y_truth, simple_scores_img_file) plot_scores_with_paths_node_level( pred_scores, @@ -329,12 +389,19 @@ def main(val_tw_path, test_tw_path, model_epoch_dir, cfg, tw_to_malicious_nodes, cfg, thr, ) - plot_scores_neat(pred_scores, y_truth, nodes, node2attacks, neat_scores_img_file, thr) + plot_scores_neat( + pred_scores, y_truth, nodes, node2attacks, neat_scores_img_file, thr + ) # plot_score_seen(pred_scores, is_seen, seen_score_img_file) stats = classifier_evaluation(y_truth, y_preds, pred_scores) fp_in_malicious_tw_ratio = analyze_false_positives( - y_truth, y_preds, pred_scores, max_val_loss_tw, nodes, tw_to_malicious_nodes + y_truth, + y_preds, + pred_scores, + max_val_loss_tw, + nodes, + tw_to_malicious_nodes, ) stats["fp_in_malicious_tw_ratio"] = round(fp_in_malicious_tw_ratio, 3) @@ -345,7 +412,9 @@ def main(val_tw_path, test_tw_path, model_epoch_dir, cfg, tw_to_malicious_nodes, tps_in_atts.append((att, tps)) stats["percent_detected_attacks"] = ( - round(len(attack_to_GPs) / len(attack_to_TPs), 2) if len(attack_to_TPs) > 0 else 0 + round(len(attack_to_TPs) / len(attack_to_GPs), 2) + if len(attack_to_GPs) > 0 + else 0 ) fps, tps, precision, recall = get_metrics_if_all_attacks_detected( @@ -361,7 +430,9 @@ def main(val_tw_path, test_tw_path, model_epoch_dir, cfg, tw_to_malicious_nodes, for k, v in discrim_scores.items(): stats[k] = round(v, 4) - attack2tps = get_detected_tps_node_level(pred_scores, nodes, node2attacks, y_truth, cfg) + attack2tps = get_detected_tps_node_level( + pred_scores, nodes, node2attacks, y_truth, cfg + ) for attack, detected_tps in attack2tps.items(): stats[f"tps_{attack}"] = str(detected_tps) From 4a88a32c8c3c710c1651fc184800a998870ad153 Mon Sep 17 00:00:00 2001 From: Lorenzo Guerra Date: Wed, 23 Jul 2025 10:40:09 +0200 Subject: [PATCH 08/33] window size in ns not in sec --- .../preprocessing/build_graph_methods/build_default_graphs.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pidsmaker/preprocessing/build_graph_methods/build_default_graphs.py b/pidsmaker/preprocessing/build_graph_methods/build_default_graphs.py index 06dae428..784d1c42 100644 --- a/pidsmaker/preprocessing/build_graph_methods/build_default_graphs.py +++ b/pidsmaker/preprocessing/build_graph_methods/build_default_graphs.py @@ -262,7 +262,7 @@ def get_batches(arr, batch_size): start_time = events_list[0][-2] temp_list = [] BATCH = 1024 - window_size_in_sec = cfg.preprocessing.build_graphs.time_window_size * 60_000_000_000 + window_size_in_ns = cfg.preprocessing.build_graphs.time_window_size * 60_000_000_000 last_batch = False for batch_edges in get_batches(events_list, BATCH): @@ -272,7 +272,7 @@ def get_batches(arr, batch_size): if (len(batch_edges) < BATCH) or (temp_list[-1] == 
events_list[-1]): last_batch = True - if (batch_edges[-1][-2] > start_time + window_size_in_sec) or last_batch: + if (batch_edges[-1][-2] > start_time + window_size_in_ns) or last_batch: time_interval = ( ns_time_to_datetime_US(start_time) + "~" From 5f9d7497b121fb03336258c20f28fb8efd60f84c Mon Sep 17 00:00:00 2001 From: Lorenzo Guerra Date: Wed, 30 Jul 2025 15:33:12 +0200 Subject: [PATCH 09/33] updated detected_attacks when new attack is detected --- pidsmaker/detection/evaluation_methods/evaluation_utils.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/pidsmaker/detection/evaluation_methods/evaluation_utils.py b/pidsmaker/detection/evaluation_methods/evaluation_utils.py index a83522ef..947a0f86 100644 --- a/pidsmaker/detection/evaluation_methods/evaluation_utils.py +++ b/pidsmaker/detection/evaluation_methods/evaluation_utils.py @@ -660,13 +660,12 @@ def plot_detected_attacks_vs_precision(scores, nodes, node2attacks, labels, out_ # Update tp and fp based on label if sorted_labels[i] == 1: tp += 1 + # Update detected attacks set if node has associated attacks + if node in node2attacks: + detected_attacks.update(node2attacks[node]) else: fp += 1 - # Update detected attacks set if node has associated attacks - if node in node2attacks: - detected_attacks.update(node2attacks[node]) - # Calculate precision and detected attacks percentage precision = tp / (tp + fp) detected_attacks_percentage = (len(detected_attacks) / total_attacks) * 100 From a89ad7b2af3e05b927f0d6dc52076892913622f1 Mon Sep 17 00:00:00 2001 From: Lorenzo Guerra Date: Mon, 4 Aug 2025 16:03:49 +0200 Subject: [PATCH 10/33] use specific command line arguments to configure the database --- compose-postgres.yml | 2 +- docs/docs/create-db-from-scratch.md | 10 +++++++--- pidsmaker/config/pipeline.py | 30 +++++++++++++++++------------ 3 files changed, 26 insertions(+), 16 deletions(-) diff --git a/compose-postgres.yml b/compose-postgres.yml index 5553d5ef..fe074d44 100644 --- a/compose-postgres.yml +++ b/compose-postgres.yml @@ -12,7 +12,7 @@ services: volumes: - postgres_data:/var/lib/postgresql/data - ./postgres/init-create-empty-databases.sh:/docker-entrypoint-initdb.d/init-create-empty-databases.sh - - ./settings/scripts:/scripts + - ./scripts:/scripts - ${INPUT_DIR:-/data}:/data healthcheck: test: ["CMD-SHELL", "pg_isready -U postgres"] diff --git a/docs/docs/create-db-from-scratch.md b/docs/docs/create-db-from-scratch.md index 7bfb6bd3..d17e8da1 100644 --- a/docs/docs/create-db-from-scratch.md +++ b/docs/docs/create-db-from-scratch.md @@ -7,7 +7,7 @@ You can download all required files directly by running: pip install gdown ``` ```shell -./settings/scripts/download_{dataset}.sh {data_folder} +./scripts/download_{dataset}.sh {data_folder} ``` where `{dataset}` can be either `clearscope_e3`, `cadets_e3`, `theia_e3`, `clearscope_e5`, `cadets_e5` or `theia_e5` and `{data_folder}` is the absolute path to the output folder where all raw files will be downloaded. @@ -26,14 +26,18 @@ sudo docker compose exec pids bash 4. Convert the DARPA files ```shell -./settings/scripts/uncompress_darpa_files.sh /data/ +./scripts/uncompress_darpa_files.sh /data/ ``` > [!NOTE] > This may take multiple hours depending on the dataset. ### Optional configurations -- optionally, if using a specific postgres database instead of the postgres docker, update the connection config by setting `DATABASE_DEFAULT_CONFIG` within `pidsmaker/config.py`. 
+- optionally, if using a specific postgres database instead of the postgres docker, pass the details as command line arguments to the python scripts + - `--database_host`: the host machine where the database is located (default: `postgres`) + - `--database_user`: the database user to connect to the database (default: `postgres`) + - `--database_password`: the password for the database user (default: `postgres`) + - `--database_port`: the port number for Postgres (default: `5432`) - optionaly, if you want to change the output folder where generated files are stored, update accordingly the volume by uncommenting `./artifacts:/home/artifacts` in `compose.yml`. diff --git a/pidsmaker/config/pipeline.py b/pidsmaker/config/pipeline.py index f51aafa3..6e646c0f 100644 --- a/pidsmaker/config/pipeline.py +++ b/pidsmaker/config/pipeline.py @@ -24,17 +24,9 @@ Arg, ) -DEFAULT_ROOT_ARTIFACT_DIR = "./artifacts/" # Destination folder (in the container) for generated files. Will be created if doesn't exist. ROOT_PROJECT_PATH = pathlib.Path(__file__).parent.parent.parent.resolve() ROOT_GROUND_TRUTH_DIR = os.path.join(ROOT_PROJECT_PATH, "Ground_Truth/") - -DATABASE_DEFAULT_CONFIG = { - "host": "localhost", # Host machine where the db is located - "user": "postgres", # Database user - "password": "postgres", # The password to the database user - "port": "5432", # The port number for Postgres -} # ================================================================================ @@ -43,7 +35,7 @@ def get_default_cfg(args): Inits the shared cfg object with default configurations. """ cfg = CN() - cfg._artifact_dir = args.artifact_dir_in_container or DEFAULT_ROOT_ARTIFACT_DIR + cfg._artifact_dir = args.artifact_dir cfg._test_mode = args.test_mode cfg._debug = not args.wandb @@ -64,8 +56,10 @@ def get_default_cfg(args): # Database: we simply create variables for all configurations described in the dict cfg.database = CN() - for attr, value in DATABASE_DEFAULT_CONFIG.items(): - setattr(cfg.database, attr, value) + cfg.database.host = args.database_host + cfg.database.user = args.database_user + cfg.database.password = args.database_password + cfg.database.port = args.database_port # Dataset: we simply create variables for all configurations described in the dict set_dataset_cfg(cfg, args.dataset) @@ -139,9 +133,21 @@ def get_runtime_required_args(return_unknown_args=False, args=None): parser.add_argument( "--tuning_file_path", default="", help="If set, use the given YML path for tuning" ) + parser.add_argument( + "--database_host", default="postgres", help="Host machine where the db is located" + ) + parser.add_argument( + "--database_user", default="postgres", help="Database user to connect to the database" + ) + parser.add_argument( + "--database_password", default="postgres", help="The password to the database user" + ) + parser.add_argument( + "--database_port", default="5432", help="The port number for Postgres (default: 5432)" + ) parser.add_argument("--sweep_id", default="", help="ID of a wandb sweep for multi-agent runs") parser.add_argument( - "--artifact_dir_in_container", default="", help="ID of a wandb sweep for multi-agent runs" + "--artifact_dir", default="./artifacts/", help="Destination folder for generated files" ) parser.add_argument( "--test_mode", From 0ad5a09c746d1a420ec13692c5c57188eab9d756 Mon Sep 17 00:00:00 2001 From: Lorenzo Guerra Date: Mon, 4 Aug 2025 16:08:17 +0200 Subject: [PATCH 11/33] addition of scripts to setup a Singularity postgres container for cluster environments. 
Moving all scripts to the ./scripts/ folder and deletion of ./settings/scripts/ folder --- Makefile => scripts/Makefile | 4 ++-- {settings/scripts => scripts}/create_database.sh | 0 {settings/scripts => scripts}/download_cadets_e3.sh | 0 {settings/scripts => scripts}/download_cadets_e5.sh | 0 {settings/scripts => scripts}/download_clearscope_e3.sh | 0 {settings/scripts => scripts}/download_clearscope_e5.sh | 0 {settings/scripts => scripts}/download_theia_e3.sh | 0 {settings/scripts => scripts}/download_theia_e5.sh | 0 {settings/scripts => scripts}/e3_tools.sh | 0 {settings/scripts => scripts}/load_dumps.sh | 0 postgres-start.sh => scripts/postgres-start.sh | 6 +++--- postgres-status.sh => scripts/postgres-status.sh | 0 postgres-stop.sh => scripts/postgres-stop.sh | 0 {settings/scripts => scripts}/uncompress_darpa_files.sh | 0 14 files changed, 5 insertions(+), 5 deletions(-) rename Makefile => scripts/Makefile (93%) rename {settings/scripts => scripts}/create_database.sh (100%) rename {settings/scripts => scripts}/download_cadets_e3.sh (100%) rename {settings/scripts => scripts}/download_cadets_e5.sh (100%) rename {settings/scripts => scripts}/download_clearscope_e3.sh (100%) rename {settings/scripts => scripts}/download_clearscope_e5.sh (100%) rename {settings/scripts => scripts}/download_theia_e3.sh (100%) rename {settings/scripts => scripts}/download_theia_e5.sh (100%) rename {settings/scripts => scripts}/e3_tools.sh (100%) rename {settings/scripts => scripts}/load_dumps.sh (100%) rename postgres-start.sh => scripts/postgres-start.sh (95%) rename postgres-status.sh => scripts/postgres-status.sh (100%) rename postgres-stop.sh => scripts/postgres-stop.sh (100%) rename {settings/scripts => scripts}/uncompress_darpa_files.sh (100%) diff --git a/Makefile b/scripts/Makefile similarity index 93% rename from Makefile rename to scripts/Makefile index bb8ab71c..b395519b 100644 --- a/Makefile +++ b/scripts/Makefile @@ -39,11 +39,11 @@ app-run: up load-dumps: up @echo "Loading database dumps from inside container..." 
- @if [ -f "./settings/scripts/load_dumps.sh" ]; then \ + @if [ -f "./load_dumps.sh" ]; then \ echo "Found load_dumps.sh, executing inside container..."; \ singularity exec instance://postgres_instance /scripts/load_dumps.sh; \ else \ - echo "Error: ./settings/scripts/load_dumps.sh not found"; \ + echo "Error: ./load_dumps.sh not found"; \ exit 1; \ fi diff --git a/settings/scripts/create_database.sh b/scripts/create_database.sh similarity index 100% rename from settings/scripts/create_database.sh rename to scripts/create_database.sh diff --git a/settings/scripts/download_cadets_e3.sh b/scripts/download_cadets_e3.sh similarity index 100% rename from settings/scripts/download_cadets_e3.sh rename to scripts/download_cadets_e3.sh diff --git a/settings/scripts/download_cadets_e5.sh b/scripts/download_cadets_e5.sh similarity index 100% rename from settings/scripts/download_cadets_e5.sh rename to scripts/download_cadets_e5.sh diff --git a/settings/scripts/download_clearscope_e3.sh b/scripts/download_clearscope_e3.sh similarity index 100% rename from settings/scripts/download_clearscope_e3.sh rename to scripts/download_clearscope_e3.sh diff --git a/settings/scripts/download_clearscope_e5.sh b/scripts/download_clearscope_e5.sh similarity index 100% rename from settings/scripts/download_clearscope_e5.sh rename to scripts/download_clearscope_e5.sh diff --git a/settings/scripts/download_theia_e3.sh b/scripts/download_theia_e3.sh similarity index 100% rename from settings/scripts/download_theia_e3.sh rename to scripts/download_theia_e3.sh diff --git a/settings/scripts/download_theia_e5.sh b/scripts/download_theia_e5.sh similarity index 100% rename from settings/scripts/download_theia_e5.sh rename to scripts/download_theia_e5.sh diff --git a/settings/scripts/e3_tools.sh b/scripts/e3_tools.sh similarity index 100% rename from settings/scripts/e3_tools.sh rename to scripts/e3_tools.sh diff --git a/settings/scripts/load_dumps.sh b/scripts/load_dumps.sh similarity index 100% rename from settings/scripts/load_dumps.sh rename to scripts/load_dumps.sh diff --git a/postgres-start.sh b/scripts/postgres-start.sh similarity index 95% rename from postgres-start.sh rename to scripts/postgres-start.sh index 1a5eb78b..7d00cd8e 100755 --- a/postgres-start.sh +++ b/scripts/postgres-start.sh @@ -74,10 +74,10 @@ else echo -e "${YELLOW}Warning: ./postgres/init-create-empty-databases.sh not found, skipping${NC}" fi -if [ -d "./settings/scripts" ]; then - BIND_MOUNTS="$BIND_MOUNTS --bind ./settings/scripts:/scripts" +if [ -d "./scripts" ]; then + BIND_MOUNTS="$BIND_MOUNTS --bind ./scripts:/scripts" else - echo -e "${YELLOW}Warning: ./settings/scripts directory not found, skipping${NC}" + echo -e "${YELLOW}Warning: ./scripts directory not found, skipping${NC}" fi # Always bind INPUT_DIR diff --git a/postgres-status.sh b/scripts/postgres-status.sh similarity index 100% rename from postgres-status.sh rename to scripts/postgres-status.sh diff --git a/postgres-stop.sh b/scripts/postgres-stop.sh similarity index 100% rename from postgres-stop.sh rename to scripts/postgres-stop.sh diff --git a/settings/scripts/uncompress_darpa_files.sh b/scripts/uncompress_darpa_files.sh similarity index 100% rename from settings/scripts/uncompress_darpa_files.sh rename to scripts/uncompress_darpa_files.sh From 6c211fbd358501a2deedb877a6291bd6b5e27284 Mon Sep 17 00:00:00 2001 From: Lorenzo Guerra Date: Mon, 4 Aug 2025 16:44:25 +0200 Subject: [PATCH 12/33] Removal of separate calls to set_seed for each task. 
The first call sets the seed for the rest of the execution and avoids a seed reset to the default value. The value was previously always set to 0 and no other seeds were allowed by the code. Finally, torch.use_deterministic_algorithms automatically overwrites torch.backends.cudnn.deterministic and torch.manual_seed already calls internally torch.cuca.manual_seed_all --- config/flash.yml | 2 -- config/kairos.yml | 1 - config/magic.yml | 1 - config/nodlink.yml | 2 -- config/orthrus.yml | 2 -- config/rcaid.yml | 2 -- config/threatrace.yml | 1 - config/tuned_components/tuned_alacarte.yml | 1 - config/tuned_components/tuned_doc2vec.yml | 1 - config/tuned_components/tuned_fasttext.yml | 1 - config/tuned_components/tuned_flash.yml | 1 - config/tuned_components/tuned_word2vec.yml | 1 - docs/scripts/args/args_detection.md | 1 - docs/scripts/args/args_featurization.md | 1 - environment.yaml | 1 + pidsmaker/config/config.py | 2 -- pidsmaker/config/pipeline.py | 1 + pidsmaker/detection/graph_preprocessing.py | 3 +- .../training_methods/inference_loop.py | 3 -- .../training_methods/training_loop.py | 4 +-- pidsmaker/featurization/feat_training.py | 4 --- .../feat_training_alacarte.py | 36 ++++++------------- .../feat_training_doc2vec.py | 3 +- .../feat_training_fasttext.py | 4 +-- .../feat_training_trw.py | 4 +-- .../feat_training_word2vec.py | 4 +-- pidsmaker/preprocessing/transformation.py | 2 -- pidsmaker/utils/utils.py | 17 ++++----- scripts/run.sh | 2 +- 29 files changed, 27 insertions(+), 81 deletions(-) diff --git a/config/flash.yml b/config/flash.yml index 42c8e340..303fa479 100644 --- a/config/flash.yml +++ b/config/flash.yml @@ -20,7 +20,6 @@ featurization: epochs: 10 # 300 training_split: train used_method: flash - use_seed: True flash: min_count: 1 workers: 15 @@ -41,7 +40,6 @@ detection: used_method: none gnn_training: used_method: default - use_seed: True deterministic: False num_epochs: 12 patience: 3 diff --git a/config/kairos.yml b/config/kairos.yml index e21386ea..e1131495 100644 --- a/config/kairos.yml +++ b/config/kairos.yml @@ -45,7 +45,6 @@ detection: used_method: none gnn_training: used_method: default - use_seed: True deterministic: False num_epochs: 12 patience: 3 diff --git a/config/magic.yml b/config/magic.yml index d857f350..6b3fc6c2 100644 --- a/config/magic.yml +++ b/config/magic.yml @@ -34,7 +34,6 @@ detection: used_method: none gnn_training: used_method: default - use_seed: True deterministic: False num_epochs: 12 patience: 3 diff --git a/config/nodlink.yml b/config/nodlink.yml index 046ee648..52cacd9d 100644 --- a/config/nodlink.yml +++ b/config/nodlink.yml @@ -18,7 +18,6 @@ featurization: feat_training: epochs: 100 emb_dim: 256 - use_seed: True training_split: train used_method: fasttext fasttext: @@ -45,7 +44,6 @@ detection: used_method: none gnn_training: used_method: default - use_seed: True deterministic: False num_epochs: 12 patience: 3 diff --git a/config/orthrus.yml b/config/orthrus.yml index 22506f89..3b3d045a 100644 --- a/config/orthrus.yml +++ b/config/orthrus.yml @@ -21,7 +21,6 @@ featurization: emb_dim: 128 epochs: 50 training_split: all - use_seed: True used_method: word2vec word2vec: alpha: 0.025 @@ -58,7 +57,6 @@ detection: used_method: none gnn_training: used_method: default - use_seed: True deterministic: False num_epochs: 12 patience: 3 diff --git a/config/rcaid.yml b/config/rcaid.yml index aff722c0..dff311d1 100644 --- a/config/rcaid.yml +++ b/config/rcaid.yml @@ -20,7 +20,6 @@ featurization: feat_training: epochs: 5 emb_dim: 128 - use_seed: True 
training_split: all used_method: doc2vec doc2vec: @@ -43,7 +42,6 @@ detection: used_method: none gnn_training: used_method: default - use_seed: True deterministic: False num_epochs: 12 patience: 3 diff --git a/config/threatrace.yml b/config/threatrace.yml index 6d064fe0..809267ab 100644 --- a/config/threatrace.yml +++ b/config/threatrace.yml @@ -34,7 +34,6 @@ detection: used_method: none gnn_training: used_method: default - use_seed: True deterministic: False num_epochs: 12 patience: 3 diff --git a/config/tuned_components/tuned_alacarte.yml b/config/tuned_components/tuned_alacarte.yml index 958e1002..2048c55a 100644 --- a/config/tuned_components/tuned_alacarte.yml +++ b/config/tuned_components/tuned_alacarte.yml @@ -2,7 +2,6 @@ featurization: feat_training: epochs: 10 emb_dim: 128 - use_seed: True training_split: train used_method: alacarte alacarte: diff --git a/config/tuned_components/tuned_doc2vec.yml b/config/tuned_components/tuned_doc2vec.yml index 4824ddec..93d4d8dd 100644 --- a/config/tuned_components/tuned_doc2vec.yml +++ b/config/tuned_components/tuned_doc2vec.yml @@ -2,7 +2,6 @@ featurization: feat_training: epochs: 10 emb_dim: 128 - use_seed: True training_split: train used_method: doc2vec doc2vec: diff --git a/config/tuned_components/tuned_fasttext.yml b/config/tuned_components/tuned_fasttext.yml index feb351fe..0e2a1b32 100644 --- a/config/tuned_components/tuned_fasttext.yml +++ b/config/tuned_components/tuned_fasttext.yml @@ -2,7 +2,6 @@ featurization: feat_training: epochs: 10 emb_dim: 256 - use_seed: True training_split: train used_method: fasttext fasttext: diff --git a/config/tuned_components/tuned_flash.yml b/config/tuned_components/tuned_flash.yml index 01d1b57e..7c058644 100644 --- a/config/tuned_components/tuned_flash.yml +++ b/config/tuned_components/tuned_flash.yml @@ -4,7 +4,6 @@ featurization: epochs: 10 training_split: train used_method: flash - use_seed: True flash: min_count: 1 workers: 10 diff --git a/config/tuned_components/tuned_word2vec.yml b/config/tuned_components/tuned_word2vec.yml index 5f38ae1f..b8bb8696 100644 --- a/config/tuned_components/tuned_word2vec.yml +++ b/config/tuned_components/tuned_word2vec.yml @@ -3,7 +3,6 @@ featurization: epochs: 10 emb_dim: 128 training_split: train - use_seed: True used_method: word2vec word2vec: alpha: 0.025 diff --git a/docs/scripts/args/args_detection.md b/docs/scripts/args/args_detection.md index 2f888bd0..676ba1cf 100644 --- a/docs/scripts/args/args_detection.md +++ b/docs/scripts/args/args_detection.md @@ -45,7 +45,6 @@
  • gnn_training
      -
-    • use_seed: bool
    • deterministic: bool (19)
    • num_epochs: int
    • patience: int
    • diff --git a/docs/scripts/args/args_featurization.md b/docs/scripts/args/args_featurization.md index c404f12a..5bd8a81c 100644 --- a/docs/scripts/args/args_featurization.md +++ b/docs/scripts/args/args_featurization.md @@ -5,7 +5,6 @@
      • emb_dim: int (1)
      • epochs: int (2)
      •
-      • use_seed: bool
      • training_split: str (3)
      • multi_dataset_training: bool (4)
      • used_method: str (5)
      • diff --git a/environment.yaml b/environment.yaml index 7d0678d1..5563e2ce 100644 --- a/environment.yaml +++ b/environment.yaml @@ -88,6 +88,7 @@ dependencies: - gitdb==4.0.12 - gitpython==3.1.44 - graphviz==0.20.1 + - h5py==3.14.0 - identify==2.6.12 - idna==3.10 - igraph==0.11.5 diff --git a/pidsmaker/config/config.py b/pidsmaker/config/config.py index 8c356c87..aa2f8d1d 100644 --- a/pidsmaker/config/config.py +++ b/pidsmaker/config/config.py @@ -646,7 +646,6 @@ def __init__(self, type, vals: list = None, desc: str = None): int, desc="Epochs to train the embedding method. Arg not used by some methods.", ), - "use_seed": Arg(bool), "training_split": Arg( str, vals=OR(["train", "all"]), @@ -782,7 +781,6 @@ def __init__(self, type, vals: list = None, desc: str = None): }, }, "gnn_training": { - "use_seed": Arg(bool), "deterministic": Arg( bool, desc="Whether to force PyTorch to use deterministic algorithms.", diff --git a/pidsmaker/config/pipeline.py b/pidsmaker/config/pipeline.py index 6e646c0f..4d6291c6 100644 --- a/pidsmaker/config/pipeline.py +++ b/pidsmaker/config/pipeline.py @@ -146,6 +146,7 @@ def get_runtime_required_args(return_unknown_args=False, args=None): "--database_port", default="5432", help="The port number for Postgres (default: 5432)" ) parser.add_argument("--sweep_id", default="", help="ID of a wandb sweep for multi-agent runs") + parser.add_argument("--seed", default=0, help="Random seed for reproducibility") parser.add_argument( "--artifact_dir", default="./artifacts/", help="Destination folder for generated files" ) diff --git a/pidsmaker/detection/graph_preprocessing.py b/pidsmaker/detection/graph_preprocessing.py index 26ba214a..cc31954f 100644 --- a/pidsmaker/detection/graph_preprocessing.py +++ b/pidsmaker/detection/graph_preprocessing.py @@ -3,7 +3,7 @@ import torch from pidsmaker.utils.data_utils import load_all_datasets -from pidsmaker.utils.utils import get_device, log, log_start, set_seed +from pidsmaker.utils.utils import get_device, log, log_start def get_preprocessed_graphs(cfg): @@ -22,7 +22,6 @@ def get_preprocessed_graphs(cfg): def main(cfg): - set_seed(cfg) log_start(__file__) if cfg.detection.graph_preprocessing.save_on_disk: diff --git a/pidsmaker/detection/training_methods/inference_loop.py b/pidsmaker/detection/training_methods/inference_loop.py index 0847a97e..b67142c6 100644 --- a/pidsmaker/detection/training_methods/inference_loop.py +++ b/pidsmaker/detection/training_methods/inference_loop.py @@ -15,7 +15,6 @@ log, log_tqdm, ns_time_to_datetime_US, - set_seed, ) @@ -264,8 +263,6 @@ def test_node_level( def main(cfg, model, val_data, test_data, epoch, split, logging=True): - set_seed(cfg) - if split == "all": splits = [(val_data, "val"), (test_data, "test")] elif split == "val": diff --git a/pidsmaker/detection/training_methods/training_loop.py b/pidsmaker/detection/training_methods/training_loop.py index 06129ea5..8464e1b8 100644 --- a/pidsmaker/detection/training_methods/training_loop.py +++ b/pidsmaker/detection/training_methods/training_loop.py @@ -12,14 +12,12 @@ optimizer_factory, optimizer_few_shot_factory, ) -from pidsmaker.utils.utils import get_device, log, log_start, log_tqdm, set_seed +from pidsmaker.utils.utils import get_device, log, log_start, log_tqdm from . 
import inference_loop def main(cfg): - set_seed(cfg) - log_start(__file__) device = get_device(cfg) use_cuda = device == torch.device("cuda") diff --git a/pidsmaker/featurization/feat_training.py b/pidsmaker/featurization/feat_training.py index c10374b8..deb145ab 100644 --- a/pidsmaker/featurization/feat_training.py +++ b/pidsmaker/featurization/feat_training.py @@ -1,5 +1,3 @@ -from pidsmaker.utils.utils import set_seed - from .feat_training_methods import ( build_trw, feat_training_alacarte, @@ -13,8 +11,6 @@ def main(cfg): - set_seed(cfg) - method = cfg.featurization.feat_training.used_method.strip() if method == "alacarte": build_random_walks.main(cfg) diff --git a/pidsmaker/featurization/feat_training_methods/feat_training_alacarte.py b/pidsmaker/featurization/feat_training_methods/feat_training_alacarte.py index ad8b4d69..b427b729 100644 --- a/pidsmaker/featurization/feat_training_methods/feat_training_alacarte.py +++ b/pidsmaker/featurization/feat_training_methods/feat_training_alacarte.py @@ -444,8 +444,6 @@ def feat_training_for_one_split( num_workers = cfg.featurization.feat_training.alacarte.num_workers compute_loss = cfg.featurization.feat_training.alacarte.compute_loss add_paths = cfg.featurization.feat_training.alacarte.add_paths - use_seed = cfg.featurization.feat_training.use_seed - SEED = 0 log_dir = out_dir @@ -485,29 +483,17 @@ def feat_training_for_one_split( # Training using Word2Vec if needed # ===-----------------------------------------------------------------------=== if model_input is None: - if use_seed: - model = Word2Vec( - paths, - vector_size=emb_dim, - window=window_size, - min_count=min_count, - sg=use_skip_gram, - workers=num_workers, - epochs=epochs, - compute_loss=compute_loss, - seed=SEED, - ) - else: - model = Word2Vec( - paths, - vector_size=emb_dim, - window=window_size, - min_count=min_count, - sg=use_skip_gram, - workers=num_workers, - epochs=epochs, - compute_loss=compute_loss, - ) + model = Word2Vec( + paths, + vector_size=emb_dim, + window=window_size, + min_count=min_count, + sg=use_skip_gram, + workers=num_workers, + epochs=epochs, + compute_loss=compute_loss, + seed=cfg.seed, + ) else: log("Loading existing model from: {}".format(model_input)) model = Word2Vec.load(model_input) diff --git a/pidsmaker/featurization/feat_training_methods/feat_training_doc2vec.py b/pidsmaker/featurization/feat_training_methods/feat_training_doc2vec.py index 2d0b35c3..3fb67883 100644 --- a/pidsmaker/featurization/feat_training_methods/feat_training_doc2vec.py +++ b/pidsmaker/featurization/feat_training_methods/feat_training_doc2vec.py @@ -18,9 +18,8 @@ def doc2vec( alpha: float, dm: int = 1, ): - SEED = 0 model = Doc2Vec( - vector_size=emb_dim, alpha=alpha, min_count=1, dm=dm, compute_loss=True, seed=SEED + vector_size=emb_dim, alpha=alpha, min_count=1, dm=dm, compute_loss=True, seed=cfg.seed ) model.build_vocab(tagged_data) diff --git a/pidsmaker/featurization/feat_training_methods/feat_training_fasttext.py b/pidsmaker/featurization/feat_training_methods/feat_training_fasttext.py index 07575efa..b368bd98 100644 --- a/pidsmaker/featurization/feat_training_methods/feat_training_fasttext.py +++ b/pidsmaker/featurization/feat_training_methods/feat_training_fasttext.py @@ -23,8 +23,6 @@ def train_fasttext(corpus, cfg): min_count = cfg.featurization.feat_training.fasttext.min_count num_workers = cfg.featurization.feat_training.fasttext.num_workers negative = cfg.featurization.feat_training.fasttext.negative - use_seed = cfg.featurization.feat_training.use_seed - SEED 
= 0 use_pretrained_fb_model = cfg.featurization.feat_training.fasttext.use_pretrained_fb_model @@ -44,7 +42,7 @@ def train_fasttext(corpus, cfg): alpha=alpha, window=window_size, negative=negative, - seed=SEED, + seed=cfg.seed, ) model.build_vocab(corpus, update=use_pretrained_fb_model) diff --git a/pidsmaker/featurization/feat_training_methods/feat_training_trw.py b/pidsmaker/featurization/feat_training_methods/feat_training_trw.py index d8ba7dc0..e5a45f6f 100644 --- a/pidsmaker/featurization/feat_training_methods/feat_training_trw.py +++ b/pidsmaker/featurization/feat_training_methods/feat_training_trw.py @@ -41,8 +41,6 @@ def train_word2vec(corpus, model_save_path, cfg): epochs = cfg.featurization.feat_training.epochs compute_loss = cfg.featurization.feat_training.temporal_rw.compute_loss negative = cfg.featurization.feat_training.temporal_rw.negative - use_seed = cfg.featurization.feat_training.use_seed - SEED = 0 model = Word2Vec( corpus, @@ -54,7 +52,7 @@ def train_word2vec(corpus, model_save_path, cfg): epochs=1, compute_loss=compute_loss, negative=negative, - seed=SEED, + seed=cfg.seed, ) epoch_loss = model.get_latest_training_loss() diff --git a/pidsmaker/featurization/feat_training_methods/feat_training_word2vec.py b/pidsmaker/featurization/feat_training_methods/feat_training_word2vec.py index 42216b79..08c7a751 100644 --- a/pidsmaker/featurization/feat_training_methods/feat_training_word2vec.py +++ b/pidsmaker/featurization/feat_training_methods/feat_training_word2vec.py @@ -16,8 +16,6 @@ def train_word2vec(corpus, cfg, model_save_path): epochs = cfg.featurization.feat_training.epochs compute_loss = cfg.featurization.feat_training.word2vec.compute_loss negative = cfg.featurization.feat_training.word2vec.negative - use_seed = cfg.featurization.feat_training.use_seed - SEED = 0 model = Word2Vec( corpus, @@ -30,7 +28,7 @@ def train_word2vec(corpus, cfg, model_save_path): epochs=1, compute_loss=compute_loss, negative=negative, - seed=SEED, + seed=cfg.seed, ) epoch_loss = model.get_latest_training_loss() diff --git a/pidsmaker/preprocessing/transformation.py b/pidsmaker/preprocessing/transformation.py index a6d7d43d..ccf2bb7d 100644 --- a/pidsmaker/preprocessing/transformation.py +++ b/pidsmaker/preprocessing/transformation.py @@ -16,7 +16,6 @@ load_graphs_for_days, log_start, log_tqdm, - set_seed, ) @@ -110,7 +109,6 @@ def main_from_config(cfg): def main(cfg): - set_seed(cfg) log_start(__file__) multi_datasets = get_multi_datasets(cfg) diff --git a/pidsmaker/utils/utils.py b/pidsmaker/utils/utils.py index 731b90ce..8932f4a7 100644 --- a/pidsmaker/utils/utils.py +++ b/pidsmaker/utils/utils.py @@ -652,16 +652,13 @@ def log_helper(label, dataset): def set_seed(cfg): - if cfg.detection.gnn_training.use_seed: - seed = 0 - random.seed(seed) - np.random.seed(seed) - - torch.manual_seed(seed) - torch.cuda.manual_seed_all(seed) - torch.backends.cudnn.deterministic = True - torch.backends.cudnn.benchmark = False - + seed = cfg.seed + os.environ["PYTHONHASHSEED"] = str(seed) + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.backends.cudnn.benchmark = False + if cfg.detection.gnn_training.deterministic: torch.use_deterministic_algorithms(True, warn_only=True) diff --git a/scripts/run.sh b/scripts/run.sh index a9a8c5af..4d2ddc55 100755 --- a/scripts/run.sh +++ b/scripts/run.sh @@ -15,4 +15,4 @@ for arg in "$@"; do done # Execute the Python script with the passed arguments -PYTHONHASHSEED=0 nohup python ../pidsmaker/main.py $args --wandb & +nohup python 
../pidsmaker/main.py $args --wandb & From f478cc02e51aa673363f770a80a97877f5349213 Mon Sep 17 00:00:00 2001 From: Lorenzo Guerra Date: Mon, 4 Aug 2025 17:04:44 +0200 Subject: [PATCH 13/33] Minor fixes --- .gitignore | 1 + pidsmaker/config/pipeline.py | 5 +++-- pidsmaker/main.py | 13 ++++--------- 3 files changed, 8 insertions(+), 11 deletions(-) diff --git a/.gitignore b/.gitignore index a84235d0..86a7b927 100644 --- a/.gitignore +++ b/.gitignore @@ -179,3 +179,4 @@ postgres_lock/ # Postgres directories postgres_config/ postgres_run/ +postgres_log/ diff --git a/pidsmaker/config/pipeline.py b/pidsmaker/config/pipeline.py index 4d6291c6..7dffc84c 100644 --- a/pidsmaker/config/pipeline.py +++ b/pidsmaker/config/pipeline.py @@ -106,7 +106,7 @@ def get_runtime_required_args(return_unknown_args=False, args=None): ) parser.add_argument("--wandb", action="store_true", help="Whether to submit logs to wandb") parser.add_argument( - "--project", type=str, default="", help="Name of the wandb project (optional)" + "--project", type=str, default="PIDSMaker", help="Name of the wandb project" ) parser.add_argument("--exp", type=str, default="", help="Name of the experiment") parser.add_argument( @@ -457,7 +457,8 @@ def get_yml_cfg(args): # Inits with default configurations cfg = get_default_cfg(args) - + # Set seed for reproducibility + cfg.seed = args.seed # Checks that all configurations are valid and merge yml file to cfg yml_file = get_yml_file(args.model) merge_cfg_and_check_syntax(cfg, yml_file) diff --git a/pidsmaker/main.py b/pidsmaker/main.py index 561c59c3..4e961710 100644 --- a/pidsmaker/main.py +++ b/pidsmaker/main.py @@ -276,25 +276,20 @@ def run_pipeline_from_sweep(cfg): ) tags = args.tags.split(",") if args.tags != "" else [args.model] - if args.project != "": - project = args.project - else: - project = "PIDSMaker" + cfg = get_yml_cfg(args) wandb.init( mode=("online" if (args.wandb and args.tuning_mode == "none") else "disabled"), - project=project, + project=args.project, name=exp_name, tags=tags, + config=clean_cfg_for_log(cfg), ) if len(unknown_args) > 0: raise argparse.ArgumentTypeError(f"Unknown args {unknown_args}") - cfg = get_yml_cfg(args) - wandb.config.update(clean_cfg_for_log(cfg)) - - main(cfg, project=project, exp=exp_name, sweep_id=args.sweep_id) + main(cfg, project=args.project, exp=exp_name, sweep_id=args.sweep_id) wandb.finish() From 8ee45ed69c00c341b9d59f7412ed8a87d6371e8f Mon Sep 17 00:00:00 2001 From: Lorenzo Guerra Date: Mon, 4 Aug 2025 17:26:10 +0200 Subject: [PATCH 14/33] added README for running using Singularity --- docs/docs/docs/docs/introduction.md | 0 docs/docs/singularity_install.md | 39 ++++++++++++++++++++ environment.yaml => scripts/environment.yaml | 0 scripts/load_dumps.sh | 16 +++----- 4 files changed, 44 insertions(+), 11 deletions(-) create mode 100644 docs/docs/docs/docs/introduction.md create mode 100644 docs/docs/singularity_install.md rename environment.yaml => scripts/environment.yaml (100%) diff --git a/docs/docs/docs/docs/introduction.md b/docs/docs/docs/docs/introduction.md new file mode 100644 index 00000000..e69de29b diff --git a/docs/docs/singularity_install.md b/docs/docs/singularity_install.md new file mode 100644 index 00000000..c54fb620 --- /dev/null +++ b/docs/docs/singularity_install.md @@ -0,0 +1,39 @@ +# Install Framework using Singularity + +For quick installation on environments where Docker is not available (such as HPC clusters), you can use Singularity. This guide assumes Singularity is already installed on your system. 
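Once the steps below are complete, the framework can be pointed at the Singularity-managed PostgreSQL instance through the database command line flags; a minimal example, assuming the instance listens on `localhost:5432` with the default `postgres` user and password (adjust these values to your setup):

```bash
# SYSTEM and DATASET are placeholders for the chosen system and dataset names
python pidsmaker/main.py SYSTEM DATASET \
    --database_host localhost \
    --database_port 5432 \
    --database_user postgres \
    --database_password postgres
```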
+ +## Setup Process + +### 1. Database Setup +The Makefile in `./scripts/Makefile` provides easy environment setup: + +```bash +make full-setup +``` + +This command will: +- Download and run a PostgreSQL container through Singularity +- Load database dumps by executing the `load_dumps.sh` script + +### 2. Container Management +Once the database is ready: +- Stop the container: `make down` +- Start it again: `make up` + +### 3. Dependencies Installation +Install all required dependencies using conda: + +```bash +conda env create -f ./scripts/environment.yml +conda activate pids +``` + +## Running the Framework + +Once both the database and conda environment are ready, run the framework with: + +```bash +python pidsmaker/main.py SYSTEM DATASET +``` + +For more details, see the [introduction](introduction.md). \ No newline at end of file diff --git a/environment.yaml b/scripts/environment.yaml similarity index 100% rename from environment.yaml rename to scripts/environment.yaml diff --git a/scripts/load_dumps.sh b/scripts/load_dumps.sh index b476e228..da642ee7 100755 --- a/scripts/load_dumps.sh +++ b/scripts/load_dumps.sh @@ -1,11 +1,10 @@ #!/bin/bash -set -e # Exit on any error +set -e echo "Starting database dump restoration..." for dump_file in /data/*.dump; do - # Check if any dump files exist if [ ! -f "$dump_file" ]; then echo "No .dump files found in /data/ directory" break @@ -15,21 +14,18 @@ for dump_file in /data/*.dump; do echo "Processing $dump_file -> database '$db_name'" - # Check if database already exists and has data if psql -U postgres -h localhost -p 5432 -lqt | cut -d \| -f 1 | grep -qw "$db_name"; then echo "Database '$db_name' already exists. Checking if it has data..." - # Count tables in the database table_count=$(psql -U postgres -h localhost -p 5432 -d "$db_name" -t -c "SELECT COUNT(*) FROM information_schema.tables WHERE table_schema = 'public';" 2>/dev/null || echo "0") if [ "$table_count" -gt 0 ]; then - echo "✓ Database '$db_name' already has $table_count tables. Skipping restoration." + echo "Database '$db_name' already has $table_count tables. Skipping restoration." continue else echo "Database '$db_name' exists but is empty. Proceeding with restoration..." fi else - # Create database if it doesn't exist echo "Creating database '$db_name'..." psql -U postgres -h localhost -p 5432 -c "CREATE DATABASE \"$db_name\";" 2>/dev/null || { echo "Warning: Could not create database '$db_name' (may already exist)" @@ -40,20 +36,18 @@ for dump_file in /data/*.dump; do # Use --clean --if-exists to handle existing objects gracefully if pg_restore -U postgres -h localhost -p 5432 --clean --if-exists --no-owner --no-privileges -d "$db_name" "$dump_file" 2>/dev/null; then - echo "✓ Successfully restored $dump_file" + echo "Successfully restored $dump_file" - # Verify restoration final_table_count=$(psql -U postgres -h localhost -p 5432 -d "$db_name" -t -c "SELECT COUNT(*) FROM information_schema.tables WHERE table_schema = 'public';" 2>/dev/null || echo "0") echo " Database '$db_name' now has $final_table_count tables" else - echo "✗ Warning: pg_restore reported errors for $dump_file (this may be normal for some dump formats)" + echo "Warning: pg_restore reported errors for $dump_file (this may be normal for some dump formats)" fi echo "" done -echo "Database dump restoration completed!" 
+echo "Database dump restoration completed" -# Show summary of all databases echo "Summary of available databases:" psql -U postgres -h localhost -p 5432 -c "\l" | grep -E "^\s+[a-zA-Z]" | head -20 From 326eb1f5b6877393335e7650ae7217f4f4d255e1 Mon Sep 17 00:00:00 2001 From: Lorenzo Guerra Date: Tue, 19 Aug 2025 00:01:51 +0200 Subject: [PATCH 15/33] added comment to clarify that all code for "few_shot" is currently unused --- pidsmaker/factory.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pidsmaker/factory.py b/pidsmaker/factory.py index d1536d23..aa0a25cd 100644 --- a/pidsmaker/factory.py +++ b/pidsmaker/factory.py @@ -467,6 +467,7 @@ def objective_factory(cfg, in_dim, graph_reindexer, device, objective_cfg=None): raise ValueError(f"Invalid objective {objective}") # We wrap objectives into this class to calculate some metrics on validation set easily + # This is useful only if use_few_shot is True is_edge_type_prediction = objective_cfg.used_methods.strip() == "predict_edge_type" objectives = [ ValidationWrapper( From 5e162b11652c8ddaa968e98833b85a12c8af6cfc Mon Sep 17 00:00:00 2001 From: Lorenzo Guerra Date: Tue, 19 Aug 2025 11:43:00 +0200 Subject: [PATCH 16/33] Removed useless code: edge_list is always None, so it's always redefined as edge_df --- pidsmaker/detection/training_methods/inference_loop.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/pidsmaker/detection/training_methods/inference_loop.py b/pidsmaker/detection/training_methods/inference_loop.py index b67142c6..1a0ece38 100644 --- a/pidsmaker/detection/training_methods/inference_loop.py +++ b/pidsmaker/detection/training_methods/inference_loop.py @@ -29,7 +29,6 @@ def test_edge_level( ): model.eval() - edge_list = None start_time = data.t[0] all_losses = [] @@ -62,21 +61,17 @@ def test_edge_level( "edge_type": edge_types.astype(int), } ) - if edge_list is None: - edge_list = edge_df - else: - edge_list = pd.concat([edge_list, edge_df]) # Here is a checkpoint, which records all edge losses in the current time window time_interval = ( - ns_time_to_datetime_US(start_time) + "~" + ns_time_to_datetime_US(edge_list["time"].max()) + ns_time_to_datetime_US(start_time) + "~" + ns_time_to_datetime_US(edge_df["time"].max()) ) logs_dir = os.path.join(cfg.detection.gnn_training._edge_losses_dir, split, model_epoch_file) os.makedirs(logs_dir, exist_ok=True) csv_file = os.path.join(logs_dir, time_interval + ".csv") - edge_list.to_csv(csv_file, sep=",", header=True, index=False, encoding="utf-8") + edge_df.to_csv(csv_file, sep=",", header=True, index=False, encoding="utf-8") return all_losses From 135fdeffda9b24c8fd33dd56a39c31c75fad480c Mon Sep 17 00:00:00 2001 From: Lorenzo Guerra Date: Tue, 19 Aug 2025 13:39:49 +0200 Subject: [PATCH 17/33] removed unused variable --- pidsmaker/factory.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pidsmaker/factory.py b/pidsmaker/factory.py index aa0a25cd..cc52ec35 100644 --- a/pidsmaker/factory.py +++ b/pidsmaker/factory.py @@ -81,8 +81,6 @@ def encoder_factory(cfg, msg_dim, in_dim, device, max_node_num, graph_reindexer) if use_tgn: in_dim = tgn_memory_dim - original_edge_dim = edge_dim - for method in map( lambda x: x.strip(), cfg.detection.gnn_training.encoder.used_methods.replace("-", ",").split(","), From 60516c52f9a0cefd1114140984be3a04e71f4aa9 Mon Sep 17 00:00:00 2001 From: Lorenzo Guerra Date: Thu, 21 Aug 2025 18:36:25 +0200 Subject: [PATCH 18/33] revert: set PYTHONHASHSEED before launching Python interpreter again --- pidsmaker/utils/utils.py | 1 - 
scripts/run.sh | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/pidsmaker/utils/utils.py b/pidsmaker/utils/utils.py index 8932f4a7..13654ded 100644 --- a/pidsmaker/utils/utils.py +++ b/pidsmaker/utils/utils.py @@ -653,7 +653,6 @@ def log_helper(label, dataset): def set_seed(cfg): seed = cfg.seed - os.environ["PYTHONHASHSEED"] = str(seed) random.seed(seed) np.random.seed(seed) torch.manual_seed(seed) diff --git a/scripts/run.sh b/scripts/run.sh index 4d2ddc55..4e3be119 100755 --- a/scripts/run.sh +++ b/scripts/run.sh @@ -15,4 +15,4 @@ for arg in "$@"; do done # Execute the Python script with the passed arguments -nohup python ../pidsmaker/main.py $args --wandb & +PYTHONHASHSEED=0 nohup python ../pidsmaker/main.py $args --wandb & \ No newline at end of file From 17f00d9026eab52974f30ad177fbc651f113f002 Mon Sep 17 00:00:00 2001 From: Lorenzo Guerra Date: Wed, 27 Aug 2025 10:24:14 +0200 Subject: [PATCH 19/33] fix seed argument type --- pidsmaker/config/pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pidsmaker/config/pipeline.py b/pidsmaker/config/pipeline.py index 7dffc84c..a4ce1e4f 100644 --- a/pidsmaker/config/pipeline.py +++ b/pidsmaker/config/pipeline.py @@ -146,7 +146,7 @@ def get_runtime_required_args(return_unknown_args=False, args=None): "--database_port", default="5432", help="The port number for Postgres (default: 5432)" ) parser.add_argument("--sweep_id", default="", help="ID of a wandb sweep for multi-agent runs") - parser.add_argument("--seed", default=0, help="Random seed for reproducibility") + parser.add_argument("--seed", type=int, default=0, help="Random seed for reproducibility") parser.add_argument( "--artifact_dir", default="./artifacts/", help="Destination folder for generated files" ) From 7fbdea504e50bbd274de51727010a32073683f97 Mon Sep 17 00:00:00 2001 From: Lorenzo Guerra Date: Wed, 27 Aug 2025 12:44:40 +0200 Subject: [PATCH 20/33] remove deprecated punkt in favor of punkt_tab and keep the tokenizer data in the project folder --- .gitignore | 3 +++ pidsmaker/utils/utils.py | 3 ++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 86a7b927..375cde6d 100644 --- a/.gitignore +++ b/.gitignore @@ -180,3 +180,6 @@ postgres_lock/ postgres_config/ postgres_run/ postgres_log/ + +# tokenizer data +nltk_data/ diff --git a/pidsmaker/utils/utils.py b/pidsmaker/utils/utils.py index 13654ded..ace84aad 100644 --- a/pidsmaker/utils/utils.py +++ b/pidsmaker/utils/utils.py @@ -18,7 +18,8 @@ from nltk.tokenize import word_tokenize from tqdm import tqdm -nltk.download("punkt", quiet=True) +nltk.download("punkt_tab", quiet=True, download_dir="./nltk_data") +nltk.data.path.append("./nltk_data") from pidsmaker.config import update_cfg_for_multi_dataset From 1c7a39364f24db5a3a3c1496b6daae8dec050f5f Mon Sep 17 00:00:00 2001 From: Lorenzo Guerra Date: Wed, 27 Aug 2025 15:59:20 +0200 Subject: [PATCH 21/33] code formatting with ruff --- .../evaluation_methods/node_evaluation.py | 110 +++++------------- pidsmaker/tgn.py | 4 +- pidsmaker/utils/data_utils.py | 2 +- 3 files changed, 31 insertions(+), 85 deletions(-) diff --git a/pidsmaker/detection/evaluation_methods/node_evaluation.py b/pidsmaker/detection/evaluation_methods/node_evaluation.py index a8b85702..42e21ab0 100644 --- a/pidsmaker/detection/evaluation_methods/node_evaluation.py +++ b/pidsmaker/detection/evaluation_methods/node_evaluation.py @@ -37,7 +37,7 @@ def get_node_predictions(val_tw_path, test_tw_path, cfg, **kwargs): log(f"Loading data from 
{test_tw_path}...") threshold_method = cfg.detection.evaluation.node_evaluation.threshold_method - if threshold_method == "magic": # data leaking by using test data + if threshold_method == "magic": # data leaking by using test data thr = get_threshold(test_tw_path, threshold_method) else: thr = get_threshold(val_tw_path, threshold_method) @@ -48,9 +48,7 @@ def get_node_predictions(val_tw_path, test_tw_path, cfg, **kwargs): node_to_max_loss = defaultdict(int) filelist = listdir_sorted(test_tw_path) - for tw, file in enumerate( - log_tqdm(sorted(filelist), desc="Compute labels") - ): + for tw, file in enumerate(log_tqdm(sorted(filelist), desc="Compute labels")): file = os.path.join(test_tw_path, file) df = pd.read_csv(file).to_dict(orient="records") for line in df: @@ -74,9 +72,7 @@ def get_node_predictions(val_tw_path, test_tw_path, cfg, **kwargs): # For plotting the scores of seen and unseen nodes graph_dir = cfg.preprocessing.transformation._graphs_dir - train_set_paths = get_all_files_from_folders( - graph_dir, cfg.dataset.train_files - ) + train_set_paths = get_all_files_from_folders(graph_dir, cfg.dataset.train_files) train_node_set = set() for train_path in train_set_paths: @@ -91,9 +87,7 @@ def get_node_predictions(val_tw_path, test_tw_path, cfg, **kwargs): ) results[node_id]["score"] = pred_score - results[node_id]["tw_with_max_loss"] = node_to_max_loss_tw.get( - node_id, -1 - ) + results[node_id]["tw_with_max_loss"] = node_to_max_loss_tw.get(node_id, -1) results[node_id]["y_true"] = int(node_id in ground_truth_nids) results[node_id]["is_seen"] = int(str(node_id) in train_node_set) @@ -126,9 +120,7 @@ def get_node_predictions_node_level(val_tw_path, test_tw_path, cfg, **kwargs): node_to_max_loss = defaultdict(int) filelist = listdir_sorted(test_tw_path) - for tw, file in enumerate( - log_tqdm(sorted(filelist), desc="Compute labels") - ): + for tw, file in enumerate(log_tqdm(sorted(filelist), desc="Compute labels")): file = os.path.join(test_tw_path, file) df = pd.read_csv(file).to_dict(orient="records") for line in df: @@ -139,13 +131,9 @@ def get_node_predictions_node_level(val_tw_path, test_tw_path, cfg, **kwargs): node_to_values[node]["tw"].append(tw) if "threatrace_score" in line: - node_to_values[node]["threatrace_score"].append( - line["threatrace_score"] - ) + node_to_values[node]["threatrace_score"].append(line["threatrace_score"]) if "correct_pred" in line: - node_to_values[node]["correct_pred"].append( - line["correct_pred"] - ) + node_to_values[node]["correct_pred"].append(line["correct_pred"]) if "flash_score" in line: node_to_values[node]["flash_score"].append(line["flash_score"]) if "magic_score" in line: @@ -157,9 +145,7 @@ def get_node_predictions_node_level(val_tw_path, test_tw_path, cfg, **kwargs): # For plotting the scores of seen and unseen nodes graph_dir = cfg.preprocessing.transformation._graphs_dir - train_set_paths = get_all_files_from_folders( - graph_dir, cfg.dataset.train_files - ) + train_set_paths = get_all_files_from_folders(graph_dir, cfg.dataset.train_files) train_node_set = set() for train_path in train_set_paths: @@ -172,10 +158,7 @@ def get_node_predictions_node_level(val_tw_path, test_tw_path, cfg, **kwargs): threatrace_label = 0 flash_label = 0 detected_tw = None - if ( - cfg.detection.evaluation.node_evaluation.threshold_method - == "threatrace" - ): + if cfg.detection.evaluation.node_evaluation.threshold_method == "threatrace": max_score = 0 pred_score = max(losses["threatrace_score"]) @@ -187,9 +170,7 @@ def 
get_node_predictions_node_level(val_tw_path, test_tw_path, cfg, **kwargs): max_score = score detected_tw = tw - elif ( - cfg.detection.evaluation.node_evaluation.threshold_method == "flash" - ): + elif cfg.detection.evaluation.node_evaluation.threshold_method == "flash": max_score = 0 pred_score = max(losses["flash_score"]) @@ -201,9 +182,7 @@ def get_node_predictions_node_level(val_tw_path, test_tw_path, cfg, **kwargs): max_score = score detected_tw = tw - elif ( - cfg.detection.evaluation.node_evaluation.threshold_method == "magic" - ): + elif cfg.detection.evaluation.node_evaluation.threshold_method == "magic": max_score = 0 pred_score = max(losses["magic_score"]) @@ -220,9 +199,7 @@ def get_node_predictions_node_level(val_tw_path, test_tw_path, cfg, **kwargs): ) results[node_id]["score"] = pred_score - results[node_id]["tw_with_max_loss"] = node_to_max_loss_tw.get( - node_id, -1 - ) + results[node_id]["tw_with_max_loss"] = node_to_max_loss_tw.get(node_id, -1) results[node_id]["y_true"] = int(node_id in ground_truth_nids) results[node_id]["is_seen"] = int(str(node_id) in train_node_set) @@ -230,8 +207,7 @@ def get_node_predictions_node_level(val_tw_path, test_tw_path, cfg, **kwargs): detected_tw = detected_tw or node_to_max_loss_tw.get(node_id, None) if detected_tw is not None: results[node_id]["time_range"] = [ - datetime_to_ns_time_US_handle_nano(tw) - for tw in filelist[detected_tw].split("~") + datetime_to_ns_time_US_handle_nano(tw) for tw in filelist[detected_tw].split("~") ] else: results[node_id]["time_range"] = None @@ -239,15 +215,9 @@ def get_node_predictions_node_level(val_tw_path, test_tw_path, cfg, **kwargs): if use_kmeans: # in this mode, we add the label after results[node_id]["y_hat"] = 0 else: - if ( - cfg.detection.evaluation.node_evaluation.threshold_method - == "threatrace" - ): + if cfg.detection.evaluation.node_evaluation.threshold_method == "threatrace": results[node_id]["y_hat"] = threatrace_label - elif ( - cfg.detection.evaluation.node_evaluation.threshold_method - == "flash" - ): + elif cfg.detection.evaluation.node_evaluation.threshold_method == "flash": results[node_id]["y_hat"] = flash_label else: results[node_id]["y_hat"] = int(pred_score > thr) @@ -263,11 +233,7 @@ def get_node_predictions_node_level(val_tw_path, test_tw_path, cfg, **kwargs): def analyze_false_positives( y_truth, y_preds, pred_scores, max_val_loss_tw, nodes, tw_to_malicious_nodes ): - fp_indices = [ - i - for i, (true, pred) in enumerate(zip(y_truth, y_preds)) - if pred and not true - ] + fp_indices = [i for i, (true, pred) in enumerate(zip(y_truth, y_preds)) if pred and not true] malicious_tws = set(tw_to_malicious_nodes.keys()) num_fps_in_malicious_tw = 0 @@ -276,9 +242,7 @@ def analyze_false_positives( num_fps_in_malicious_tw += int(is_in_malicious_tw) fp_in_malicious_tw_ratio = ( - num_fps_in_malicious_tw / len(fp_indices) - if len(fp_indices) > 0 - else float("nan") + num_fps_in_malicious_tw / len(fp_indices) if len(fp_indices) > 0 else float("nan") ) return fp_in_malicious_tw_ratio @@ -296,15 +260,11 @@ def main( else: get_preds_fn = get_node_predictions - results, thr = get_preds_fn( - cfg=cfg, val_tw_path=val_tw_path, test_tw_path=test_tw_path - ) + results, thr = get_preds_fn(cfg=cfg, val_tw_path=val_tw_path, test_tw_path=test_tw_path) # save results for future checking os.makedirs(cfg.detection.evaluation._results_dir, exist_ok=True) - results_save_dir = os.path.join( - cfg.detection.evaluation._results_dir, "results.pth" - ) + results_save_dir = 
os.path.join(cfg.detection.evaluation._results_dir, "results.pth") torch.save(results, results_save_dir) log(f"Resutls saved to {results_save_dir}") @@ -318,15 +278,9 @@ def main( ) # average detection precision scores_img_file = os.path.join(out_dir, f"scores_{model_epoch_dir}.png") # simple_scores_img_file = os.path.join(out_dir, f"simple_scores_{model_epoch_dir}.png") - neat_scores_img_file = os.path.join( - out_dir, f"neat_scores_{model_epoch_dir}.svg" - ) - seen_score_img_file = os.path.join( - out_dir, f"seen_score_{model_epoch_dir}.png" - ) - discrim_img_file = os.path.join( - out_dir, f"discrim_curve_{model_epoch_dir}.png" - ) + neat_scores_img_file = os.path.join(out_dir, f"neat_scores_{model_epoch_dir}.svg") + seen_score_img_file = os.path.join(out_dir, f"seen_score_{model_epoch_dir}.png") + discrim_img_file = os.path.join(out_dir, f"discrim_curve_{model_epoch_dir}.png") attack_to_GPs = get_GP_of_each_attack(cfg) attack_to_TPs = defaultdict(int) @@ -370,13 +324,9 @@ def main( adp_score = plot_detected_attacks_vs_precision( pred_scores, nodes, node2attacks, y_truth, adp_img_file ) - discrim_scores = compute_discrimination_score( - pred_scores, nodes, node2attacks, y_truth - ) + discrim_scores = compute_discrimination_score(pred_scores, nodes, node2attacks, y_truth) plot_discrimination_metric(pred_scores, y_truth, discrim_img_file) - discrim_tp = compute_discrimination_tp( - pred_scores, nodes, node2attacks, y_truth - ) + discrim_tp = compute_discrimination_tp(pred_scores, nodes, node2attacks, y_truth) # plot_simple_scores(pred_scores, y_truth, simple_scores_img_file) plot_scores_with_paths_node_level( pred_scores, @@ -389,9 +339,7 @@ def main( cfg, thr, ) - plot_scores_neat( - pred_scores, y_truth, nodes, node2attacks, neat_scores_img_file, thr - ) + plot_scores_neat(pred_scores, y_truth, nodes, node2attacks, neat_scores_img_file, thr) # plot_score_seen(pred_scores, is_seen, seen_score_img_file) stats = classifier_evaluation(y_truth, y_preds, pred_scores) @@ -412,9 +360,7 @@ def main( tps_in_atts.append((att, tps)) stats["percent_detected_attacks"] = ( - round(len(attack_to_TPs) / len(attack_to_GPs), 2) - if len(attack_to_GPs) > 0 - else 0 + round(len(attack_to_TPs) / len(attack_to_GPs), 2) if len(attack_to_GPs) > 0 else 0 ) fps, tps, precision, recall = get_metrics_if_all_attacks_detected( @@ -430,9 +376,7 @@ def main( for k, v in discrim_scores.items(): stats[k] = round(v, 4) - attack2tps = get_detected_tps_node_level( - pred_scores, nodes, node2attacks, y_truth, cfg - ) + attack2tps = get_detected_tps_node_level(pred_scores, nodes, node2attacks, y_truth, cfg) for attack, detected_tps in attack2tps.items(): stats[f"tps_{attack}"] = str(detected_tps) diff --git a/pidsmaker/tgn.py b/pidsmaker/tgn.py index 1436dac8..80bdf811 100644 --- a/pidsmaker/tgn.py +++ b/pidsmaker/tgn.py @@ -392,7 +392,9 @@ def insert(self, src: Tensor, dst: Tensor): # Compute cumulative start indices cum_edge_counts = torch.cat([torch.tensor([0], device=nodes.device), edge_counts.cumsum(0)]) - local_slots = torch.arange(nodes.size(0), device=nodes.device) - cum_edge_counts[self._assoc[nodes]] + local_slots = ( + torch.arange(nodes.size(0), device=nodes.device) - cum_edge_counts[self._assoc[nodes]] + ) dense_id = local_slots + (self._assoc[nodes] * temp_size) # Initialize dense tensors with temporary size diff --git a/pidsmaker/utils/data_utils.py b/pidsmaker/utils/data_utils.py index aefa25a0..0b914a76 100644 --- a/pidsmaker/utils/data_utils.py +++ b/pidsmaker/utils/data_utils.py @@ -704,7 +704,7 @@ def 
inter_batching(dataset, method): ): batch = data_list[i : i + bs] data = collate(CollatableTemporalData, data_list=batch)[0] - + use_tgn = "tgn" in cfg.detection.gnn_training.encoder.used_methods if cfg._debug and use_tgn: debug_test_batching(batch, data, cfg) From 66680ed4aa7a766ab4070cbc1dca4ec50a91d976 Mon Sep 17 00:00:00 2001 From: Lorenzo Guerra Date: Wed, 27 Aug 2025 17:20:23 +0200 Subject: [PATCH 22/33] restore previous formatting for config.py --- pidsmaker/config/config.py | 160 ++++++++----------------------------- 1 file changed, 33 insertions(+), 127 deletions(-) diff --git a/pidsmaker/config/config.py b/pidsmaker/config/config.py index aa2f8d1d..8ad8a567 100644 --- a/pidsmaker/config/config.py +++ b/pidsmaker/config/config.py @@ -81,11 +81,7 @@ "E5-CADETS/node_Nginx_Drakon_APT_17.csv", ], "attack_to_time_window": [ - [ - "E5-CADETS/node_Nginx_Drakon_APT.csv", - "2019-05-16 09:31:00", - "2019-05-16 10:12:00", - ], + ["E5-CADETS/node_Nginx_Drakon_APT.csv", "2019-05-16 09:31:00", "2019-05-16 10:12:00"], [ "E5-CADETS/node_Nginx_Drakon_APT_17.csv", "2019-05-17 10:15:00", @@ -121,22 +117,10 @@ "E3-CADETS/node_Nginx_Backdoor_13.csv", ], "attack_to_time_window": [ - [ - "E3-CADETS/node_Nginx_Backdoor_06.csv", - "2018-04-06 11:20:00", - "2018-04-06 12:09:00", - ], + ["E3-CADETS/node_Nginx_Backdoor_06.csv", "2018-04-06 11:20:00", "2018-04-06 12:09:00"], # ["E3-CADETS/node_Nginx_Backdoor_11.csv" , '2018-04-11 15:07:00', '2018-04-11 15:16:00'], - [ - "E3-CADETS/node_Nginx_Backdoor_12.csv", - "2018-04-12 13:59:00", - "2018-04-12 14:39:00", - ], - [ - "E3-CADETS/node_Nginx_Backdoor_13.csv", - "2018-04-13 09:03:00", - "2018-04-13 09:16:00", - ], + ["E3-CADETS/node_Nginx_Backdoor_12.csv", "2018-04-12 13:59:00", "2018-04-12 14:39:00"], + ["E3-CADETS/node_Nginx_Backdoor_13.csv", "2018-04-13 09:03:00", "2018-04-13 09:16:00"], ], }, "CLEARSCOPE_E5": { @@ -147,13 +131,7 @@ "num_edge_types": 10, "year_month": "2019-05", "start_end_day_range": (8, 18), - "train_files": [ - "graph_8", - "graph_9", - "graph_10", - "graph_11", - "graph_12", - ], + "train_files": ["graph_8", "graph_9", "graph_10", "graph_11", "graph_12"], "val_files": ["graph_13"], "test_files": ["graph_14", "graph_15", "graph_17"], "unused_files": ["graph_16"], @@ -231,11 +209,7 @@ "h201/node_h201_0923.csv", ], "attack_to_time_window": [ - [ - "h201/node_h201_0923.csv", - "2019-09-23 11:23:00", - "2019-09-23 13:25:00", - ], + ["h201/node_h201_0923.csv", "2019-09-23 11:23:00", "2019-09-23 13:25:00"], ], }, "optc_h501": { @@ -254,11 +228,7 @@ "h501/node_h501_0924.csv", ], "attack_to_time_window": [ - [ - "h501/node_h501_0924.csv", - "2019-09-24 10:28:00", - "2019-09-24 15:29:00", - ], + ["h501/node_h501_0924.csv", "2019-09-24 10:28:00", "2019-09-24 15:29:00"], ], }, "optc_h051": { @@ -277,11 +247,7 @@ "h051/node_h051_0925.csv", ], "attack_to_time_window": [ - [ - "h051/node_h051_0925.csv", - "2019-09-25 10:29:00", - "2019-09-25 14:25:00", - ], + ["h051/node_h051_0925.csv", "2019-09-25 10:29:00", "2019-09-25 14:25:00"], ], }, } @@ -435,8 +401,7 @@ def __init__(self, type, vals: list = None, desc: str = None): output size matching the downstream objective (e.g. edge type prediction involves predicting 10 edge types, so the output of the decoder should be 10).", ), "src_dst_projection_coef": Arg( - int, - desc="Multiplier of input neurons to project src and dst nodes.", + int, desc="Multiplier of input neurons to project src and dst nodes." 
), }, "node_mlp": { @@ -472,9 +437,7 @@ def __init__(self, type, vals: list = None, desc: str = None): # Prediction-based "predict_edge_type": { "decoder": Arg( - str, - vals=OR(list(DECODERS_CFG.keys())), - desc="Decoder used before computing loss.", + str, vals=OR(list(DECODERS_CFG.keys())), desc="Decoder used before computing loss." ), **DECODERS_CFG, "balanced_loss": Arg(bool), @@ -482,9 +445,7 @@ def __init__(self, type, vals: list = None, desc: str = None): }, "predict_node_type": { "decoder": Arg( - str, - vals=OR(list(DECODERS_CFG.keys())), - desc="Decoder used before computing loss.", + str, vals=OR(list(DECODERS_CFG.keys())), desc="Decoder used before computing loss." ), **DECODERS_CFG, "balanced_loss": Arg(bool), @@ -492,26 +453,20 @@ def __init__(self, type, vals: list = None, desc: str = None): "predict_masked_struct": { "loss": Arg(str, vals=OR(PRED_LOSSES)), "decoder": Arg( - str, - vals=OR(list(DECODERS_CFG.keys())), - desc="Decoder used before computing loss.", + str, vals=OR(list(DECODERS_CFG.keys())), desc="Decoder used before computing loss." ), **DECODERS_CFG, "balanced_loss": Arg(bool), }, "detect_edge_few_shot": { "decoder": Arg( - str, - vals=OR(list(DECODERS_CFG.keys())), - desc="Decoder used before computing loss.", + str, vals=OR(list(DECODERS_CFG.keys())), desc="Decoder used before computing loss." ), **DECODERS_CFG, }, "predict_edge_contrastive": { "decoder": Arg( - str, - vals=OR(list(DECODERS_CFG.keys())), - desc="Decoder used before computing loss.", + str, vals=OR(list(DECODERS_CFG.keys())), desc="Decoder used before computing loss." ), **DECODERS_CFG, "inner_product": { @@ -522,27 +477,21 @@ def __init__(self, type, vals: list = None, desc: str = None): "reconstruct_node_features": { "loss": Arg(str, vals=OR(RECON_LOSSES)), "decoder": Arg( - str, - vals=OR(list(DECODERS_CFG.keys())), - desc="Decoder used before computing loss.", + str, vals=OR(list(DECODERS_CFG.keys())), desc="Decoder used before computing loss." ), **DECODERS_CFG, }, "reconstruct_node_embeddings": { "loss": Arg(str, vals=OR(RECON_LOSSES)), "decoder": Arg( - str, - vals=OR(list(DECODERS_CFG.keys())), - desc="Decoder used before computing loss.", + str, vals=OR(list(DECODERS_CFG.keys())), desc="Decoder used before computing loss." ), **DECODERS_CFG, }, "reconstruct_edge_embeddings": { "loss": Arg(str, vals=OR(RECON_LOSSES)), "decoder": Arg( - str, - vals=OR(list(DECODERS_CFG.keys())), - desc="Decoder used before computing loss.", + str, vals=OR(list(DECODERS_CFG.keys())), desc="Decoder used before computing loss." ), **DECODERS_CFG, }, @@ -550,9 +499,7 @@ def __init__(self, type, vals: list = None, desc: str = None): "loss": Arg(str, vals=OR(RECON_LOSSES)), "mask_rate": Arg(float), "decoder": Arg( - str, - vals=OR(list(DECODERS_CFG.keys())), - desc="Decoder used before computing loss.", + str, vals=OR(list(DECODERS_CFG.keys())), desc="Decoder used before computing loss." ), **DECODERS_CFG, }, @@ -567,23 +514,14 @@ def __init__(self, type, vals: list = None, desc: str = None): }, } -THRESHOLD_METHODS = [ - "max_val_loss", - "mean_val_loss", - "threatrace", - "magic", - "flash", - "nodlink", -] +THRESHOLD_METHODS = ["max_val_loss", "mean_val_loss", "threatrace", "magic", "flash", "nodlink"] # --- Tasks, subtasks, and argument configurations --- TASK_ARGS = { "preprocessing": { "build_graphs": { "used_method": Arg( - str, - vals=OR(["default", "magic"]), - desc="The method to build time window graphs.", + str, vals=OR(["default", "magic"]), desc="The method to build time window graphs." 
), "use_all_files": Arg(bool), "mimicry_edge_num": Arg(int), @@ -593,8 +531,7 @@ def __init__(self, type, vals: list = None, desc: str = None): ), "use_hashed_label": Arg(bool, desc="Whether to hash the textual features."), "fuse_edge": Arg( - bool, - desc="Whether to fuse duplicate sequential edges into a single edge.", + bool, desc="Whether to fuse duplicate sequential edges into a single edge." ), "node_label_features": { "subject": Arg( @@ -609,9 +546,7 @@ def __init__(self, type, vals: list = None, desc: str = None): ), "netflow": Arg( str, - vals=AND( - ["type", "remote_ip", "remote_port"], - ), + vals=AND(["type", "remote_ip", "remote_port"]), desc="Which features use for netflow nodes. Features will be concatenated.", ), }, @@ -643,8 +578,7 @@ def __init__(self, type, vals: list = None, desc: str = None): desc="Size of the text embedding. Arg not used by some featurization methods that do not build embeddings.", ), "epochs": Arg( - int, - desc="Epochs to train the embedding method. Arg not used by some methods.", + int, desc="Epochs to train the embedding method. Arg not used by some methods." ), "training_split": Arg( str, @@ -682,22 +616,13 @@ def __init__(self, type, vals: list = None, desc: str = None): ), "edge_features": Arg( str, - vals=AND( - [ - "edge_type", - "edge_type_triplet", - "msg", - "time_encoding", - "none", - ] - ), + vals=AND(["edge_type", "edge_type_triplet", "msg", "time_encoding", "none"]), desc="Edge features to used during GNN training. `edge_type` refers to the system call type, `edge_type_triplet` \ considers a same edge type as a new type if source or destination node types are different, `msg` is the message vector \ used in the TGN, `time_encoding` encodes temporal order of events with their timestamps in the TGN, `none` uses no features.", ), "multi_dataset_training": Arg( - bool, - desc="Whether the GNN should be trained on all datasets in `multi_dataset`.", + bool, desc="Whether the GNN should be trained on all datasets in `multi_dataset`." ), "fix_buggy_graph_reindexer": Arg( bool, @@ -740,8 +665,7 @@ def __init__(self, type, vals: list = None, desc: str = None): }, "tgn_last_neighbor": { "tgn_neighbor_size": Arg( - int, - desc="Number of last neighbors to store for each node.", + int, desc="Number of last neighbors to store for each node." ), "tgn_neighbor_n_hop": Arg( int, @@ -782,26 +706,19 @@ def __init__(self, type, vals: list = None, desc: str = None): }, "gnn_training": { "deterministic": Arg( - bool, - desc="Whether to force PyTorch to use deterministic algorithms.", + bool, desc="Whether to force PyTorch to use deterministic algorithms." ), "num_epochs": Arg(int), "patience": Arg(int), "lr": Arg(float), "weight_decay": Arg(float), - "node_hid_dim": Arg( - int, - desc="Number of neurons in the middle layers of the encoder.", - ), + "node_hid_dim": Arg(int, desc="Number of neurons in the middle layers of the encoder."), "node_out_dim": Arg(int, desc="Number of neurons in the last layer of the encoder."), "grad_accumulation": Arg( - int, - desc="Number of epochs to gather gradients before backprop.", + int, desc="Number of epochs to gather gradients before backprop." ), "inference_device": Arg( - str, - vals=OR(["cpu", "cuda"]), - desc="Device used during testing.", + str, vals=OR(["cpu", "cuda"]), desc="Device used during testing." 
), "used_method": Arg(str, vals=OR(["default"]), desc="Which training pipeline use."), "encoder": { @@ -859,12 +776,10 @@ def __init__(self, type, vals: list = None, desc: str = None): desc="Whether to consider the loss of destination nodes when computing the node-level scores (maximum loss of a node).", ), "use_kmeans": Arg( - bool, - desc="Whether to cluster nodes after thresholding as done in Orthrus", + bool, desc="Whether to cluster nodes after thresholding as done in Orthrus" ), "kmeans_top_K": Arg( - int, - desc="Number of top-score nodes selected before clustering.", + int, desc="Number of top-score nodes selected before clustering." ), }, "tw_evaluation": { @@ -920,16 +835,7 @@ def __init__(self, type, vals: list = None, desc: str = None): ), "depimpact": { "used_method": Arg( - str, - vals=OR( - [ - "component", - "shortest_path", - "1-hop", - "2-hop", - "3-hop", - ] - ), + str, vals=OR(["component", "shortest_path", "1-hop", "2-hop", "3-hop"]) ), "score_method": Arg(str, vals=OR(["degree", "recon_loss", "degree_recon"])), "workers": Arg(int), From 0a48403808e5713993d9a418ceaf42e30f637bf3 Mon Sep 17 00:00:00 2001 From: tristan Date: Wed, 22 Oct 2025 20:37:10 -0700 Subject: [PATCH 23/33] Revert "Removal of separate calls to set_seed for each task. This reverts commit 6c211fbd358501a2deedb877a6291bd6b5e27284. --- config/flash.yml | 2 ++ config/kairos.yml | 1 + config/magic.yml | 1 + config/nodlink.yml | 2 ++ config/orthrus.yml | 2 ++ config/rcaid.yml | 2 ++ config/threatrace.yml | 1 + config/tuned_components/tuned_alacarte.yml | 1 + config/tuned_components/tuned_doc2vec.yml | 1 + config/tuned_components/tuned_fasttext.yml | 1 + config/tuned_components/tuned_flash.yml | 1 + config/tuned_components/tuned_word2vec.yml | 1 + docs/scripts/args/args_detection.md | 1 + docs/scripts/args/args_featurization.md | 1 + pidsmaker/config/config.py | 2 ++ pidsmaker/config/pipeline.py | 1 - pidsmaker/detection/graph_preprocessing.py | 3 +- .../training_methods/inference_loop.py | 3 ++ .../training_methods/training_loop.py | 4 ++- pidsmaker/featurization/feat_training.py | 4 +++ .../feat_training_alacarte.py | 36 +++++++++++++------ .../feat_training_doc2vec.py | 3 +- .../feat_training_fasttext.py | 4 ++- .../feat_training_trw.py | 4 ++- .../feat_training_word2vec.py | 4 ++- pidsmaker/preprocessing/transformation.py | 2 ++ pidsmaker/utils/utils.py | 14 +++++--- scripts/run.sh | 2 +- 28 files changed, 80 insertions(+), 24 deletions(-) diff --git a/config/flash.yml b/config/flash.yml index 303fa479..42c8e340 100644 --- a/config/flash.yml +++ b/config/flash.yml @@ -20,6 +20,7 @@ featurization: epochs: 10 # 300 training_split: train used_method: flash + use_seed: True flash: min_count: 1 workers: 15 @@ -40,6 +41,7 @@ detection: used_method: none gnn_training: used_method: default + use_seed: True deterministic: False num_epochs: 12 patience: 3 diff --git a/config/kairos.yml b/config/kairos.yml index e1131495..e21386ea 100644 --- a/config/kairos.yml +++ b/config/kairos.yml @@ -45,6 +45,7 @@ detection: used_method: none gnn_training: used_method: default + use_seed: True deterministic: False num_epochs: 12 patience: 3 diff --git a/config/magic.yml b/config/magic.yml index 6b3fc6c2..d857f350 100644 --- a/config/magic.yml +++ b/config/magic.yml @@ -34,6 +34,7 @@ detection: used_method: none gnn_training: used_method: default + use_seed: True deterministic: False num_epochs: 12 patience: 3 diff --git a/config/nodlink.yml b/config/nodlink.yml index 52cacd9d..046ee648 100644 --- a/config/nodlink.yml +++ 
b/config/nodlink.yml @@ -18,6 +18,7 @@ featurization: feat_training: epochs: 100 emb_dim: 256 + use_seed: True training_split: train used_method: fasttext fasttext: @@ -44,6 +45,7 @@ detection: used_method: none gnn_training: used_method: default + use_seed: True deterministic: False num_epochs: 12 patience: 3 diff --git a/config/orthrus.yml b/config/orthrus.yml index 3b3d045a..22506f89 100644 --- a/config/orthrus.yml +++ b/config/orthrus.yml @@ -21,6 +21,7 @@ featurization: emb_dim: 128 epochs: 50 training_split: all + use_seed: True used_method: word2vec word2vec: alpha: 0.025 @@ -57,6 +58,7 @@ detection: used_method: none gnn_training: used_method: default + use_seed: True deterministic: False num_epochs: 12 patience: 3 diff --git a/config/rcaid.yml b/config/rcaid.yml index dff311d1..aff722c0 100644 --- a/config/rcaid.yml +++ b/config/rcaid.yml @@ -20,6 +20,7 @@ featurization: feat_training: epochs: 5 emb_dim: 128 + use_seed: True training_split: all used_method: doc2vec doc2vec: @@ -42,6 +43,7 @@ detection: used_method: none gnn_training: used_method: default + use_seed: True deterministic: False num_epochs: 12 patience: 3 diff --git a/config/threatrace.yml b/config/threatrace.yml index 809267ab..6d064fe0 100644 --- a/config/threatrace.yml +++ b/config/threatrace.yml @@ -34,6 +34,7 @@ detection: used_method: none gnn_training: used_method: default + use_seed: True deterministic: False num_epochs: 12 patience: 3 diff --git a/config/tuned_components/tuned_alacarte.yml b/config/tuned_components/tuned_alacarte.yml index 2048c55a..958e1002 100644 --- a/config/tuned_components/tuned_alacarte.yml +++ b/config/tuned_components/tuned_alacarte.yml @@ -2,6 +2,7 @@ featurization: feat_training: epochs: 10 emb_dim: 128 + use_seed: True training_split: train used_method: alacarte alacarte: diff --git a/config/tuned_components/tuned_doc2vec.yml b/config/tuned_components/tuned_doc2vec.yml index 93d4d8dd..4824ddec 100644 --- a/config/tuned_components/tuned_doc2vec.yml +++ b/config/tuned_components/tuned_doc2vec.yml @@ -2,6 +2,7 @@ featurization: feat_training: epochs: 10 emb_dim: 128 + use_seed: True training_split: train used_method: doc2vec doc2vec: diff --git a/config/tuned_components/tuned_fasttext.yml b/config/tuned_components/tuned_fasttext.yml index 0e2a1b32..feb351fe 100644 --- a/config/tuned_components/tuned_fasttext.yml +++ b/config/tuned_components/tuned_fasttext.yml @@ -2,6 +2,7 @@ featurization: feat_training: epochs: 10 emb_dim: 256 + use_seed: True training_split: train used_method: fasttext fasttext: diff --git a/config/tuned_components/tuned_flash.yml b/config/tuned_components/tuned_flash.yml index 7c058644..01d1b57e 100644 --- a/config/tuned_components/tuned_flash.yml +++ b/config/tuned_components/tuned_flash.yml @@ -4,6 +4,7 @@ featurization: epochs: 10 training_split: train used_method: flash + use_seed: True flash: min_count: 1 workers: 10 diff --git a/config/tuned_components/tuned_word2vec.yml b/config/tuned_components/tuned_word2vec.yml index b8bb8696..5f38ae1f 100644 --- a/config/tuned_components/tuned_word2vec.yml +++ b/config/tuned_components/tuned_word2vec.yml @@ -3,6 +3,7 @@ featurization: epochs: 10 emb_dim: 128 training_split: train + use_seed: True used_method: word2vec word2vec: alpha: 0.025 diff --git a/docs/scripts/args/args_detection.md b/docs/scripts/args/args_detection.md index 676ba1cf..2f888bd0 100644 --- a/docs/scripts/args/args_detection.md +++ b/docs/scripts/args/args_detection.md @@ -45,6 +45,7 @@
      • gnn_training
+        • use_seed: bool
        • deterministic: bool (19)
        • num_epochs: int
        • patience: int
diff --git a/docs/scripts/args/args_featurization.md b/docs/scripts/args/args_featurization.md index 5bd8a81c..c404f12a 100644 --- a/docs/scripts/args/args_featurization.md +++ b/docs/scripts/args/args_featurization.md @@ -5,6 +5,7 @@
          • emb_dim: int (1)
          • epochs: int (2)
+          • use_seed: bool
          • training_split: str (3)
          • multi_dataset_training: bool (4)
          • used_method: str (5)
          • diff --git a/pidsmaker/config/config.py b/pidsmaker/config/config.py index 8ad8a567..b80d3f85 100644 --- a/pidsmaker/config/config.py +++ b/pidsmaker/config/config.py @@ -580,6 +580,7 @@ def __init__(self, type, vals: list = None, desc: str = None): "epochs": Arg( int, desc="Epochs to train the embedding method. Arg not used by some methods." ), + "use_seed": Arg(bool), "training_split": Arg( str, vals=OR(["train", "all"]), @@ -705,6 +706,7 @@ def __init__(self, type, vals: list = None, desc: str = None): }, }, "gnn_training": { + "use_seed": Arg(bool), "deterministic": Arg( bool, desc="Whether to force PyTorch to use deterministic algorithms." ), diff --git a/pidsmaker/config/pipeline.py b/pidsmaker/config/pipeline.py index a4ce1e4f..86e74bc6 100644 --- a/pidsmaker/config/pipeline.py +++ b/pidsmaker/config/pipeline.py @@ -146,7 +146,6 @@ def get_runtime_required_args(return_unknown_args=False, args=None): "--database_port", default="5432", help="The port number for Postgres (default: 5432)" ) parser.add_argument("--sweep_id", default="", help="ID of a wandb sweep for multi-agent runs") - parser.add_argument("--seed", type=int, default=0, help="Random seed for reproducibility") parser.add_argument( "--artifact_dir", default="./artifacts/", help="Destination folder for generated files" ) diff --git a/pidsmaker/detection/graph_preprocessing.py b/pidsmaker/detection/graph_preprocessing.py index cc31954f..26ba214a 100644 --- a/pidsmaker/detection/graph_preprocessing.py +++ b/pidsmaker/detection/graph_preprocessing.py @@ -3,7 +3,7 @@ import torch from pidsmaker.utils.data_utils import load_all_datasets -from pidsmaker.utils.utils import get_device, log, log_start +from pidsmaker.utils.utils import get_device, log, log_start, set_seed def get_preprocessed_graphs(cfg): @@ -22,6 +22,7 @@ def get_preprocessed_graphs(cfg): def main(cfg): + set_seed(cfg) log_start(__file__) if cfg.detection.graph_preprocessing.save_on_disk: diff --git a/pidsmaker/detection/training_methods/inference_loop.py b/pidsmaker/detection/training_methods/inference_loop.py index 1a0ece38..a84328ee 100644 --- a/pidsmaker/detection/training_methods/inference_loop.py +++ b/pidsmaker/detection/training_methods/inference_loop.py @@ -15,6 +15,7 @@ log, log_tqdm, ns_time_to_datetime_US, + set_seed, ) @@ -258,6 +259,8 @@ def test_node_level( def main(cfg, model, val_data, test_data, epoch, split, logging=True): + set_seed(cfg) + if split == "all": splits = [(val_data, "val"), (test_data, "test")] elif split == "val": diff --git a/pidsmaker/detection/training_methods/training_loop.py b/pidsmaker/detection/training_methods/training_loop.py index 8464e1b8..06129ea5 100644 --- a/pidsmaker/detection/training_methods/training_loop.py +++ b/pidsmaker/detection/training_methods/training_loop.py @@ -12,12 +12,14 @@ optimizer_factory, optimizer_few_shot_factory, ) -from pidsmaker.utils.utils import get_device, log, log_start, log_tqdm +from pidsmaker.utils.utils import get_device, log, log_start, log_tqdm, set_seed from . 
import inference_loop def main(cfg): + set_seed(cfg) + log_start(__file__) device = get_device(cfg) use_cuda = device == torch.device("cuda") diff --git a/pidsmaker/featurization/feat_training.py b/pidsmaker/featurization/feat_training.py index deb145ab..c10374b8 100644 --- a/pidsmaker/featurization/feat_training.py +++ b/pidsmaker/featurization/feat_training.py @@ -1,3 +1,5 @@ +from pidsmaker.utils.utils import set_seed + from .feat_training_methods import ( build_trw, feat_training_alacarte, @@ -11,6 +13,8 @@ def main(cfg): + set_seed(cfg) + method = cfg.featurization.feat_training.used_method.strip() if method == "alacarte": build_random_walks.main(cfg) diff --git a/pidsmaker/featurization/feat_training_methods/feat_training_alacarte.py b/pidsmaker/featurization/feat_training_methods/feat_training_alacarte.py index b427b729..ad8b4d69 100644 --- a/pidsmaker/featurization/feat_training_methods/feat_training_alacarte.py +++ b/pidsmaker/featurization/feat_training_methods/feat_training_alacarte.py @@ -444,6 +444,8 @@ def feat_training_for_one_split( num_workers = cfg.featurization.feat_training.alacarte.num_workers compute_loss = cfg.featurization.feat_training.alacarte.compute_loss add_paths = cfg.featurization.feat_training.alacarte.add_paths + use_seed = cfg.featurization.feat_training.use_seed + SEED = 0 log_dir = out_dir @@ -483,17 +485,29 @@ def feat_training_for_one_split( # Training using Word2Vec if needed # ===-----------------------------------------------------------------------=== if model_input is None: - model = Word2Vec( - paths, - vector_size=emb_dim, - window=window_size, - min_count=min_count, - sg=use_skip_gram, - workers=num_workers, - epochs=epochs, - compute_loss=compute_loss, - seed=cfg.seed, - ) + if use_seed: + model = Word2Vec( + paths, + vector_size=emb_dim, + window=window_size, + min_count=min_count, + sg=use_skip_gram, + workers=num_workers, + epochs=epochs, + compute_loss=compute_loss, + seed=SEED, + ) + else: + model = Word2Vec( + paths, + vector_size=emb_dim, + window=window_size, + min_count=min_count, + sg=use_skip_gram, + workers=num_workers, + epochs=epochs, + compute_loss=compute_loss, + ) else: log("Loading existing model from: {}".format(model_input)) model = Word2Vec.load(model_input) diff --git a/pidsmaker/featurization/feat_training_methods/feat_training_doc2vec.py b/pidsmaker/featurization/feat_training_methods/feat_training_doc2vec.py index 3fb67883..2d0b35c3 100644 --- a/pidsmaker/featurization/feat_training_methods/feat_training_doc2vec.py +++ b/pidsmaker/featurization/feat_training_methods/feat_training_doc2vec.py @@ -18,8 +18,9 @@ def doc2vec( alpha: float, dm: int = 1, ): + SEED = 0 model = Doc2Vec( - vector_size=emb_dim, alpha=alpha, min_count=1, dm=dm, compute_loss=True, seed=cfg.seed + vector_size=emb_dim, alpha=alpha, min_count=1, dm=dm, compute_loss=True, seed=SEED ) model.build_vocab(tagged_data) diff --git a/pidsmaker/featurization/feat_training_methods/feat_training_fasttext.py b/pidsmaker/featurization/feat_training_methods/feat_training_fasttext.py index b368bd98..07575efa 100644 --- a/pidsmaker/featurization/feat_training_methods/feat_training_fasttext.py +++ b/pidsmaker/featurization/feat_training_methods/feat_training_fasttext.py @@ -23,6 +23,8 @@ def train_fasttext(corpus, cfg): min_count = cfg.featurization.feat_training.fasttext.min_count num_workers = cfg.featurization.feat_training.fasttext.num_workers negative = cfg.featurization.feat_training.fasttext.negative + use_seed = cfg.featurization.feat_training.use_seed + SEED 
= 0 use_pretrained_fb_model = cfg.featurization.feat_training.fasttext.use_pretrained_fb_model @@ -42,7 +44,7 @@ def train_fasttext(corpus, cfg): alpha=alpha, window=window_size, negative=negative, - seed=cfg.seed, + seed=SEED, ) model.build_vocab(corpus, update=use_pretrained_fb_model) diff --git a/pidsmaker/featurization/feat_training_methods/feat_training_trw.py b/pidsmaker/featurization/feat_training_methods/feat_training_trw.py index e5a45f6f..d8ba7dc0 100644 --- a/pidsmaker/featurization/feat_training_methods/feat_training_trw.py +++ b/pidsmaker/featurization/feat_training_methods/feat_training_trw.py @@ -41,6 +41,8 @@ def train_word2vec(corpus, model_save_path, cfg): epochs = cfg.featurization.feat_training.epochs compute_loss = cfg.featurization.feat_training.temporal_rw.compute_loss negative = cfg.featurization.feat_training.temporal_rw.negative + use_seed = cfg.featurization.feat_training.use_seed + SEED = 0 model = Word2Vec( corpus, @@ -52,7 +54,7 @@ def train_word2vec(corpus, model_save_path, cfg): epochs=1, compute_loss=compute_loss, negative=negative, - seed=cfg.seed, + seed=SEED, ) epoch_loss = model.get_latest_training_loss() diff --git a/pidsmaker/featurization/feat_training_methods/feat_training_word2vec.py b/pidsmaker/featurization/feat_training_methods/feat_training_word2vec.py index 08c7a751..42216b79 100644 --- a/pidsmaker/featurization/feat_training_methods/feat_training_word2vec.py +++ b/pidsmaker/featurization/feat_training_methods/feat_training_word2vec.py @@ -16,6 +16,8 @@ def train_word2vec(corpus, cfg, model_save_path): epochs = cfg.featurization.feat_training.epochs compute_loss = cfg.featurization.feat_training.word2vec.compute_loss negative = cfg.featurization.feat_training.word2vec.negative + use_seed = cfg.featurization.feat_training.use_seed + SEED = 0 model = Word2Vec( corpus, @@ -28,7 +30,7 @@ def train_word2vec(corpus, cfg, model_save_path): epochs=1, compute_loss=compute_loss, negative=negative, - seed=cfg.seed, + seed=SEED, ) epoch_loss = model.get_latest_training_loss() diff --git a/pidsmaker/preprocessing/transformation.py b/pidsmaker/preprocessing/transformation.py index ccf2bb7d..a6d7d43d 100644 --- a/pidsmaker/preprocessing/transformation.py +++ b/pidsmaker/preprocessing/transformation.py @@ -16,6 +16,7 @@ load_graphs_for_days, log_start, log_tqdm, + set_seed, ) @@ -109,6 +110,7 @@ def main_from_config(cfg): def main(cfg): + set_seed(cfg) log_start(__file__) multi_datasets = get_multi_datasets(cfg) diff --git a/pidsmaker/utils/utils.py b/pidsmaker/utils/utils.py index ace84aad..c5f1ec3e 100644 --- a/pidsmaker/utils/utils.py +++ b/pidsmaker/utils/utils.py @@ -653,11 +653,15 @@ def log_helper(label, dataset): def set_seed(cfg): - seed = cfg.seed - random.seed(seed) - np.random.seed(seed) - torch.manual_seed(seed) - torch.backends.cudnn.benchmark = False + if cfg.detection.gnn_training.use_seed: + seed = 0 + random.seed(seed) + np.random.seed(seed) + + torch.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + torch.backends.cudnn.deterministic = True + torch.backends.cudnn.benchmark = False if cfg.detection.gnn_training.deterministic: torch.use_deterministic_algorithms(True, warn_only=True) diff --git a/scripts/run.sh b/scripts/run.sh index 4e3be119..a9a8c5af 100755 --- a/scripts/run.sh +++ b/scripts/run.sh @@ -15,4 +15,4 @@ for arg in "$@"; do done # Execute the Python script with the passed arguments -PYTHONHASHSEED=0 nohup python ../pidsmaker/main.py $args --wandb & \ No newline at end of file +PYTHONHASHSEED=0 nohup python 
../pidsmaker/main.py $args --wandb & From 9f7d8a6c5a91df5e00cef99418f6e4b98c41d86b Mon Sep 17 00:00:00 2001 From: tristan Date: Wed, 22 Oct 2025 20:50:17 -0700 Subject: [PATCH 24/33] revert ValueError or functional tests fail --- pidsmaker/config/pipeline.py | 3 +-- .../detection/evaluation_methods/evaluation_utils.py | 8 ++++++-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/pidsmaker/config/pipeline.py b/pidsmaker/config/pipeline.py index 86e74bc6..1fd42d71 100644 --- a/pidsmaker/config/pipeline.py +++ b/pidsmaker/config/pipeline.py @@ -456,8 +456,7 @@ def get_yml_cfg(args): # Inits with default configurations cfg = get_default_cfg(args) - # Set seed for reproducibility - cfg.seed = args.seed + # Checks that all configurations are valid and merge yml file to cfg yml_file = get_yml_file(args.model) merge_cfg_and_check_syntax(cfg, yml_file) diff --git a/pidsmaker/detection/evaluation_methods/evaluation_utils.py b/pidsmaker/detection/evaluation_methods/evaluation_utils.py index 947a0f86..c9683d35 100644 --- a/pidsmaker/detection/evaluation_methods/evaluation_utils.py +++ b/pidsmaker/detection/evaluation_methods/evaluation_utils.py @@ -36,8 +36,12 @@ def classifier_evaluation(y_test, y_test_pred, scores): - if not sum(y_test) > 0: - raise ValueError("Cannot evaluate: no positive labels in test set") + labels_exist = sum(y_test) > 0 + if labels_exist: + tn, fp, fn, tp = confusion_matrix(y_test, y_test_pred).ravel() + else: + log("WARNING: Computing confusion matrix failed.") + tn, fp, fn, tp = 1, 1, 1, 1 # only to not break tests tn, fp, fn, tp = confusion_matrix(y_test, y_test_pred).ravel() eps = 1e-12 From 7c825fe0731411ba7e6c1587c5cebe4821a5e83c Mon Sep 17 00:00:00 2001 From: tristan Date: Wed, 22 Oct 2025 20:52:33 -0700 Subject: [PATCH 25/33] minor revert to still support logging args when using sweeps --- pidsmaker/main.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pidsmaker/main.py b/pidsmaker/main.py index 4e961710..1d5e9273 100644 --- a/pidsmaker/main.py +++ b/pidsmaker/main.py @@ -276,19 +276,19 @@ def run_pipeline_from_sweep(cfg): ) tags = args.tags.split(",") if args.tags != "" else [args.model] - cfg = get_yml_cfg(args) - wandb.init( mode=("online" if (args.wandb and args.tuning_mode == "none") else "disabled"), project=args.project, name=exp_name, tags=tags, - config=clean_cfg_for_log(cfg), ) if len(unknown_args) > 0: raise argparse.ArgumentTypeError(f"Unknown args {unknown_args}") + cfg = get_yml_cfg(args) + wandb.config.update(clean_cfg_for_log(cfg)) + main(cfg, project=args.project, exp=exp_name, sweep_id=args.sweep_id) wandb.finish() From b42bc608763d5c30e6fc3b616d8c3e97114d9161 Mon Sep 17 00:00:00 2001 From: tristan Date: Thu, 23 Oct 2025 04:16:29 +0000 Subject: [PATCH 26/33] fix tests --- pidsmaker/main.py | 11 ----------- pidsmaker/utils/utils.py | 3 +-- tests/test_framework.py | 3 +-- 3 files changed, 2 insertions(+), 15 deletions(-) diff --git a/pidsmaker/main.py b/pidsmaker/main.py index 1d5e9273..1ebcf4e5 100644 --- a/pidsmaker/main.py +++ b/pidsmaker/main.py @@ -5,9 +5,7 @@ import time from collections import defaultdict -import networkx as nx import torch -import torch_geometric import wandb from pidsmaker.config import ( @@ -45,17 +43,8 @@ from pidsmaker.triage import ( tracing, ) -from pidsmaker.utils.data_utils import CollatableTemporalData from pidsmaker.utils.utils import log, remove_underscore_keys, set_seed -torch.serialization.add_safe_globals( - [ - nx.classes.multidigraph.MultiDiGraph, - CollatableTemporalData, - 
torch_geometric.data.storage.GlobalStorage, - ] -) - def get_task_to_module(cfg): return { diff --git a/pidsmaker/utils/utils.py b/pidsmaker/utils/utils.py index c5f1ec3e..529fb546 100644 --- a/pidsmaker/utils/utils.py +++ b/pidsmaker/utils/utils.py @@ -18,8 +18,7 @@ from nltk.tokenize import word_tokenize from tqdm import tqdm -nltk.download("punkt_tab", quiet=True, download_dir="./nltk_data") -nltk.data.path.append("./nltk_data") +nltk.download("punkt", quiet=True) from pidsmaker.config import update_cfg_for_multi_dataset diff --git a/tests/test_framework.py b/tests/test_framework.py index 9a9e8a78..167af7e1 100644 --- a/tests/test_framework.py +++ b/tests/test_framework.py @@ -7,13 +7,12 @@ from pidsmaker import main from pidsmaker.config import ( - DEFAULT_ROOT_ARTIFACT_DIR, ENCODERS_CFG, get_runtime_required_args, get_yml_cfg, ) -TESTS_ARTIFACT_DIR = os.path.join(DEFAULT_ROOT_ARTIFACT_DIR, "tests/") +TESTS_ARTIFACT_DIR = os.path.join("./artifacts/", "tests/") def prepare_cfg( From d806a0a364ac78e66e29cae9330219cd9d232bf0 Mon Sep 17 00:00:00 2001 From: tristan Date: Thu, 23 Oct 2025 04:21:11 +0000 Subject: [PATCH 27/33] revert artifact_dir to /home/artifacts to map to Docker volume => ${ARTIFACTS_DIR:-/artifacts}:/home/artifacts --- pidsmaker/config/pipeline.py | 2 +- tests/test_framework.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pidsmaker/config/pipeline.py b/pidsmaker/config/pipeline.py index 1fd42d71..c17785ab 100644 --- a/pidsmaker/config/pipeline.py +++ b/pidsmaker/config/pipeline.py @@ -147,7 +147,7 @@ def get_runtime_required_args(return_unknown_args=False, args=None): ) parser.add_argument("--sweep_id", default="", help="ID of a wandb sweep for multi-agent runs") parser.add_argument( - "--artifact_dir", default="./artifacts/", help="Destination folder for generated files" + "--artifact_dir", default="/home/artifacts/", help="Destination folder for generated files" ) parser.add_argument( "--test_mode", diff --git a/tests/test_framework.py b/tests/test_framework.py index 167af7e1..e81645a1 100644 --- a/tests/test_framework.py +++ b/tests/test_framework.py @@ -12,7 +12,7 @@ get_yml_cfg, ) -TESTS_ARTIFACT_DIR = os.path.join("./artifacts/", "tests/") +TESTS_ARTIFACT_DIR = os.path.join("/home/artifacts/", "tests/") def prepare_cfg( From 156ac9c172b8edb1ef1d205baddc265045bb6e76 Mon Sep 17 00:00:00 2001 From: tristan Date: Thu, 23 Oct 2025 04:23:31 +0000 Subject: [PATCH 28/33] hotfix in the Docker install (not related directly to this PR but need to be merged anyway) --- Dockerfile | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/Dockerfile b/Dockerfile index 226bf401..82af74ea 100644 --- a/Dockerfile +++ b/Dockerfile @@ -43,8 +43,7 @@ RUN conda install -y psycopg2 tqdm && \ wandb==0.16.6 chardet==5.2.0 nltk==3.8.1 igraph==0.11.5 \ cairocffi==1.7.0 wget==3.2 -RUN conda install -y pytorch==1.13.1 torchvision==0.14.1 torchaudio==0.13.1 \ - pytorch-cuda=11.7 -c pytorch -c nvidia +RUN pip install torch==1.13.1+cu117 torchvision==0.14.1+cu117 torchaudio==0.13.1 --extra-index-url https://download.pytorch.org/whl/cu117 RUN pip install torch_geometric==2.5.3 --no-cache-dir && \ pip install pyg_lib==0.2.0 torch_scatter==2.1.1 torch_sparse==0.6.17 \ From b6ee3f019b54d6bcfd0c8fbe7fca65a84ec511f5 Mon Sep 17 00:00:00 2001 From: tristan Date: Thu, 23 Oct 2025 04:34:11 +0000 Subject: [PATCH 29/33] fix tests finally --- pidsmaker/detection/evaluation_methods/evaluation_utils.py | 1 - 1 file changed, 1 deletion(-) diff --git 
a/pidsmaker/detection/evaluation_methods/evaluation_utils.py b/pidsmaker/detection/evaluation_methods/evaluation_utils.py index c9683d35..a8e7cc86 100644 --- a/pidsmaker/detection/evaluation_methods/evaluation_utils.py +++ b/pidsmaker/detection/evaluation_methods/evaluation_utils.py @@ -42,7 +42,6 @@ def classifier_evaluation(y_test, y_test_pred, scores): else: log("WARNING: Computing confusion matrix failed.") tn, fp, fn, tp = 1, 1, 1, 1 # only to not break tests - tn, fp, fn, tp = confusion_matrix(y_test, y_test_pred).ravel() eps = 1e-12 fpr = fp / (fp + tn + eps) From 3f685ac3674e441da5a6372ec11f237807d5dac4 Mon Sep 17 00:00:00 2001 From: Lorenzo Guerra Date: Thu, 23 Oct 2025 15:46:35 +0200 Subject: [PATCH 30/33] Simplify environment.yaml and make it perfectly match the docker environment --- scripts/environment.yaml | 225 ++++++++------------------------------- 1 file changed, 43 insertions(+), 182 deletions(-) diff --git a/scripts/environment.yaml b/scripts/environment.yaml index 5563e2ce..83acfd12 100644 --- a/scripts/environment.yaml +++ b/scripts/environment.yaml @@ -1,185 +1,46 @@ name: pids channels: - - conda-forge +- conda-forge dependencies: - - _libgcc_mutex=0.1=conda_forge - - _openmp_mutex=4.5=3_kmp_llvm - - bzip2=1.0.8=h4bc722e_7 - - ca-certificates=2025.6.15=hbd8a1cb_0 - - certifi=2025.6.15=pyhd8ed1ab_0 - - colorama=0.4.6=pyhd8ed1ab_1 - - cyrus-sasl=2.1.28=hd9c7081_0 - - icu=75.1=he02047a_0 - - keyutils=1.6.1=h166bdaf_0 - - krb5=1.21.3=h659f571_0 - - lcms2=2.17=h717163a_0 - - ld_impl_linux-64=2.43=h1423503_5 - - lerc=4.0.0=h0aef613_1 - - libdeflate=1.24=h86f0d12_0 - - libedit=3.1.20250104=pl5321h7949ede_0 - - libexpat=2.7.0=h5888daf_0 - - libffi=3.4.6=h2dba641_1 - - libfreetype=2.13.3=ha770c72_1 - - libfreetype6=2.13.3=h48d6fc4_1 - - libgcc=15.1.0=h767d61c_3 - - libgcc-ng=15.1.0=h69a702a_3 - - libgomp=15.1.0=h767d61c_3 - - libjpeg-turbo=3.1.0=hb9d3cd8_0 - - liblzma=5.8.1=hb9d3cd8_2 - - libnsl=2.0.1=hb9d3cd8_1 - - libntlm=1.8=hb9d3cd8_0 - - libpng=1.6.49=h943b412_0 - - libpq=17.5=h27ae623_0 - - libsqlite=3.50.1=h6cd9bfd_7 - - libstdcxx=15.1.0=h8f9b012_3 - - libstdcxx-ng=15.1.0=h4852527_3 - - libtiff=4.7.0=hf01ce69_5 - - libuuid=2.38.1=h0b41bf4_0 - - libwebp-base=1.5.0=h851e524_0 - - libxcb=1.17.0=h8a09558_0 - - libxcrypt=4.4.36=hd590300_1 - - libzlib=1.3.1=hb9d3cd8_2 - - llvm-openmp=20.1.7=h024ca30_0 - - ncurses=6.5=h2d0b736_3 - - openjpeg=2.5.3=h5fbd93e_0 - - openldap=2.6.10=he970967_0 - - openssl=3.5.0=h7b32b05_1 - - pillow=11.2.1=py39h15c0740_0 - - pip=25.1.1=pyh8b19718_0 - - psycopg2=2.9.10=py39h2bc273e_1 - - pthread-stubs=0.4=hb9d3cd8_1002 - - python=3.9.23=hc30ae73_0_cpython - - python_abi=3.9=7_cp39 - - readline=8.2=h8c095d6_2 - - tk=8.6.13=noxft_hd72426e_102 - - tqdm=4.67.1=pyhd8ed1ab_1 - - wheel=0.45.1=pyhd8ed1ab_1 - - xorg-libxau=1.0.12=hb9d3cd8_0 - - xorg-libxdmcp=1.1.5=hb9d3cd8_0 - - zstd=1.5.7=hb8e6e7a_2 - - pip: - - aiohappyeyeballs==2.6.1 - - aiohttp==3.12.13 - - aiosignal==1.3.2 - - appdirs==1.4.4 - - async-timeout==5.0.1 - - attrs==25.3.0 - - babel==2.17.0 - - backrefs==5.9 - - beautifulsoup4==4.13.4 - - cairocffi==1.7.0 - - cfgv==3.4.0 - - chardet==5.2.0 - - charset-normalizer==3.4.2 - - click==8.1.8 - - contourpy==1.3.0 - - coverage==7.9.1 - - cycler==0.12.1 - - distlib==0.3.9 - - docker-pycreds==0.4.0 - - exceptiongroup==1.3.0 - - filelock==3.18.0 - - fonttools==4.58.4 - - frozenlist==1.7.0 - - fsspec==2025.5.1 - - gdown==5.2.0 - - gensim==4.3.1 - - ghp-import==2.1.0 - - gitdb==4.0.12 - - gitpython==3.1.44 - - graphviz==0.20.1 - - h5py==3.14.0 - - 
identify==2.6.12 - - idna==3.10 - - igraph==0.11.5 - - importlib-metadata==8.7.0 - - importlib-resources==6.5.2 - - iniconfig==2.1.0 - - jinja2==3.1.6 - - joblib==1.5.1 - - kiwisolver==1.4.7 - - markdown==3.8.2 - - markupsafe==3.0.2 - - matplotlib==3.8.4 - - mergedeep==1.3.4 - - mkdocs==1.6.1 - - mkdocs-get-deps==0.2.0 - - mkdocs-glightbox==0.4.0 - - mkdocs-material==9.6.12 - - mkdocs-material-extensions==1.3.1 - - mpmath==1.3.0 - - multidict==6.5.1 - - networkx==2.8.7 - - nltk==3.8.1 - - nodeenv==1.9.1 - - numpy==1.26.4 - - nvidia-cublas-cu12==12.6.4.1 - - nvidia-cuda-cupti-cu12==12.6.80 - - nvidia-cuda-nvrtc-cu12==12.6.77 - - nvidia-cuda-runtime-cu12==12.6.77 - - nvidia-cudnn-cu12==9.5.1.17 - - nvidia-cufft-cu12==11.3.0.4 - - nvidia-cufile-cu12==1.11.1.6 - - nvidia-curand-cu12==10.3.7.77 - - nvidia-cusolver-cu12==11.7.1.2 - - nvidia-cusparse-cu12==12.5.4.2 - - nvidia-cusparselt-cu12==0.6.3 - - nvidia-nccl-cu12==2.26.2 - - nvidia-nvjitlink-cu12==12.6.85 - - nvidia-nvtx-cu12==12.6.77 - - packaging==25.0 - - paginate==0.5.7 - - pandas==2.2.2 - - pathspec==0.12.1 - - platformdirs==4.3.8 - - pluggy==1.6.0 - - pre-commit==4.2.0 - - propcache==0.3.2 - - protobuf==4.25.8 - - psutil==7.0.0 - - pyg-lib==0.4.0+pt27cu126 - - pygments==2.19.2 - - pymdown-extensions==10.16 - - pyparsing==3.2.3 - - pytest==8.3.5 - - pytest-cov==6.1.1 - - python-dateutil==2.9.0.post0 - - pytz==2024.1 - - pyyaml==6.0.2 - - pyyaml-env-tag==1.1 - - regex==2024.11.6 - - requests==2.32.4 - - scikit-learn==1.2.0 - - scipy==1.10.1 - - sentry-sdk==2.31.0 - - setproctitle==1.3.6 - - setuptools==61.0.0 - - six==1.17.0 - - smart-open==7.1.0 - - smmap==5.0.2 - - soupsieve==2.7 - - sympy==1.14.0 - - texttable==1.7.0 - - threadpoolctl==3.6.0 - - tomli==2.2.1 - - torch==2.7.1 - - torch-cluster==1.6.3+pt27cu126 - - torch-geometric==2.5.3 - - torch-scatter==2.1.2+pt27cu126 - - torch-sparse==0.6.18+pt27cu126 - - torch-spline-conv==1.2.2+pt27cu126 - - torchaudio==2.7.1 - - torchvision==0.22.1 - - triton==3.3.1 - - typing-extensions==4.14.0 - - tzdata==2025.2 - - urllib3==2.5.0 - - virtualenv==20.31.2 - - wandb==0.16.6 - - watchdog==6.0.0 - - wget==3.2 - - wrapt==1.17.2 - - xxhash==3.2.0 - - yacs==0.1.8 - - yarl==1.20.1 - - zipp==3.23.0 +- pip=25.1.1 +- python=3.9.23 +- python_abi=3.9 +- psycopg2 +- tqdm +- pip: + - --extra-index-url https://download.pytorch.org/whl/cu117 + - -f https://data.pyg.org/whl/torch-1.13.0+cu117.html + - torch==1.13.1+cu117 + - torchvision==0.14.1+cu117 + - torchaudio==0.13.1 + - scikit-learn==1.2.0 + - networkx==2.8.7 + - xxhash==3.2.0 + - graphviz==0.20.1 + - psutil + - scipy==1.10.1 + - matplotlib==3.8.4 + - wandb==0.16.6 + - chardet==5.2.0 + - nltk==3.8.1 + - igraph==0.11.5 + - cairocffi==1.7.0 + - wget==3.2 + - torch_geometric==2.5.3 + - pyg_lib==0.2.0 + - torch_scatter==2.1.1 + - torch_sparse==0.6.17 + - torch_cluster==1.6.1 + - torch_spline_conv==1.2.2 + - gensim==4.3.1 + - pytz==2024.1 + - pandas==2.2.2 + - yacs==0.1.8 + - numpy==1.26.4 + - gdown==5.2.0 + - pytest==8.3.5 + - pytest-cov==6.1.1 + - pre-commit==4.2.0 + - setuptools==61.0 + - mkdocs-material==9.6.12 + - mkdocs-glightbox==0.4.0 From 0ef0f8078b738a79bedb4f544a575f46805dc417 Mon Sep 17 00:00:00 2001 From: Lorenzo Guerra Date: Thu, 23 Oct 2025 16:27:50 +0200 Subject: [PATCH 31/33] Add support for Apptainer (new name for Singularity) as well --- .gitignore | 9 +++-- scripts/Makefile | 18 ++++++--- scripts/postgres-start.sh | 75 ++++++++++++++++++++++---------------- scripts/postgres-status.sh | 24 ++++++++---- scripts/postgres-stop.sh | 24 
++++++++---- 5 files changed, 95 insertions(+), 55 deletions(-) diff --git a/.gitignore b/.gitignore index 375cde6d..0832ad6b 100644 --- a/.gitignore +++ b/.gitignore @@ -177,9 +177,10 @@ postgres_lock/ *.def # Postgres directories -postgres_config/ -postgres_run/ -postgres_log/ +postgres_config +postgres_run +postgres_log +postgres_data # tokenizer data -nltk_data/ +nltk_data diff --git a/scripts/Makefile b/scripts/Makefile index b395519b..387a3948 100644 --- a/scripts/Makefile +++ b/scripts/Makefile @@ -1,4 +1,4 @@ -# Makefile for Singularity PostgreSQL management +# Makefile for Singularity/Apptainer PostgreSQL management .PHONY: up down status load-dumps full-setup logs clean help @@ -25,11 +25,18 @@ reset: clean up app-build: @echo "Building PIDSMaker container..." - @singularity build pidsmaker.sif pidsmaker.def || echo "Build failed - check if you have fakeroot access" + @if command -v apptainer &> /dev/null; then \ + apptainer build pidsmaker.sif pidsmaker.def || echo "Build failed - check if you have fakeroot access"; \ + elif command -v singularity &> /dev/null; then \ + singularity build pidsmaker.sif pidsmaker.def || echo "Build failed - check if you have fakeroot access"; \ + else \ + echo "ERROR: Neither apptainer nor singularity found"; exit 1; \ + fi app-run: up @echo "Running PIDSMaker application..." - @singularity run --nv \ + @CONTAINER_CMD=$$(command -v apptainer &> /dev/null && echo "apptainer" || echo "singularity"); \ + $$CONTAINER_CMD run --nv \ --env DB_HOST=localhost \ --env DOCKER_PORT=5432 \ --env DB_USER=postgres \ @@ -39,9 +46,10 @@ app-run: up load-dumps: up @echo "Loading database dumps from inside container..." - @if [ -f "./load_dumps.sh" ]; then \ + @CONTAINER_CMD=$$(command -v apptainer &> /dev/null && echo "apptainer" || echo "singularity"); \ + if [ -f "./load_dumps.sh" ]; then \ echo "Found load_dumps.sh, executing inside container..."; \ - singularity exec instance://postgres_instance /scripts/load_dumps.sh; \ + $$CONTAINER_CMD exec instance://postgres_instance /scripts/load_dumps.sh; \ else \ echo "Error: ./load_dumps.sh not found"; \ exit 1; \ diff --git a/scripts/postgres-start.sh b/scripts/postgres-start.sh index 7d00cd8e..03d75cbf 100755 --- a/scripts/postgres-start.sh +++ b/scripts/postgres-start.sh @@ -1,9 +1,26 @@ #!/bin/bash -# PostgreSQL startup script for Singularity +# PostgreSQL startup script for Singularity/Apptainer set -e +# Detect which container runtime is available +if command -v apptainer &> /dev/null; then + CONTAINER_CMD="apptainer" + export APPTAINER_TMPDIR="${TMPDIR:-/tmp}/apptainer-${USER}" + export APPTAINER_CACHEDIR="${HOME}/.apptainer/cache" + export APPTAINER_SESSIONDIR="${TMPDIR:-/tmp}/apptainer-sessions-${USER}" + mkdir -p "$APPTAINER_TMPDIR" "$APPTAINER_CACHEDIR" "$APPTAINER_SESSIONDIR" +elif command -v singularity &> /dev/null; then + CONTAINER_CMD="singularity" + export SINGULARITY_TMPDIR="${TMPDIR:-/tmp}/singularity-${USER}" + export SINGULARITY_CACHEDIR="${HOME}/.singularity/cache" + mkdir -p "$SINGULARITY_TMPDIR" "$SINGULARITY_CACHEDIR" +else + echo "ERROR: Neither apptainer nor singularity found in PATH" + exit 1 +fi + # Configuration POSTGRES_IMAGE="postgres.sif" POSTGRES_INSTANCE="postgres_instance" @@ -17,12 +34,12 @@ GREEN='\033[0;32m' YELLOW='\033[1;33m' NC='\033[0m' # No Color -echo -e "${YELLOW}Starting PostgreSQL with Singularity...${NC}" +echo -e "${YELLOW}Starting PostgreSQL with ${CONTAINER_CMD}...${NC}" # Check if postgres.sif exists if [ ! 
-f "$POSTGRES_IMAGE" ]; then echo -e "${YELLOW}PostgreSQL image not found. Pulling from Docker Hub...${NC}" - singularity pull $POSTGRES_IMAGE docker://postgres:17 + $CONTAINER_CMD pull $POSTGRES_IMAGE docker://postgres:17 fi # Create necessary directories @@ -37,48 +54,42 @@ if [ ! -d "$INPUT_DIR" ]; then fi # Check if instance already exists -if singularity instance list | grep -q "$POSTGRES_INSTANCE"; then +if $CONTAINER_CMD instance list | grep -q "$POSTGRES_INSTANCE"; then echo -e "${YELLOW}PostgreSQL instance $POSTGRES_INSTANCE already exists${NC}" # Check if it's responsive - if singularity exec instance://$POSTGRES_INSTANCE pg_isready -h localhost -U postgres > /dev/null 2>&1; then + if $CONTAINER_CMD exec instance://$POSTGRES_INSTANCE pg_isready -h localhost -U postgres > /dev/null 2>&1; then echo -e "${GREEN}PostgreSQL instance is already running and responsive${NC}" exit 0 else echo -e "${YELLOW}Instance exists but not responsive, stopping it...${NC}" - singularity instance stop $POSTGRES_INSTANCE + $CONTAINER_CMD instance stop $POSTGRES_INSTANCE sleep 2 fi fi # Check if any other postgres processes are running -if pgrep -f "singularity.*postgres" > /dev/null; then +if pgrep -f "${CONTAINER_CMD}.*postgres" > /dev/null; then echo -e "${YELLOW}Other PostgreSQL processes detected, cleaning up...${NC}" - pkill -f "singularity.*postgres" || true + pkill -f "${CONTAINER_CMD}.*postgres" || true sleep 2 fi -# Set environment variables -export SINGULARITYENV_POSTGRES_PASSWORD=postgres -export SINGULARITYENV_POSTGRES_USER=postgres -export SINGULARITYENV_POSTGRES_DB=postgres +# Set environment variables (works for both singularity and apptainer) +if [ "$CONTAINER_CMD" = "apptainer" ]; then + export APPTAINERENV_POSTGRES_PASSWORD=postgres + export APPTAINERENV_POSTGRES_USER=postgres + export APPTAINERENV_POSTGRES_DB=postgres +else + export SINGULARITYENV_POSTGRES_PASSWORD=postgres + export SINGULARITYENV_POSTGRES_USER=postgres + export SINGULARITYENV_POSTGRES_DB=postgres +fi # Prepare bind mounts - only bind if files/directories exist BIND_MOUNTS="--bind $DATA_DIR:/var/lib/postgresql/data" BIND_MOUNTS="$BIND_MOUNTS --bind $RUN_DIR:/var/run/postgresql" BIND_MOUNTS="$BIND_MOUNTS --bind $LOG_DIR:/var/log" - -# Add optional bind mounts if they exist -if [ -f "./postgres/init-create-empty-databases.sh" ]; then - BIND_MOUNTS="$BIND_MOUNTS --bind ./postgres/init-create-empty-databases.sh:/docker-entrypoint-initdb.d/init-create-empty-databases.sh" -else - echo -e "${YELLOW}Warning: ./postgres/init-create-empty-databases.sh not found, skipping${NC}" -fi - -if [ -d "./scripts" ]; then - BIND_MOUNTS="$BIND_MOUNTS --bind ./scripts:/scripts" -else - echo -e "${YELLOW}Warning: ./scripts directory not found, skipping${NC}" -fi +BIND_MOUNTS="$BIND_MOUNTS --bind ./:/scripts" # Always bind INPUT_DIR BIND_MOUNTS="$BIND_MOUNTS --bind $INPUT_DIR:/data" @@ -91,14 +102,14 @@ fi echo -e "${YELLOW}Starting PostgreSQL instance...${NC}" echo -e "${YELLOW}Using INPUT_DIR: $INPUT_DIR${NC}" -singularity instance start $BIND_MOUNTS $POSTGRES_IMAGE $POSTGRES_INSTANCE +$CONTAINER_CMD instance start $BIND_MOUNTS $POSTGRES_IMAGE $POSTGRES_INSTANCE # Start PostgreSQL inside the instance echo -e "${YELLOW}Starting PostgreSQL server inside instance...${NC}" -singularity exec instance://$POSTGRES_INSTANCE bash -c "docker-entrypoint.sh postgres &" +$CONTAINER_CMD exec instance://$POSTGRES_INSTANCE bash -c "docker-entrypoint.sh postgres &" # Get the PID of the instance (optional, for compatibility) -INSTANCE_PID=$(pgrep -f 
"singularity.*$POSTGRES_INSTANCE" | head -1) +INSTANCE_PID=$(pgrep -f "${CONTAINER_CMD}.*$POSTGRES_INSTANCE" | head -1) if [ -n "$INSTANCE_PID" ]; then echo $INSTANCE_PID > postgres.pid fi @@ -106,9 +117,9 @@ fi # Wait for PostgreSQL to be ready echo -e "${YELLOW}Waiting for PostgreSQL to start...${NC}" for i in {1..30}; do - if singularity exec instance://$POSTGRES_INSTANCE pg_isready -h localhost -U postgres > /dev/null 2>&1; then + if $CONTAINER_CMD exec instance://$POSTGRES_INSTANCE pg_isready -h localhost -U postgres > /dev/null 2>&1; then echo -e "${GREEN}PostgreSQL is ready!${NC}" - echo -e "${GREEN}Connection: singularity exec instance://$POSTGRES_INSTANCE psql -h localhost -U postgres${NC}" + echo -e "${GREEN}Connection: $CONTAINER_CMD exec instance://$POSTGRES_INSTANCE psql -h localhost -U postgres${NC}" echo -e "${GREEN}Instance: $POSTGRES_INSTANCE${NC}" exit 0 fi @@ -117,5 +128,5 @@ for i in {1..30}; do done echo -e "${RED}PostgreSQL failed to start within 60 seconds${NC}" -singularity instance stop $POSTGRES_INSTANCE 2>/dev/null || true -exit 1 \ No newline at end of file +$CONTAINER_CMD instance stop $POSTGRES_INSTANCE 2>/dev/null || true +exit 1 diff --git a/scripts/postgres-status.sh b/scripts/postgres-status.sh index e3669616..8be6b04e 100755 --- a/scripts/postgres-status.sh +++ b/scripts/postgres-status.sh @@ -1,6 +1,16 @@ #!/bin/bash -# PostgreSQL status script for Singularity +# PostgreSQL status script for Singularity/Apptainer + +# Detect which container runtime is available +if command -v apptainer &> /dev/null; then + CONTAINER_CMD="apptainer" +elif command -v singularity &> /dev/null; then + CONTAINER_CMD="singularity" +else + echo "ERROR: Neither apptainer nor singularity found in PATH" + exit 1 +fi # Colors for output RED='\033[0;31m' @@ -31,20 +41,20 @@ if [ ! -f postgres.sif ]; then exit 1 fi -if singularity exec postgres.sif pg_isready -h localhost -U postgres > /dev/null 2>&1; then +if $CONTAINER_CMD exec postgres.sif pg_isready -h localhost -U postgres > /dev/null 2>&1; then echo -e "${GREEN}✓ PostgreSQL is accepting connections${NC}" - echo -e "${GREEN} Connection: singularity exec postgres.sif psql -h localhost -U postgres${NC}" + echo -e "${GREEN} Connection: ${CONTAINER_CMD} exec postgres.sif psql -h localhost -U postgres${NC}" POSTGRES_RUNNING=true # Show database list echo -e "${YELLOW}Databases:${NC}" - singularity exec postgres.sif psql -h localhost -U postgres -c "\l" 2>/dev/null | \ + $CONTAINER_CMD exec postgres.sif psql -h localhost -U postgres -c "\l" 2>/dev/null | \ grep -v template | grep -v "^-" | grep -v "^(" | grep -v "Name.*Owner" | \ grep -v "^\s*$" | head -10 # Show PostgreSQL version echo -e "${YELLOW}Version:${NC}" - singularity exec postgres.sif psql -h localhost -U postgres -c "SELECT version();" -t 2>/dev/null | head -1 + $CONTAINER_CMD exec postgres.sif psql -h localhost -U postgres -c "SELECT version();" -t 2>/dev/null | head -1 else echo -e "${RED}✗ PostgreSQL is not accepting connections${NC}" @@ -53,10 +63,10 @@ fi # Method 3: Check process list as fallback if [ "$POSTGRES_RUNNING" = false ]; then # Check for any postgres-related processes with more flexible patterns - if pgrep -f "postgres" > /dev/null || pgrep -f "singularity.*postgres" > /dev/null; then + if pgrep -f "postgres" > /dev/null || pgrep -f "${CONTAINER_CMD}.*postgres" > /dev/null; then echo -e "${YELLOW}! 
Found postgres-related process but cannot connect${NC}" echo -e "${YELLOW} Process list:${NC}" - ps aux | grep -E "(postgres|singularity)" | grep -v grep | head -5 + ps aux | grep -E "(postgres|${CONTAINER_CMD})" | grep -v grep | head -5 else echo -e "${RED}✗ No PostgreSQL processes found${NC}" fi diff --git a/scripts/postgres-stop.sh b/scripts/postgres-stop.sh index 729e8e25..6455df4e 100755 --- a/scripts/postgres-stop.sh +++ b/scripts/postgres-stop.sh @@ -1,6 +1,16 @@ #!/bin/bash -# PostgreSQL shutdown script for Singularity +# PostgreSQL shutdown script for Singularity/Apptainer + +# Detect which container runtime is available +if command -v apptainer &> /dev/null; then + CONTAINER_CMD="apptainer" +elif command -v singularity &> /dev/null; then + CONTAINER_CMD="singularity" +else + echo "ERROR: Neither apptainer nor singularity found in PATH" + exit 1 +fi # Colors for output RED='\033[0;31m' @@ -12,10 +22,10 @@ echo -e "${YELLOW}Stopping PostgreSQL...${NC}" STOPPED=false -# Method 1: Stop Singularity instance if it exists -if singularity instance list | grep -q "postgres_instance"; then - echo -e "${YELLOW}Stopping Singularity instance: postgres_instance${NC}" - singularity instance stop postgres_instance +# Method 1: Stop instance if it exists +if $CONTAINER_CMD instance list | grep -q "postgres_instance"; then + echo -e "${YELLOW}Stopping ${CONTAINER_CMD} instance: postgres_instance${NC}" + $CONTAINER_CMD instance stop postgres_instance STOPPED=true fi @@ -44,8 +54,8 @@ if [ -f postgres.pid ]; then rm postgres.pid fi -# Method 3: Fallback - kill any singularity postgres processes -if pkill -f "singularity.*postgres"; then +# Method 3: Fallback - kill any container postgres processes +if pkill -f "${CONTAINER_CMD}.*postgres"; then echo -e "${YELLOW}Killed remaining PostgreSQL processes${NC}" STOPPED=true fi From 4d52f2e3b18cd3d94df9a81b852537fa47bc0ae7 Mon Sep 17 00:00:00 2001 From: Lorenzo Guerra Date: Thu, 23 Oct 2025 17:07:48 +0200 Subject: [PATCH 32/33] =?UTF-8?q?Important=20fix:=20Velox=E2=80=99s=20Line?= =?UTF-8?q?arEncoder=20now=20correctly=20receives=20a=20tuple=20of=20the?= =?UTF-8?q?=20x=5Fsrc=20and=20x=5Fdst=20input=20tensors.=20Previously,=20x?= =?UTF-8?q?=5Fis=5Ftuple=20was=20always=20False=20by=20default,=20causing?= =?UTF-8?q?=20x=5Fsrc=20and=20x=5Fdst=20to=20be=20mixed=20up=20by=20torch.?= =?UTF-8?q?scatter.=20This=20led=20to=20the=20node=20pairs=20being=20merge?= =?UTF-8?q?d,=20resulting=20in=20a=20single=20tensor=20of=20shape=20(N,=20?= =?UTF-8?q?d)=20inside=20LinearEncoder=20instead=20of=20two=20separate=20t?= =?UTF-8?q?ensors.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- config/default.yml | 1 + config/velox.yml | 1 + pidsmaker/config/config.py | 1 + pidsmaker/encoders/linear_encoder.py | 3 ++- pidsmaker/model.py | 13 ++++++++----- pidsmaker/utils/data_utils.py | 20 +++++++++++--------- 6 files changed, 24 insertions(+), 15 deletions(-) diff --git a/config/default.yml b/config/default.yml index 544bd512..09910c33 100644 --- a/config/default.yml +++ b/config/default.yml @@ -92,6 +92,7 @@ detection: activation: prelu custom_mlp: architecture_str: none + x_is_tuple: False decoder: predict_edge_type: decoder: edge_mlp diff --git a/config/velox.yml b/config/velox.yml index 372f0b23..98892957 100644 --- a/config/velox.yml +++ b/config/velox.yml @@ -9,3 +9,4 @@ detection: gnn_training: encoder: used_methods: none # uses only a linear layer + x_is_tuple: True diff --git a/pidsmaker/config/config.py 
b/pidsmaker/config/config.py index b80d3f85..d0b05caa 100644 --- a/pidsmaker/config/config.py +++ b/pidsmaker/config/config.py @@ -730,6 +730,7 @@ def __init__(self, type, vals: list = None, desc: str = None): vals=AND(list(ENCODERS_CFG.keys())), desc="First part of the neural network. Usually GNN encoders to capture complex patterns.", ), + "x_is_tuple": Arg(bool), **ENCODERS_CFG, }, "decoder": { diff --git a/pidsmaker/encoders/linear_encoder.py b/pidsmaker/encoders/linear_encoder.py index 5714993e..2f5dbe10 100644 --- a/pidsmaker/encoders/linear_encoder.py +++ b/pidsmaker/encoders/linear_encoder.py @@ -8,7 +8,8 @@ def __init__(self, in_dim, out_dim, dropout=0.0): self.dropout = nn.Dropout(dropout) def forward(self, x, *args, **kwargs): - if isinstance(x, tuple): + # Handle both tuples and lists (PyG batching may convert tuples to lists) + if isinstance(x, (tuple, list)): h = self.dropout(self.lin1(x[0])), self.dropout(self.lin1(x[1])) else: h = self.dropout(self.lin1(x)) diff --git a/pidsmaker/model.py b/pidsmaker/model.py index 0a09eb5f..9eb7a54a 100644 --- a/pidsmaker/model.py +++ b/pidsmaker/model.py @@ -129,11 +129,14 @@ def gather_h(self, batch, res): h_dst = res.get("h_dst", None) if None in [h_src, h_dst]: - h_src, h_dst = ( - (h[batch.edge_index[0]], h[batch.edge_index[1]]) - if isinstance(h, torch.Tensor) - else h - ) + if isinstance(h, torch.Tensor): + # h is a single tensor with node embeddings - index by edge_index + h_src, h_dst = h[batch.edge_index[0]], h[batch.edge_index[1]] + elif isinstance(h, (tuple, list)): + # h is (h_src_nodes, h_dst_nodes) with separate node embeddings - index each + h_src, h_dst = h[0][batch.edge_index[0]], h[1][batch.edge_index[1]] + else: + h_src, h_dst = h return h, h_src, h_dst diff --git a/pidsmaker/utils/data_utils.py b/pidsmaker/utils/data_utils.py index 0b914a76..16dfa835 100644 --- a/pidsmaker/utils/data_utils.py +++ b/pidsmaker/utils/data_utils.py @@ -522,7 +522,13 @@ def run_reindexing_preprocessing(datasets, graph_reindexer, device, cfg): log_dataset_stats(datasets) # By default we only have x_src and x_dst of shape (E, d), here we create x of shape (N, d) use_tgn = "tgn" in cfg.detection.gnn_training.encoder.used_methods - reindex_graphs(datasets, graph_reindexer, device, use_tgn) + reindex_graphs( + datasets, + graph_reindexer, + device, + use_tgn, + x_is_tuple=cfg.detection.gnn_training.encoder.x_is_tuple, + ) return datasets @@ -782,7 +788,7 @@ def node_features_reshape(self, edge_index, x_src, x_dst, max_num_node=None, x_i scatter(x_dst, edge_index[1], out=output, dim=0, reduce="mean") x_dst_result = output.clone() - return x_src_result[:max_num_node], x_dst_result[:max_num_node] + return (x_src_result[:max_num_node], x_dst_result[:max_num_node]) else: if self.fix_buggy_graph_reindexer: output = output.clone() @@ -811,15 +817,11 @@ def reindex_graph(self, data, x_is_tuple=False, use_tgn=False): data.edge_index, data.x_src, data.x_dst, x_is_tuple=x_is_tuple ) data.original_n_id = n_id + data.x = x if not use_tgn: data.src, data.dst = edge_index[0], edge_index[1] - if x_is_tuple: - data.x_src, data.x_dst = x - else: - data.x = x - data.node_type, *_ = self._reindex_graph( data.edge_index, data.node_type_src, data.node_type_dst, x_is_tuple=False ) @@ -912,10 +914,10 @@ def load_model(model, path: str, cfg, map_location=None): return model -def reindex_graphs(datasets, graph_reindexer, device, use_tgn): +def reindex_graphs(datasets, graph_reindexer, device, use_tgn, x_is_tuple=False): for dataset in datasets: for data_list in dataset: 
for batch in log_tqdm(data_list, desc="Reindexing graphs"): batch.to(device) - graph_reindexer.reindex_graph(batch, use_tgn=use_tgn) + graph_reindexer.reindex_graph(batch, use_tgn=use_tgn, x_is_tuple=x_is_tuple) batch.to("cpu") From b68ef86eb649d8588a6f22609cdd394e5cadc814 Mon Sep 17 00:00:00 2001 From: tristan Date: Fri, 24 Oct 2025 00:37:44 +0000 Subject: [PATCH 33/33] add missing desc --- pidsmaker/config/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pidsmaker/config/config.py b/pidsmaker/config/config.py index d0b05caa..15c27b49 100644 --- a/pidsmaker/config/config.py +++ b/pidsmaker/config/config.py @@ -730,7 +730,7 @@ def __init__(self, type, vals: list = None, desc: str = None): vals=AND(list(ENCODERS_CFG.keys())), desc="First part of the neural network. Usually GNN encoders to capture complex patterns.", ), - "x_is_tuple": Arg(bool), + "x_is_tuple": Arg(bool, desc="Whether to consider nodes differently when being source or destination."), **ENCODERS_CFG, }, "decoder": {
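
The following minimal sketch is not part of the patch series; it only illustrates the tuple-aware encoder behavior that PATCH 32/33 describe for Velox's LinearEncoder (separate projection of `x_src` and `x_dst` when `x_is_tuple` is enabled, instead of a single merged `(N, d)` tensor). The class name `TupleAwareLinearEncoder` and the tensor shapes are assumptions for demonstration, not the project's exact API.

```python
import torch
import torch.nn as nn


class TupleAwareLinearEncoder(nn.Module):
    """Simplified sketch: accepts either a single node-feature tensor or an
    (x_src, x_dst) pair and projects each part independently."""

    def __init__(self, in_dim: int, out_dim: int, dropout: float = 0.0):
        super().__init__()
        self.lin1 = nn.Linear(in_dim, out_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # PyG batching may convert tuples to lists, so accept both.
        if isinstance(x, (tuple, list)):
            return self.dropout(self.lin1(x[0])), self.dropout(self.lin1(x[1]))
        return self.dropout(self.lin1(x))


# Illustrative usage: source/destination features stay separate when passed as a pair.
enc = TupleAwareLinearEncoder(in_dim=8, out_dim=4)
x_src, x_dst = torch.randn(10, 8), torch.randn(10, 8)
h_src, h_dst = enc((x_src, x_dst))   # two (10, 4) tensors, not merged
h = enc(torch.randn(10, 8))          # single (10, 4) tensor
```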