From 424d3cdee59b5349553e72868d14ad6cda63680e Mon Sep 17 00:00:00 2001 From: Lorenzo Guerra Date: Tue, 1 Jul 2025 11:23:39 +0200 Subject: [PATCH 01/33] added conda environment, adapted code to pytorch 2.6 and added Singularity setup scripts for postgres as an alternative to docker containers for cluster environments --- .gitignore | 10 +++ Makefile | 64 ++++++++++++++++ environment.yaml | 184 +++++++++++++++++++++++++++++++++++++++++++++ pidsmaker/main.py | 29 ++++++- postgres-start.sh | 121 +++++++++++++++++++++++++++++ postgres-status.sh | 80 ++++++++++++++++++++ postgres-stop.sh | 57 ++++++++++++++ 7 files changed, 541 insertions(+), 4 deletions(-) create mode 100644 Makefile create mode 100644 environment.yaml create mode 100755 postgres-start.sh create mode 100755 postgres-status.sh create mode 100755 postgres-stop.sh diff --git a/.gitignore b/.gitignore index 3ab4b839..a84235d0 100644 --- a/.gitignore +++ b/.gitignore @@ -169,3 +169,13 @@ nohup.out artifacts/ data/ docs/site/ +postgres_data/ +postgres_lock/ + +# Singularity files +*.sif +*.def + +# Postgres directories +postgres_config/ +postgres_run/ diff --git a/Makefile b/Makefile new file mode 100644 index 00000000..cc32c1b5 --- /dev/null +++ b/Makefile @@ -0,0 +1,64 @@ +# Makefile for Singularity PostgreSQL management + +.PHONY: up down status load-dumps full-setup logs clean help + +# PostgreSQL management +up: + @./postgres-start.sh + +down: + @./postgres-stop.sh + +status: + @./postgres-status.sh + +logs: + @echo "PostgreSQL logs:" + @tail -f postgres_log/postgresql*.log 2>/dev/null || echo "No logs found" + +clean: down + @echo "Cleaning up PostgreSQL data..." + @rm -rf postgres_data postgres_run postgres_log + @echo "PostgreSQL data cleaned" + +reset: clean up + +app-build: + @echo "Building PIDSMaker container..." + @singularity build pidsmaker.sif pidsmaker.def || echo "Build failed - check if you have fakeroot access" + +app-run: up + @echo "Running PIDSMaker application..." + @singularity run --nv \ + --env DB_HOST=localhost \ + --env DOCKER_PORT=5432 \ + --env DB_USER=postgres \ + --env DB_PASSWORD=postgres \ + --bind ${PWD}:/workspace \ + pidsmaker.sif + +load-dumps: up + @echo "Loading database dumps from inside container..." 
+ @if [ -f "./settings/scripts/load_dumps.sh" ]; then \ + echo "Found load_dumps.sh, executing inside container..."; \ + singularity exec instance://postgres_instance /scripts/load_dumps.sh; \ + else \ + echo "Error: ./settings/scripts/load_dumps.sh not found"; \ + exit 1; \ + fi + +full-setup: up load-dumps + @echo "PostgreSQL setup complete with dumps loaded" + +help: + @echo "Available commands:" + @echo " postgres-up - Start PostgreSQL" + @echo " postgres-down - Stop PostgreSQL" + @echo " postgres-status - Check PostgreSQL status" + @echo " postgres-logs - Show PostgreSQL logs" + @echo " postgres-load-dumps - Load database dumps" + @echo " postgres-full-setup - Start PostgreSQL and load dumps" + @echo " postgres-clean - Stop and remove all data" + @echo " postgres-reset - Clean and restart PostgreSQL" + @echo " app-build - Build PIDSMaker container" + @echo " app-run - Run PIDSMaker with PostgreSQL" diff --git a/environment.yaml b/environment.yaml new file mode 100644 index 00000000..7d0678d1 --- /dev/null +++ b/environment.yaml @@ -0,0 +1,184 @@ +name: pids +channels: + - conda-forge +dependencies: + - _libgcc_mutex=0.1=conda_forge + - _openmp_mutex=4.5=3_kmp_llvm + - bzip2=1.0.8=h4bc722e_7 + - ca-certificates=2025.6.15=hbd8a1cb_0 + - certifi=2025.6.15=pyhd8ed1ab_0 + - colorama=0.4.6=pyhd8ed1ab_1 + - cyrus-sasl=2.1.28=hd9c7081_0 + - icu=75.1=he02047a_0 + - keyutils=1.6.1=h166bdaf_0 + - krb5=1.21.3=h659f571_0 + - lcms2=2.17=h717163a_0 + - ld_impl_linux-64=2.43=h1423503_5 + - lerc=4.0.0=h0aef613_1 + - libdeflate=1.24=h86f0d12_0 + - libedit=3.1.20250104=pl5321h7949ede_0 + - libexpat=2.7.0=h5888daf_0 + - libffi=3.4.6=h2dba641_1 + - libfreetype=2.13.3=ha770c72_1 + - libfreetype6=2.13.3=h48d6fc4_1 + - libgcc=15.1.0=h767d61c_3 + - libgcc-ng=15.1.0=h69a702a_3 + - libgomp=15.1.0=h767d61c_3 + - libjpeg-turbo=3.1.0=hb9d3cd8_0 + - liblzma=5.8.1=hb9d3cd8_2 + - libnsl=2.0.1=hb9d3cd8_1 + - libntlm=1.8=hb9d3cd8_0 + - libpng=1.6.49=h943b412_0 + - libpq=17.5=h27ae623_0 + - libsqlite=3.50.1=h6cd9bfd_7 + - libstdcxx=15.1.0=h8f9b012_3 + - libstdcxx-ng=15.1.0=h4852527_3 + - libtiff=4.7.0=hf01ce69_5 + - libuuid=2.38.1=h0b41bf4_0 + - libwebp-base=1.5.0=h851e524_0 + - libxcb=1.17.0=h8a09558_0 + - libxcrypt=4.4.36=hd590300_1 + - libzlib=1.3.1=hb9d3cd8_2 + - llvm-openmp=20.1.7=h024ca30_0 + - ncurses=6.5=h2d0b736_3 + - openjpeg=2.5.3=h5fbd93e_0 + - openldap=2.6.10=he970967_0 + - openssl=3.5.0=h7b32b05_1 + - pillow=11.2.1=py39h15c0740_0 + - pip=25.1.1=pyh8b19718_0 + - psycopg2=2.9.10=py39h2bc273e_1 + - pthread-stubs=0.4=hb9d3cd8_1002 + - python=3.9.23=hc30ae73_0_cpython + - python_abi=3.9=7_cp39 + - readline=8.2=h8c095d6_2 + - tk=8.6.13=noxft_hd72426e_102 + - tqdm=4.67.1=pyhd8ed1ab_1 + - wheel=0.45.1=pyhd8ed1ab_1 + - xorg-libxau=1.0.12=hb9d3cd8_0 + - xorg-libxdmcp=1.1.5=hb9d3cd8_0 + - zstd=1.5.7=hb8e6e7a_2 + - pip: + - aiohappyeyeballs==2.6.1 + - aiohttp==3.12.13 + - aiosignal==1.3.2 + - appdirs==1.4.4 + - async-timeout==5.0.1 + - attrs==25.3.0 + - babel==2.17.0 + - backrefs==5.9 + - beautifulsoup4==4.13.4 + - cairocffi==1.7.0 + - cfgv==3.4.0 + - chardet==5.2.0 + - charset-normalizer==3.4.2 + - click==8.1.8 + - contourpy==1.3.0 + - coverage==7.9.1 + - cycler==0.12.1 + - distlib==0.3.9 + - docker-pycreds==0.4.0 + - exceptiongroup==1.3.0 + - filelock==3.18.0 + - fonttools==4.58.4 + - frozenlist==1.7.0 + - fsspec==2025.5.1 + - gdown==5.2.0 + - gensim==4.3.1 + - ghp-import==2.1.0 + - gitdb==4.0.12 + - gitpython==3.1.44 + - graphviz==0.20.1 + - identify==2.6.12 + - idna==3.10 + - igraph==0.11.5 + - 
importlib-metadata==8.7.0 + - importlib-resources==6.5.2 + - iniconfig==2.1.0 + - jinja2==3.1.6 + - joblib==1.5.1 + - kiwisolver==1.4.7 + - markdown==3.8.2 + - markupsafe==3.0.2 + - matplotlib==3.8.4 + - mergedeep==1.3.4 + - mkdocs==1.6.1 + - mkdocs-get-deps==0.2.0 + - mkdocs-glightbox==0.4.0 + - mkdocs-material==9.6.12 + - mkdocs-material-extensions==1.3.1 + - mpmath==1.3.0 + - multidict==6.5.1 + - networkx==2.8.7 + - nltk==3.8.1 + - nodeenv==1.9.1 + - numpy==1.26.4 + - nvidia-cublas-cu12==12.6.4.1 + - nvidia-cuda-cupti-cu12==12.6.80 + - nvidia-cuda-nvrtc-cu12==12.6.77 + - nvidia-cuda-runtime-cu12==12.6.77 + - nvidia-cudnn-cu12==9.5.1.17 + - nvidia-cufft-cu12==11.3.0.4 + - nvidia-cufile-cu12==1.11.1.6 + - nvidia-curand-cu12==10.3.7.77 + - nvidia-cusolver-cu12==11.7.1.2 + - nvidia-cusparse-cu12==12.5.4.2 + - nvidia-cusparselt-cu12==0.6.3 + - nvidia-nccl-cu12==2.26.2 + - nvidia-nvjitlink-cu12==12.6.85 + - nvidia-nvtx-cu12==12.6.77 + - packaging==25.0 + - paginate==0.5.7 + - pandas==2.2.2 + - pathspec==0.12.1 + - platformdirs==4.3.8 + - pluggy==1.6.0 + - pre-commit==4.2.0 + - propcache==0.3.2 + - protobuf==4.25.8 + - psutil==7.0.0 + - pyg-lib==0.4.0+pt27cu126 + - pygments==2.19.2 + - pymdown-extensions==10.16 + - pyparsing==3.2.3 + - pytest==8.3.5 + - pytest-cov==6.1.1 + - python-dateutil==2.9.0.post0 + - pytz==2024.1 + - pyyaml==6.0.2 + - pyyaml-env-tag==1.1 + - regex==2024.11.6 + - requests==2.32.4 + - scikit-learn==1.2.0 + - scipy==1.10.1 + - sentry-sdk==2.31.0 + - setproctitle==1.3.6 + - setuptools==61.0.0 + - six==1.17.0 + - smart-open==7.1.0 + - smmap==5.0.2 + - soupsieve==2.7 + - sympy==1.14.0 + - texttable==1.7.0 + - threadpoolctl==3.6.0 + - tomli==2.2.1 + - torch==2.7.1 + - torch-cluster==1.6.3+pt27cu126 + - torch-geometric==2.5.3 + - torch-scatter==2.1.2+pt27cu126 + - torch-sparse==0.6.18+pt27cu126 + - torch-spline-conv==1.2.2+pt27cu126 + - torchaudio==2.7.1 + - torchvision==0.22.1 + - triton==3.3.1 + - typing-extensions==4.14.0 + - tzdata==2025.2 + - urllib3==2.5.0 + - virtualenv==20.31.2 + - wandb==0.16.6 + - watchdog==6.0.0 + - wget==3.2 + - wrapt==1.17.2 + - xxhash==3.2.0 + - yacs==0.1.8 + - yarl==1.20.1 + - zipp==3.23.0 diff --git a/pidsmaker/main.py b/pidsmaker/main.py index 9fcf694e..561c59c3 100644 --- a/pidsmaker/main.py +++ b/pidsmaker/main.py @@ -5,7 +5,9 @@ import time from collections import defaultdict +import networkx as nx import torch +import torch_geometric import wandb from pidsmaker.config import ( @@ -20,7 +22,10 @@ gnn_training, graph_preprocessing, ) -from pidsmaker.experiments.tuning import fuse_cfg_with_sweep_cfg, get_tuning_sweep_cfg +from pidsmaker.experiments.tuning import ( + fuse_cfg_with_sweep_cfg, + get_tuning_sweep_cfg, +) from pidsmaker.experiments.uncertainty import ( avg_std_metrics, fuse_hyperparameter_metrics, @@ -40,8 +45,17 @@ from pidsmaker.triage import ( tracing, ) +from pidsmaker.utils.data_utils import CollatableTemporalData from pidsmaker.utils.utils import log, remove_underscore_keys, set_seed +torch.serialization.add_safe_globals( + [ + nx.classes.multidigraph.MultiDiGraph, + CollatableTemporalData, + torch_geometric.data.storage.GlobalStorage, + ] +) + def get_task_to_module(cfg): return { @@ -159,7 +173,10 @@ def run_pipeline_with_experiments(cfg): hyper_to_metrics = defaultdict(list) for hyper in hyperparameters: - log(f"[@hyperparameter {hyper}] - Started", pre_return_line=True) + log( + f"[@hyperparameter {hyper}] - Started", + pre_return_line=True, + ) for i in range(iterations): log(f"[@iteration {i}]", pre_return_line=True) @@ 
-184,7 +201,11 @@ def run_pipeline_with_experiments(cfg): for i in range(iterations): log(f"[@iteration {i}]", pre_return_line=True) cfg = update_cfg_for_uncertainty_exp( - method, i, iterations, copy.deepcopy(original_cfg), hyperparameter=None + method, + i, + iterations, + copy.deepcopy(original_cfg), + hyperparameter=None, ) metrics, times = run_pipeline(cfg, method=method, iteration=i) method_to_metrics[method].append({**metrics, **times}) @@ -261,7 +282,7 @@ def run_pipeline_from_sweep(cfg): project = "PIDSMaker" wandb.init( - mode="online" if (args.wandb and args.tuning_mode == "none") else "disabled", + mode=("online" if (args.wandb and args.tuning_mode == "none") else "disabled"), project=project, name=exp_name, tags=tags, diff --git a/postgres-start.sh b/postgres-start.sh new file mode 100755 index 00000000..1a5eb78b --- /dev/null +++ b/postgres-start.sh @@ -0,0 +1,121 @@ +#!/bin/bash + +# PostgreSQL startup script for Singularity + +set -e + +# Configuration +POSTGRES_IMAGE="postgres.sif" +POSTGRES_INSTANCE="postgres_instance" +DATA_DIR="postgres_data" +RUN_DIR="postgres_run" +LOG_DIR="postgres_log" + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +NC='\033[0m' # No Color + +echo -e "${YELLOW}Starting PostgreSQL with Singularity...${NC}" + +# Check if postgres.sif exists +if [ ! -f "$POSTGRES_IMAGE" ]; then + echo -e "${YELLOW}PostgreSQL image not found. Pulling from Docker Hub...${NC}" + singularity pull $POSTGRES_IMAGE docker://postgres:17 +fi + +# Create necessary directories +echo -e "${YELLOW}Creating directories...${NC}" +mkdir -p $DATA_DIR $RUN_DIR $LOG_DIR + +# Create INPUT_DIR if it doesn't exist +INPUT_DIR=${INPUT_DIR:-$(pwd)/data} +if [ ! -d "$INPUT_DIR" ]; then + echo -e "${YELLOW}Creating INPUT_DIR: $INPUT_DIR${NC}" + mkdir -p "$INPUT_DIR" +fi + +# Check if instance already exists +if singularity instance list | grep -q "$POSTGRES_INSTANCE"; then + echo -e "${YELLOW}PostgreSQL instance $POSTGRES_INSTANCE already exists${NC}" + # Check if it's responsive + if singularity exec instance://$POSTGRES_INSTANCE pg_isready -h localhost -U postgres > /dev/null 2>&1; then + echo -e "${GREEN}PostgreSQL instance is already running and responsive${NC}" + exit 0 + else + echo -e "${YELLOW}Instance exists but not responsive, stopping it...${NC}" + singularity instance stop $POSTGRES_INSTANCE + sleep 2 + fi +fi + +# Check if any other postgres processes are running +if pgrep -f "singularity.*postgres" > /dev/null; then + echo -e "${YELLOW}Other PostgreSQL processes detected, cleaning up...${NC}" + pkill -f "singularity.*postgres" || true + sleep 2 +fi + +# Set environment variables +export SINGULARITYENV_POSTGRES_PASSWORD=postgres +export SINGULARITYENV_POSTGRES_USER=postgres +export SINGULARITYENV_POSTGRES_DB=postgres + +# Prepare bind mounts - only bind if files/directories exist +BIND_MOUNTS="--bind $DATA_DIR:/var/lib/postgresql/data" +BIND_MOUNTS="$BIND_MOUNTS --bind $RUN_DIR:/var/run/postgresql" +BIND_MOUNTS="$BIND_MOUNTS --bind $LOG_DIR:/var/log" + +# Add optional bind mounts if they exist +if [ -f "./postgres/init-create-empty-databases.sh" ]; then + BIND_MOUNTS="$BIND_MOUNTS --bind ./postgres/init-create-empty-databases.sh:/docker-entrypoint-initdb.d/init-create-empty-databases.sh" +else + echo -e "${YELLOW}Warning: ./postgres/init-create-empty-databases.sh not found, skipping${NC}" +fi + +if [ -d "./settings/scripts" ]; then + BIND_MOUNTS="$BIND_MOUNTS --bind ./settings/scripts:/scripts" +else + echo -e "${YELLOW}Warning: ./settings/scripts 
directory not found, skipping${NC}" +fi + +# Always bind INPUT_DIR +BIND_MOUNTS="$BIND_MOUNTS --bind $INPUT_DIR:/data" + +if [ -f "./postgres_config/postgresql.conf" ]; then + BIND_MOUNTS="$BIND_MOUNTS --bind ./postgres_config/postgresql.conf:/etc/postgresql/postgresql.conf" +fi + +# Start PostgreSQL instance +echo -e "${YELLOW}Starting PostgreSQL instance...${NC}" +echo -e "${YELLOW}Using INPUT_DIR: $INPUT_DIR${NC}" + +singularity instance start $BIND_MOUNTS $POSTGRES_IMAGE $POSTGRES_INSTANCE + +# Start PostgreSQL inside the instance +echo -e "${YELLOW}Starting PostgreSQL server inside instance...${NC}" +singularity exec instance://$POSTGRES_INSTANCE bash -c "docker-entrypoint.sh postgres &" + +# Get the PID of the instance (optional, for compatibility) +INSTANCE_PID=$(pgrep -f "singularity.*$POSTGRES_INSTANCE" | head -1) +if [ -n "$INSTANCE_PID" ]; then + echo $INSTANCE_PID > postgres.pid +fi + +# Wait for PostgreSQL to be ready +echo -e "${YELLOW}Waiting for PostgreSQL to start...${NC}" +for i in {1..30}; do + if singularity exec instance://$POSTGRES_INSTANCE pg_isready -h localhost -U postgres > /dev/null 2>&1; then + echo -e "${GREEN}PostgreSQL is ready!${NC}" + echo -e "${GREEN}Connection: singularity exec instance://$POSTGRES_INSTANCE psql -h localhost -U postgres${NC}" + echo -e "${GREEN}Instance: $POSTGRES_INSTANCE${NC}" + exit 0 + fi + echo -n "." + sleep 2 +done + +echo -e "${RED}PostgreSQL failed to start within 60 seconds${NC}" +singularity instance stop $POSTGRES_INSTANCE 2>/dev/null || true +exit 1 \ No newline at end of file diff --git a/postgres-status.sh b/postgres-status.sh new file mode 100755 index 00000000..e3669616 --- /dev/null +++ b/postgres-status.sh @@ -0,0 +1,80 @@ +#!/bin/bash + +# PostgreSQL status script for Singularity + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +NC='\033[0m' # No Color + +echo -e "${YELLOW}PostgreSQL Status:${NC}" + +# Check multiple ways to detect if PostgreSQL is running +POSTGRES_RUNNING=false + +# Method 1: Check PID file +if [ -f postgres.pid ]; then + PID=$(cat postgres.pid) + if kill -0 $PID 2>/dev/null; then + echo -e "${GREEN}✓ PostgreSQL process is running (PID: $PID)${NC}" + POSTGRES_RUNNING=true + else + echo -e "${YELLOW}! PID file exists but process is not running${NC}" + rm postgres.pid + fi +fi + +# Method 2: Check if we can connect (most reliable test) +if [ ! -f postgres.sif ]; then + echo -e "${RED}✗ postgres.sif not found${NC}" + exit 1 +fi + +if singularity exec postgres.sif pg_isready -h localhost -U postgres > /dev/null 2>&1; then + echo -e "${GREEN}✓ PostgreSQL is accepting connections${NC}" + echo -e "${GREEN} Connection: singularity exec postgres.sif psql -h localhost -U postgres${NC}" + POSTGRES_RUNNING=true + + # Show database list + echo -e "${YELLOW}Databases:${NC}" + singularity exec postgres.sif psql -h localhost -U postgres -c "\l" 2>/dev/null | \ + grep -v template | grep -v "^-" | grep -v "^(" | grep -v "Name.*Owner" | \ + grep -v "^\s*$" | head -10 + + # Show PostgreSQL version + echo -e "${YELLOW}Version:${NC}" + singularity exec postgres.sif psql -h localhost -U postgres -c "SELECT version();" -t 2>/dev/null | head -1 + +else + echo -e "${RED}✗ PostgreSQL is not accepting connections${NC}" +fi + +# Method 3: Check process list as fallback +if [ "$POSTGRES_RUNNING" = false ]; then + # Check for any postgres-related processes with more flexible patterns + if pgrep -f "postgres" > /dev/null || pgrep -f "singularity.*postgres" > /dev/null; then + echo -e "${YELLOW}! 
Found postgres-related process but cannot connect${NC}" + echo -e "${YELLOW} Process list:${NC}" + ps aux | grep -E "(postgres|singularity)" | grep -v grep | head -5 + else + echo -e "${RED}✗ No PostgreSQL processes found${NC}" + fi +fi + +# Method 4: Check if port 5432 is listening +if ss -tlnp 2>/dev/null | grep -q ":5432 " || netstat -tlnp 2>/dev/null | grep -q ":5432 "; then + echo -e "${GREEN}✓ Port 5432 is listening${NC}" + POSTGRES_RUNNING=true +else + echo -e "${RED}✗ Port 5432 is not listening${NC}" +fi + +# Final status +if [ "$POSTGRES_RUNNING" = true ]; then + echo -e "${GREEN}Overall Status: PostgreSQL is running and accessible${NC}" + exit 0 +else + echo -e "${RED}Overall Status: PostgreSQL is not running or not accessible${NC}" + exit 1 +fi \ No newline at end of file diff --git a/postgres-stop.sh b/postgres-stop.sh new file mode 100755 index 00000000..729e8e25 --- /dev/null +++ b/postgres-stop.sh @@ -0,0 +1,57 @@ +#!/bin/bash + +# PostgreSQL shutdown script for Singularity + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +NC='\033[0m' # No Color + +echo -e "${YELLOW}Stopping PostgreSQL...${NC}" + +STOPPED=false + +# Method 1: Stop Singularity instance if it exists +if singularity instance list | grep -q "postgres_instance"; then + echo -e "${YELLOW}Stopping Singularity instance: postgres_instance${NC}" + singularity instance stop postgres_instance + STOPPED=true +fi + +# Method 2: Stop using PID file if it exists +if [ -f postgres.pid ]; then + PID=$(cat postgres.pid) + if kill -0 $PID 2>/dev/null; then + echo -e "${YELLOW}Stopping PostgreSQL process (PID: $PID)${NC}" + kill $PID + # Wait for graceful shutdown + for i in {1..10}; do + if ! kill -0 $PID 2>/dev/null; then + echo -e "${GREEN}PostgreSQL stopped gracefully${NC}" + STOPPED=true + break + fi + sleep 1 + done + # Force kill if still running + if kill -0 $PID 2>/dev/null; then + echo -e "${YELLOW}Force killing PostgreSQL${NC}" + kill -9 $PID + STOPPED=true + fi + fi + rm postgres.pid +fi + +# Method 3: Fallback - kill any singularity postgres processes +if pkill -f "singularity.*postgres"; then + echo -e "${YELLOW}Killed remaining PostgreSQL processes${NC}" + STOPPED=true +fi + +if [ "$STOPPED" = true ]; then + echo -e "${GREEN}PostgreSQL stopped${NC}" +else + echo -e "${YELLOW}No PostgreSQL instances were found running${NC}" +fi \ No newline at end of file From 96d300000e0d18f0f3901e6a2d1102f0377030e0 Mon Sep 17 00:00:00 2001 From: Lorenzo Guerra Date: Tue, 1 Jul 2025 11:24:13 +0200 Subject: [PATCH 02/33] made load_dumps.sh idempotent --- settings/scripts/load_dumps.sh | 56 ++++++++++++++++++++++++++++++++-- 1 file changed, 53 insertions(+), 3 deletions(-) diff --git a/settings/scripts/load_dumps.sh b/settings/scripts/load_dumps.sh index 7c56e036..b476e228 100755 --- a/settings/scripts/load_dumps.sh +++ b/settings/scripts/load_dumps.sh @@ -1,9 +1,59 @@ #!/bin/bash +set -e # Exit on any error + +echo "Starting database dump restoration..." + for dump_file in /data/*.dump; do + # Check if any dump files exist + if [ ! -f "$dump_file" ]; then + echo "No .dump files found in /data/ directory" + break + fi + db_name=$(basename "$dump_file" .dump) - + + echo "Processing $dump_file -> database '$db_name'" + + # Check if database already exists and has data + if psql -U postgres -h localhost -p 5432 -lqt | cut -d \| -f 1 | grep -qw "$db_name"; then + echo "Database '$db_name' already exists. Checking if it has data..." 
+ + # Count tables in the database + table_count=$(psql -U postgres -h localhost -p 5432 -d "$db_name" -t -c "SELECT COUNT(*) FROM information_schema.tables WHERE table_schema = 'public';" 2>/dev/null || echo "0") + + if [ "$table_count" -gt 0 ]; then + echo "✓ Database '$db_name' already has $table_count tables. Skipping restoration." + continue + else + echo "Database '$db_name' exists but is empty. Proceeding with restoration..." + fi + else + # Create database if it doesn't exist + echo "Creating database '$db_name'..." + psql -U postgres -h localhost -p 5432 -c "CREATE DATABASE \"$db_name\";" 2>/dev/null || { + echo "Warning: Could not create database '$db_name' (may already exist)" + } + fi + echo "Restoring $dump_file into database '$db_name'..." - - pg_restore -U postgres -h localhost -p 5432 -d "$db_name" "$dump_file" + + # Use --clean --if-exists to handle existing objects gracefully + if pg_restore -U postgres -h localhost -p 5432 --clean --if-exists --no-owner --no-privileges -d "$db_name" "$dump_file" 2>/dev/null; then + echo "✓ Successfully restored $dump_file" + + # Verify restoration + final_table_count=$(psql -U postgres -h localhost -p 5432 -d "$db_name" -t -c "SELECT COUNT(*) FROM information_schema.tables WHERE table_schema = 'public';" 2>/dev/null || echo "0") + echo " Database '$db_name' now has $final_table_count tables" + else + echo "✗ Warning: pg_restore reported errors for $dump_file (this may be normal for some dump formats)" + fi + + echo "" done + +echo "Database dump restoration completed!" + +# Show summary of all databases +echo "Summary of available databases:" +psql -U postgres -h localhost -p 5432 -c "\l" | grep -E "^\s+[a-zA-Z]" | head -20 From bf55281058ebd31a7841997db06cb792908baa5c Mon Sep 17 00:00:00 2001 From: Lorenzo Guerra Date: Tue, 1 Jul 2025 11:25:15 +0200 Subject: [PATCH 03/33] changed default artifact dir and postgres host to localhost (for cluster setup with conda and singularity) --- pidsmaker/config/pipeline.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pidsmaker/config/pipeline.py b/pidsmaker/config/pipeline.py index 7112751e..f51aafa3 100644 --- a/pidsmaker/config/pipeline.py +++ b/pidsmaker/config/pipeline.py @@ -24,13 +24,13 @@ Arg, ) -DEFAULT_ROOT_ARTIFACT_DIR = "/home/artifacts/" # Destination folder (in the container) for generated files. Will be created if doesn't exist. +DEFAULT_ROOT_ARTIFACT_DIR = "./artifacts/" # Destination folder (in the container) for generated files. Will be created if doesn't exist. 
ROOT_PROJECT_PATH = pathlib.Path(__file__).parent.parent.parent.resolve() ROOT_GROUND_TRUTH_DIR = os.path.join(ROOT_PROJECT_PATH, "Ground_Truth/") DATABASE_DEFAULT_CONFIG = { - "host": "postgres", # Host machine where the db is located + "host": "localhost", # Host machine where the db is located "user": "postgres", # Database user "password": "postgres", # The password to the database user "port": "5432", # The port number for Postgres From 8481ed8a20206cb2e10251e360f498bff1a095a7 Mon Sep 17 00:00:00 2001 From: Lorenzo Guerra Date: Tue, 1 Jul 2025 11:27:22 +0200 Subject: [PATCH 04/33] fixed error in config vals format --- pidsmaker/config/config.py | 170 ++++++++++++++++++++++++++++--------- 1 file changed, 130 insertions(+), 40 deletions(-) diff --git a/pidsmaker/config/config.py b/pidsmaker/config/config.py index 923cbeff..8c356c87 100644 --- a/pidsmaker/config/config.py +++ b/pidsmaker/config/config.py @@ -81,7 +81,11 @@ "E5-CADETS/node_Nginx_Drakon_APT_17.csv", ], "attack_to_time_window": [ - ["E5-CADETS/node_Nginx_Drakon_APT.csv", "2019-05-16 09:31:00", "2019-05-16 10:12:00"], + [ + "E5-CADETS/node_Nginx_Drakon_APT.csv", + "2019-05-16 09:31:00", + "2019-05-16 10:12:00", + ], [ "E5-CADETS/node_Nginx_Drakon_APT_17.csv", "2019-05-17 10:15:00", @@ -117,10 +121,22 @@ "E3-CADETS/node_Nginx_Backdoor_13.csv", ], "attack_to_time_window": [ - ["E3-CADETS/node_Nginx_Backdoor_06.csv", "2018-04-06 11:20:00", "2018-04-06 12:09:00"], + [ + "E3-CADETS/node_Nginx_Backdoor_06.csv", + "2018-04-06 11:20:00", + "2018-04-06 12:09:00", + ], # ["E3-CADETS/node_Nginx_Backdoor_11.csv" , '2018-04-11 15:07:00', '2018-04-11 15:16:00'], - ["E3-CADETS/node_Nginx_Backdoor_12.csv", "2018-04-12 13:59:00", "2018-04-12 14:39:00"], - ["E3-CADETS/node_Nginx_Backdoor_13.csv", "2018-04-13 09:03:00", "2018-04-13 09:16:00"], + [ + "E3-CADETS/node_Nginx_Backdoor_12.csv", + "2018-04-12 13:59:00", + "2018-04-12 14:39:00", + ], + [ + "E3-CADETS/node_Nginx_Backdoor_13.csv", + "2018-04-13 09:03:00", + "2018-04-13 09:16:00", + ], ], }, "CLEARSCOPE_E5": { @@ -131,7 +147,13 @@ "num_edge_types": 10, "year_month": "2019-05", "start_end_day_range": (8, 18), - "train_files": ["graph_8", "graph_9", "graph_10", "graph_11", "graph_12"], + "train_files": [ + "graph_8", + "graph_9", + "graph_10", + "graph_11", + "graph_12", + ], "val_files": ["graph_13"], "test_files": ["graph_14", "graph_15", "graph_17"], "unused_files": ["graph_16"], @@ -209,7 +231,11 @@ "h201/node_h201_0923.csv", ], "attack_to_time_window": [ - ["h201/node_h201_0923.csv", "2019-09-23 11:23:00", "2019-09-23 13:25:00"], + [ + "h201/node_h201_0923.csv", + "2019-09-23 11:23:00", + "2019-09-23 13:25:00", + ], ], }, "optc_h501": { @@ -228,7 +254,11 @@ "h501/node_h501_0924.csv", ], "attack_to_time_window": [ - ["h501/node_h501_0924.csv", "2019-09-24 10:28:00", "2019-09-24 15:29:00"], + [ + "h501/node_h501_0924.csv", + "2019-09-24 10:28:00", + "2019-09-24 15:29:00", + ], ], }, "optc_h051": { @@ -247,7 +277,11 @@ "h051/node_h051_0925.csv", ], "attack_to_time_window": [ - ["h051/node_h051_0925.csv", "2019-09-25 10:29:00", "2019-09-25 14:25:00"], + [ + "h051/node_h051_0925.csv", + "2019-09-25 10:29:00", + "2019-09-25 14:25:00", + ], ], }, } @@ -401,7 +435,8 @@ def __init__(self, type, vals: list = None, desc: str = None): output size matching the downstream objective (e.g. edge type prediction involves predicting 10 edge types, so the output of the decoder should be 10).", ), "src_dst_projection_coef": Arg( - int, desc="Multiplier of input neurons to project src and dst nodes." 
+ int, + desc="Multiplier of input neurons to project src and dst nodes.", ), }, "node_mlp": { @@ -437,7 +472,9 @@ def __init__(self, type, vals: list = None, desc: str = None): # Prediction-based "predict_edge_type": { "decoder": Arg( - str, vals=OR(list(DECODERS_CFG.keys())), desc="Decoder used before computing loss." + str, + vals=OR(list(DECODERS_CFG.keys())), + desc="Decoder used before computing loss.", ), **DECODERS_CFG, "balanced_loss": Arg(bool), @@ -445,7 +482,9 @@ def __init__(self, type, vals: list = None, desc: str = None): }, "predict_node_type": { "decoder": Arg( - str, vals=OR(list(DECODERS_CFG.keys())), desc="Decoder used before computing loss." + str, + vals=OR(list(DECODERS_CFG.keys())), + desc="Decoder used before computing loss.", ), **DECODERS_CFG, "balanced_loss": Arg(bool), @@ -453,20 +492,26 @@ def __init__(self, type, vals: list = None, desc: str = None): "predict_masked_struct": { "loss": Arg(str, vals=OR(PRED_LOSSES)), "decoder": Arg( - str, vals=OR(list(DECODERS_CFG.keys())), desc="Decoder used before computing loss." + str, + vals=OR(list(DECODERS_CFG.keys())), + desc="Decoder used before computing loss.", ), **DECODERS_CFG, "balanced_loss": Arg(bool), }, "detect_edge_few_shot": { "decoder": Arg( - str, vals=OR(list(DECODERS_CFG.keys())), desc="Decoder used before computing loss." + str, + vals=OR(list(DECODERS_CFG.keys())), + desc="Decoder used before computing loss.", ), **DECODERS_CFG, }, "predict_edge_contrastive": { "decoder": Arg( - str, vals=OR(list(DECODERS_CFG.keys())), desc="Decoder used before computing loss." + str, + vals=OR(list(DECODERS_CFG.keys())), + desc="Decoder used before computing loss.", ), **DECODERS_CFG, "inner_product": { @@ -477,21 +522,27 @@ def __init__(self, type, vals: list = None, desc: str = None): "reconstruct_node_features": { "loss": Arg(str, vals=OR(RECON_LOSSES)), "decoder": Arg( - str, vals=OR(list(DECODERS_CFG.keys())), desc="Decoder used before computing loss." + str, + vals=OR(list(DECODERS_CFG.keys())), + desc="Decoder used before computing loss.", ), **DECODERS_CFG, }, "reconstruct_node_embeddings": { "loss": Arg(str, vals=OR(RECON_LOSSES)), "decoder": Arg( - str, vals=OR(list(DECODERS_CFG.keys())), desc="Decoder used before computing loss." + str, + vals=OR(list(DECODERS_CFG.keys())), + desc="Decoder used before computing loss.", ), **DECODERS_CFG, }, "reconstruct_edge_embeddings": { "loss": Arg(str, vals=OR(RECON_LOSSES)), "decoder": Arg( - str, vals=OR(list(DECODERS_CFG.keys())), desc="Decoder used before computing loss." + str, + vals=OR(list(DECODERS_CFG.keys())), + desc="Decoder used before computing loss.", ), **DECODERS_CFG, }, @@ -499,7 +550,9 @@ def __init__(self, type, vals: list = None, desc: str = None): "loss": Arg(str, vals=OR(RECON_LOSSES)), "mask_rate": Arg(float), "decoder": Arg( - str, vals=OR(list(DECODERS_CFG.keys())), desc="Decoder used before computing loss." + str, + vals=OR(list(DECODERS_CFG.keys())), + desc="Decoder used before computing loss.", ), **DECODERS_CFG, }, @@ -514,14 +567,23 @@ def __init__(self, type, vals: list = None, desc: str = None): }, } -THRESHOLD_METHODS = ["max_val_loss", "mean_val_loss", "threatrace", "magic", "flash", "nodlink"] +THRESHOLD_METHODS = [ + "max_val_loss", + "mean_val_loss", + "threatrace", + "magic", + "flash", + "nodlink", +] # --- Tasks, subtasks, and argument configurations --- TASK_ARGS = { "preprocessing": { "build_graphs": { "used_method": Arg( - str, vals=OR(["default", "magic"]), desc="The method to build time window graphs." 
+ str, + vals=OR(["default", "magic"]), + desc="The method to build time window graphs.", ), "use_all_files": Arg(bool), "mimicry_edge_num": Arg(int), @@ -531,7 +593,8 @@ def __init__(self, type, vals: list = None, desc: str = None): ), "use_hashed_label": Arg(bool, desc="Whether to hash the textual features."), "fuse_edge": Arg( - bool, desc="Whether to fuse duplicate sequential edges into a single edge." + bool, + desc="Whether to fuse duplicate sequential edges into a single edge.", ), "node_label_features": { "subject": Arg( @@ -541,17 +604,15 @@ def __init__(self, type, vals: list = None, desc: str = None): ), "file": Arg( str, - vals=AND( - ["type", "path"], - desc="Which features use for file nodes. Features will be concatenated.", - ), + vals=AND(["type", "path"]), + desc="Which features use for file nodes. Features will be concatenated.", ), "netflow": Arg( str, vals=AND( ["type", "remote_ip", "remote_port"], - desc="Which features use for netflow nodes. Features will be concatenated.", ), + desc="Which features use for netflow nodes. Features will be concatenated.", ), }, "multi_dataset": Arg( @@ -582,7 +643,8 @@ def __init__(self, type, vals: list = None, desc: str = None): desc="Size of the text embedding. Arg not used by some featurization methods that do not build embeddings.", ), "epochs": Arg( - int, desc="Epochs to train the embedding method. Arg not used by some methods." + int, + desc="Epochs to train the embedding method. Arg not used by some methods.", ), "use_seed": Arg(bool), "training_split": Arg( @@ -621,13 +683,22 @@ def __init__(self, type, vals: list = None, desc: str = None): ), "edge_features": Arg( str, - vals=AND(["edge_type", "edge_type_triplet", "msg", "time_encoding", "none"]), + vals=AND( + [ + "edge_type", + "edge_type_triplet", + "msg", + "time_encoding", + "none", + ] + ), desc="Edge features to used during GNN training. `edge_type` refers to the system call type, `edge_type_triplet` \ considers a same edge type as a new type if source or destination node types are different, `msg` is the message vector \ used in the TGN, `time_encoding` encodes temporal order of events with their timestamps in the TGN, `none` uses no features.", ), "multi_dataset_training": Arg( - bool, desc="Whether the GNN should be trained on all datasets in `multi_dataset`." + bool, + desc="Whether the GNN should be trained on all datasets in `multi_dataset`.", ), "fix_buggy_graph_reindexer": Arg( bool, @@ -670,7 +741,8 @@ def __init__(self, type, vals: list = None, desc: str = None): }, "tgn_last_neighbor": { "tgn_neighbor_size": Arg( - int, desc="Number of last neighbors to store for each node." + int, + desc="Number of last neighbors to store for each node.", ), "tgn_neighbor_n_hop": Arg( int, @@ -711,22 +783,29 @@ def __init__(self, type, vals: list = None, desc: str = None): }, "gnn_training": { "use_seed": Arg(bool), - "deterministic": Arg(bool, desc="Whether to force PyTorch to use deterministic algorithms."), + "deterministic": Arg( + bool, + desc="Whether to force PyTorch to use deterministic algorithms.", + ), "num_epochs": Arg(int), "patience": Arg(int), "lr": Arg(float), "weight_decay": Arg(float), - "node_hid_dim": Arg(int, desc="Number of neurons in the middle layers of the encoder."), + "node_hid_dim": Arg( + int, + desc="Number of neurons in the middle layers of the encoder.", + ), "node_out_dim": Arg(int, desc="Number of neurons in the last layer of the encoder."), "grad_accumulation": Arg( - int, desc="Number of epochs to gather gradients before backprop." 
+ int, + desc="Number of epochs to gather gradients before backprop.", ), "inference_device": Arg( - str, vals=OR(["cpu", "cuda"]), desc="Device used during testing." - ), - "used_method": Arg( - str, vals=OR(["default"]), desc="Which training pipeline use." + str, + vals=OR(["cpu", "cuda"]), + desc="Device used during testing.", ), + "used_method": Arg(str, vals=OR(["default"]), desc="Which training pipeline use."), "encoder": { "dropout": Arg(float), "used_methods": Arg( @@ -782,10 +861,12 @@ def __init__(self, type, vals: list = None, desc: str = None): desc="Whether to consider the loss of destination nodes when computing the node-level scores (maximum loss of a node).", ), "use_kmeans": Arg( - bool, desc="Whether to cluster nodes after thresholding as done in Orthrus" + bool, + desc="Whether to cluster nodes after thresholding as done in Orthrus", ), "kmeans_top_K": Arg( - int, desc="Number of top-score nodes selected before clustering." + int, + desc="Number of top-score nodes selected before clustering.", ), }, "tw_evaluation": { @@ -841,7 +922,16 @@ def __init__(self, type, vals: list = None, desc: str = None): ), "depimpact": { "used_method": Arg( - str, vals=OR(["component", "shortest_path", "1-hop", "2-hop", "3-hop"]) + str, + vals=OR( + [ + "component", + "shortest_path", + "1-hop", + "2-hop", + "3-hop", + ] + ), ), "score_method": Arg(str, vals=OR(["degree", "recon_loss", "degree_recon"])), "workers": Arg(int), From e25789f85dace9337fd07e097fe18edbceb0ce3f Mon Sep 17 00:00:00 2001 From: Lorenzo Guerra Date: Tue, 1 Jul 2025 15:30:50 +0200 Subject: [PATCH 05/33] minor formatting changes --- Makefile | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/Makefile b/Makefile index cc32c1b5..bb8ab71c 100644 --- a/Makefile +++ b/Makefile @@ -52,13 +52,13 @@ full-setup: up load-dumps help: @echo "Available commands:" - @echo " postgres-up - Start PostgreSQL" - @echo " postgres-down - Stop PostgreSQL" - @echo " postgres-status - Check PostgreSQL status" - @echo " postgres-logs - Show PostgreSQL logs" - @echo " postgres-load-dumps - Load database dumps" - @echo " postgres-full-setup - Start PostgreSQL and load dumps" - @echo " postgres-clean - Stop and remove all data" - @echo " postgres-reset - Clean and restart PostgreSQL" - @echo " app-build - Build PIDSMaker container" - @echo " app-run - Run PIDSMaker with PostgreSQL" + @echo " up - Start PostgreSQL" + @echo " down - Stop PostgreSQL" + @echo " status - Check PostgreSQL status" + @echo " logs - Show PostgreSQL logs" + @echo " load-dumps - Load database dumps" + @echo " full-setup - Start PostgreSQL and load dumps" + @echo " clean - Stop and remove all data" + @echo " reset - Clean and restart PostgreSQL" + @echo " app-build - Build PIDSMaker container" + @echo " app-run - Run PIDSMaker with PostgreSQL" From e6e07ff7af07c3c06a4e8099552607c1b39c0dab Mon Sep 17 00:00:00 2001 From: Lorenzo Guerra Date: Wed, 23 Jul 2025 10:38:10 +0200 Subject: [PATCH 06/33] throw errors instead of continuing without any warning --- .../evaluation_methods/evaluation_utils.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/pidsmaker/detection/evaluation_methods/evaluation_utils.py b/pidsmaker/detection/evaluation_methods/evaluation_utils.py index af055268..a83522ef 100644 --- a/pidsmaker/detection/evaluation_methods/evaluation_utils.py +++ b/pidsmaker/detection/evaluation_methods/evaluation_utils.py @@ -36,11 +36,9 @@ def classifier_evaluation(y_test, y_test_pred, scores): - 
labels_exist = sum(y_test) > 0 - if labels_exist: - tn, fp, fn, tp = confusion_matrix(y_test, y_test_pred).ravel() - else: - tn, fp, fn, tp = 1, 1, 1, 1 # only to not break tests + if not sum(y_test) > 0: + raise ValueError("Cannot evaluate: no positive labels in test set") + tn, fp, fn, tp = confusion_matrix(y_test, y_test_pred).ravel() eps = 1e-12 fpr = fp / (fp + tn + eps) @@ -51,15 +49,18 @@ def classifier_evaluation(y_test, y_test_pred, scores): try: auc_val = roc_auc_score(y_test, scores) - except: + except ValueError as e: + log(f"WARNING: AUC calculation failed: {e}") auc_val = float("nan") try: ap = ap_score(y_test, scores) - except: + except ValueError as e: + log(f"WARNING: AP calculation failed: {e}") ap = float("nan") try: balanced_acc = balanced_accuracy_score(y_test, y_test_pred) - except: + except ValueError as e: + log(f"WARNING: Balanced ACC calculation failed: {e}") balanced_acc = float("nan") sensitivity = tp / (tp + fn + eps) From fd11e5a4bd68f69ae3b8901da4b1d79da720ae88 Mon Sep 17 00:00:00 2001 From: Lorenzo Guerra Date: Wed, 23 Jul 2025 10:39:26 +0200 Subject: [PATCH 07/33] fixed 'stats["percent_detected_attacks"]' calculation --- .../evaluation_methods/node_evaluation.py | 135 +++++++++++++----- 1 file changed, 103 insertions(+), 32 deletions(-) diff --git a/pidsmaker/detection/evaluation_methods/node_evaluation.py b/pidsmaker/detection/evaluation_methods/node_evaluation.py index 665d5e03..a8b85702 100644 --- a/pidsmaker/detection/evaluation_methods/node_evaluation.py +++ b/pidsmaker/detection/evaluation_methods/node_evaluation.py @@ -37,7 +37,7 @@ def get_node_predictions(val_tw_path, test_tw_path, cfg, **kwargs): log(f"Loading data from {test_tw_path}...") threshold_method = cfg.detection.evaluation.node_evaluation.threshold_method - if threshold_method == "magic": + if threshold_method == "magic": # data leaking by using test data thr = get_threshold(test_tw_path, threshold_method) else: thr = get_threshold(val_tw_path, threshold_method) @@ -48,7 +48,9 @@ def get_node_predictions(val_tw_path, test_tw_path, cfg, **kwargs): node_to_max_loss = defaultdict(int) filelist = listdir_sorted(test_tw_path) - for tw, file in enumerate(log_tqdm(sorted(filelist), desc="Compute labels")): + for tw, file in enumerate( + log_tqdm(sorted(filelist), desc="Compute labels") + ): file = os.path.join(test_tw_path, file) df = pd.read_csv(file).to_dict(orient="records") for line in df: @@ -72,7 +74,9 @@ def get_node_predictions(val_tw_path, test_tw_path, cfg, **kwargs): # For plotting the scores of seen and unseen nodes graph_dir = cfg.preprocessing.transformation._graphs_dir - train_set_paths = get_all_files_from_folders(graph_dir, cfg.dataset.train_files) + train_set_paths = get_all_files_from_folders( + graph_dir, cfg.dataset.train_files + ) train_node_set = set() for train_path in train_set_paths: @@ -87,7 +91,9 @@ def get_node_predictions(val_tw_path, test_tw_path, cfg, **kwargs): ) results[node_id]["score"] = pred_score - results[node_id]["tw_with_max_loss"] = node_to_max_loss_tw.get(node_id, -1) + results[node_id]["tw_with_max_loss"] = node_to_max_loss_tw.get( + node_id, -1 + ) results[node_id]["y_true"] = int(node_id in ground_truth_nids) results[node_id]["is_seen"] = int(str(node_id) in train_node_set) @@ -98,7 +104,8 @@ def get_node_predictions(val_tw_path, test_tw_path, cfg, **kwargs): if use_kmeans: results = compute_kmeans_labels( - results, topk_K=cfg.detection.evaluation.node_evaluation.kmeans_top_K + results, + 
topk_K=cfg.detection.evaluation.node_evaluation.kmeans_top_K, ) return results, thr @@ -119,7 +126,9 @@ def get_node_predictions_node_level(val_tw_path, test_tw_path, cfg, **kwargs): node_to_max_loss = defaultdict(int) filelist = listdir_sorted(test_tw_path) - for tw, file in enumerate(log_tqdm(sorted(filelist), desc="Compute labels")): + for tw, file in enumerate( + log_tqdm(sorted(filelist), desc="Compute labels") + ): file = os.path.join(test_tw_path, file) df = pd.read_csv(file).to_dict(orient="records") for line in df: @@ -130,9 +139,13 @@ def get_node_predictions_node_level(val_tw_path, test_tw_path, cfg, **kwargs): node_to_values[node]["tw"].append(tw) if "threatrace_score" in line: - node_to_values[node]["threatrace_score"].append(line["threatrace_score"]) + node_to_values[node]["threatrace_score"].append( + line["threatrace_score"] + ) if "correct_pred" in line: - node_to_values[node]["correct_pred"].append(line["correct_pred"]) + node_to_values[node]["correct_pred"].append( + line["correct_pred"] + ) if "flash_score" in line: node_to_values[node]["flash_score"].append(line["flash_score"]) if "magic_score" in line: @@ -144,7 +157,9 @@ def get_node_predictions_node_level(val_tw_path, test_tw_path, cfg, **kwargs): # For plotting the scores of seen and unseen nodes graph_dir = cfg.preprocessing.transformation._graphs_dir - train_set_paths = get_all_files_from_folders(graph_dir, cfg.dataset.train_files) + train_set_paths = get_all_files_from_folders( + graph_dir, cfg.dataset.train_files + ) train_node_set = set() for train_path in train_set_paths: @@ -157,7 +172,10 @@ def get_node_predictions_node_level(val_tw_path, test_tw_path, cfg, **kwargs): threatrace_label = 0 flash_label = 0 detected_tw = None - if cfg.detection.evaluation.node_evaluation.threshold_method == "threatrace": + if ( + cfg.detection.evaluation.node_evaluation.threshold_method + == "threatrace" + ): max_score = 0 pred_score = max(losses["threatrace_score"]) @@ -169,7 +187,9 @@ def get_node_predictions_node_level(val_tw_path, test_tw_path, cfg, **kwargs): max_score = score detected_tw = tw - elif cfg.detection.evaluation.node_evaluation.threshold_method == "flash": + elif ( + cfg.detection.evaluation.node_evaluation.threshold_method == "flash" + ): max_score = 0 pred_score = max(losses["flash_score"]) @@ -181,7 +201,9 @@ def get_node_predictions_node_level(val_tw_path, test_tw_path, cfg, **kwargs): max_score = score detected_tw = tw - elif cfg.detection.evaluation.node_evaluation.threshold_method == "magic": + elif ( + cfg.detection.evaluation.node_evaluation.threshold_method == "magic" + ): max_score = 0 pred_score = max(losses["magic_score"]) @@ -193,11 +215,14 @@ def get_node_predictions_node_level(val_tw_path, test_tw_path, cfg, **kwargs): else: pred_score = reduce_losses_to_score( - losses["loss"], cfg.detection.evaluation.node_evaluation.threshold_method + losses["loss"], + cfg.detection.evaluation.node_evaluation.threshold_method, ) results[node_id]["score"] = pred_score - results[node_id]["tw_with_max_loss"] = node_to_max_loss_tw.get(node_id, -1) + results[node_id]["tw_with_max_loss"] = node_to_max_loss_tw.get( + node_id, -1 + ) results[node_id]["y_true"] = int(node_id in ground_truth_nids) results[node_id]["is_seen"] = int(str(node_id) in train_node_set) @@ -205,7 +230,8 @@ def get_node_predictions_node_level(val_tw_path, test_tw_path, cfg, **kwargs): detected_tw = detected_tw or node_to_max_loss_tw.get(node_id, None) if detected_tw is not None: results[node_id]["time_range"] = [ - 
datetime_to_ns_time_US_handle_nano(tw) for tw in filelist[detected_tw].split("~") + datetime_to_ns_time_US_handle_nano(tw) + for tw in filelist[detected_tw].split("~") ] else: results[node_id]["time_range"] = None @@ -213,16 +239,23 @@ def get_node_predictions_node_level(val_tw_path, test_tw_path, cfg, **kwargs): if use_kmeans: # in this mode, we add the label after results[node_id]["y_hat"] = 0 else: - if cfg.detection.evaluation.node_evaluation.threshold_method == "threatrace": + if ( + cfg.detection.evaluation.node_evaluation.threshold_method + == "threatrace" + ): results[node_id]["y_hat"] = threatrace_label - elif cfg.detection.evaluation.node_evaluation.threshold_method == "flash": + elif ( + cfg.detection.evaluation.node_evaluation.threshold_method + == "flash" + ): results[node_id]["y_hat"] = flash_label else: results[node_id]["y_hat"] = int(pred_score > thr) if use_kmeans: results = compute_kmeans_labels( - results, topk_K=cfg.detection.evaluation.node_evaluation.kmeans_top_K + results, + topk_K=cfg.detection.evaluation.node_evaluation.kmeans_top_K, ) return results, thr @@ -230,7 +263,11 @@ def get_node_predictions_node_level(val_tw_path, test_tw_path, cfg, **kwargs): def analyze_false_positives( y_truth, y_preds, pred_scores, max_val_loss_tw, nodes, tw_to_malicious_nodes ): - fp_indices = [i for i, (true, pred) in enumerate(zip(y_truth, y_preds)) if pred and not true] + fp_indices = [ + i + for i, (true, pred) in enumerate(zip(y_truth, y_preds)) + if pred and not true + ] malicious_tws = set(tw_to_malicious_nodes.keys()) num_fps_in_malicious_tw = 0 @@ -239,22 +276,35 @@ def analyze_false_positives( num_fps_in_malicious_tw += int(is_in_malicious_tw) fp_in_malicious_tw_ratio = ( - num_fps_in_malicious_tw / len(fp_indices) if len(fp_indices) > 0 else float("nan") + num_fps_in_malicious_tw / len(fp_indices) + if len(fp_indices) > 0 + else float("nan") ) return fp_in_malicious_tw_ratio -def main(val_tw_path, test_tw_path, model_epoch_dir, cfg, tw_to_malicious_nodes, **kwargs): +def main( + val_tw_path, + test_tw_path, + model_epoch_dir, + cfg, + tw_to_malicious_nodes, + **kwargs, +): if cfg._is_node_level: get_preds_fn = get_node_predictions_node_level else: get_preds_fn = get_node_predictions - results, thr = get_preds_fn(cfg=cfg, val_tw_path=val_tw_path, test_tw_path=test_tw_path) + results, thr = get_preds_fn( + cfg=cfg, val_tw_path=val_tw_path, test_tw_path=test_tw_path + ) # save results for future checking os.makedirs(cfg.detection.evaluation._results_dir, exist_ok=True) - results_save_dir = os.path.join(cfg.detection.evaluation._results_dir, "results.pth") + results_save_dir = os.path.join( + cfg.detection.evaluation._results_dir, "results.pth" + ) torch.save(results, results_save_dir) log(f"Resutls saved to {results_save_dir}") @@ -268,9 +318,15 @@ def main(val_tw_path, test_tw_path, model_epoch_dir, cfg, tw_to_malicious_nodes, ) # average detection precision scores_img_file = os.path.join(out_dir, f"scores_{model_epoch_dir}.png") # simple_scores_img_file = os.path.join(out_dir, f"simple_scores_{model_epoch_dir}.png") - neat_scores_img_file = os.path.join(out_dir, f"neat_scores_{model_epoch_dir}.svg") - seen_score_img_file = os.path.join(out_dir, f"seen_score_{model_epoch_dir}.png") - discrim_img_file = os.path.join(out_dir, f"discrim_curve_{model_epoch_dir}.png") + neat_scores_img_file = os.path.join( + out_dir, f"neat_scores_{model_epoch_dir}.svg" + ) + seen_score_img_file = os.path.join( + out_dir, f"seen_score_{model_epoch_dir}.png" + ) + discrim_img_file = os.path.join( 
+ out_dir, f"discrim_curve_{model_epoch_dir}.png" + ) attack_to_GPs = get_GP_of_each_attack(cfg) attack_to_TPs = defaultdict(int) @@ -314,9 +370,13 @@ def main(val_tw_path, test_tw_path, model_epoch_dir, cfg, tw_to_malicious_nodes, adp_score = plot_detected_attacks_vs_precision( pred_scores, nodes, node2attacks, y_truth, adp_img_file ) - discrim_scores = compute_discrimination_score(pred_scores, nodes, node2attacks, y_truth) + discrim_scores = compute_discrimination_score( + pred_scores, nodes, node2attacks, y_truth + ) plot_discrimination_metric(pred_scores, y_truth, discrim_img_file) - discrim_tp = compute_discrimination_tp(pred_scores, nodes, node2attacks, y_truth) + discrim_tp = compute_discrimination_tp( + pred_scores, nodes, node2attacks, y_truth + ) # plot_simple_scores(pred_scores, y_truth, simple_scores_img_file) plot_scores_with_paths_node_level( pred_scores, @@ -329,12 +389,19 @@ def main(val_tw_path, test_tw_path, model_epoch_dir, cfg, tw_to_malicious_nodes, cfg, thr, ) - plot_scores_neat(pred_scores, y_truth, nodes, node2attacks, neat_scores_img_file, thr) + plot_scores_neat( + pred_scores, y_truth, nodes, node2attacks, neat_scores_img_file, thr + ) # plot_score_seen(pred_scores, is_seen, seen_score_img_file) stats = classifier_evaluation(y_truth, y_preds, pred_scores) fp_in_malicious_tw_ratio = analyze_false_positives( - y_truth, y_preds, pred_scores, max_val_loss_tw, nodes, tw_to_malicious_nodes + y_truth, + y_preds, + pred_scores, + max_val_loss_tw, + nodes, + tw_to_malicious_nodes, ) stats["fp_in_malicious_tw_ratio"] = round(fp_in_malicious_tw_ratio, 3) @@ -345,7 +412,9 @@ def main(val_tw_path, test_tw_path, model_epoch_dir, cfg, tw_to_malicious_nodes, tps_in_atts.append((att, tps)) stats["percent_detected_attacks"] = ( - round(len(attack_to_GPs) / len(attack_to_TPs), 2) if len(attack_to_TPs) > 0 else 0 + round(len(attack_to_TPs) / len(attack_to_GPs), 2) + if len(attack_to_GPs) > 0 + else 0 ) fps, tps, precision, recall = get_metrics_if_all_attacks_detected( @@ -361,7 +430,9 @@ def main(val_tw_path, test_tw_path, model_epoch_dir, cfg, tw_to_malicious_nodes, for k, v in discrim_scores.items(): stats[k] = round(v, 4) - attack2tps = get_detected_tps_node_level(pred_scores, nodes, node2attacks, y_truth, cfg) + attack2tps = get_detected_tps_node_level( + pred_scores, nodes, node2attacks, y_truth, cfg + ) for attack, detected_tps in attack2tps.items(): stats[f"tps_{attack}"] = str(detected_tps) From 4a88a32c8c3c710c1651fc184800a998870ad153 Mon Sep 17 00:00:00 2001 From: Lorenzo Guerra Date: Wed, 23 Jul 2025 10:40:09 +0200 Subject: [PATCH 08/33] window size in ns not in sec --- .../preprocessing/build_graph_methods/build_default_graphs.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pidsmaker/preprocessing/build_graph_methods/build_default_graphs.py b/pidsmaker/preprocessing/build_graph_methods/build_default_graphs.py index 06dae428..784d1c42 100644 --- a/pidsmaker/preprocessing/build_graph_methods/build_default_graphs.py +++ b/pidsmaker/preprocessing/build_graph_methods/build_default_graphs.py @@ -262,7 +262,7 @@ def get_batches(arr, batch_size): start_time = events_list[0][-2] temp_list = [] BATCH = 1024 - window_size_in_sec = cfg.preprocessing.build_graphs.time_window_size * 60_000_000_000 + window_size_in_ns = cfg.preprocessing.build_graphs.time_window_size * 60_000_000_000 last_batch = False for batch_edges in get_batches(events_list, BATCH): @@ -272,7 +272,7 @@ def get_batches(arr, batch_size): if (len(batch_edges) < BATCH) or (temp_list[-1] == 
events_list[-1]): last_batch = True - if (batch_edges[-1][-2] > start_time + window_size_in_sec) or last_batch: + if (batch_edges[-1][-2] > start_time + window_size_in_ns) or last_batch: time_interval = ( ns_time_to_datetime_US(start_time) + "~" From 5f9d7497b121fb03336258c20f28fb8efd60f84c Mon Sep 17 00:00:00 2001 From: Lorenzo Guerra Date: Wed, 30 Jul 2025 15:33:12 +0200 Subject: [PATCH 09/33] updated detected_attacks when new attack is detected --- pidsmaker/detection/evaluation_methods/evaluation_utils.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/pidsmaker/detection/evaluation_methods/evaluation_utils.py b/pidsmaker/detection/evaluation_methods/evaluation_utils.py index a83522ef..947a0f86 100644 --- a/pidsmaker/detection/evaluation_methods/evaluation_utils.py +++ b/pidsmaker/detection/evaluation_methods/evaluation_utils.py @@ -660,13 +660,12 @@ def plot_detected_attacks_vs_precision(scores, nodes, node2attacks, labels, out_ # Update tp and fp based on label if sorted_labels[i] == 1: tp += 1 + # Update detected attacks set if node has associated attacks + if node in node2attacks: + detected_attacks.update(node2attacks[node]) else: fp += 1 - # Update detected attacks set if node has associated attacks - if node in node2attacks: - detected_attacks.update(node2attacks[node]) - # Calculate precision and detected attacks percentage precision = tp / (tp + fp) detected_attacks_percentage = (len(detected_attacks) / total_attacks) * 100 From a89ad7b2af3e05b927f0d6dc52076892913622f1 Mon Sep 17 00:00:00 2001 From: Lorenzo Guerra Date: Mon, 4 Aug 2025 16:03:49 +0200 Subject: [PATCH 10/33] use specific command line arguments to configure the database --- compose-postgres.yml | 2 +- docs/docs/create-db-from-scratch.md | 10 +++++++--- pidsmaker/config/pipeline.py | 30 +++++++++++++++++------------ 3 files changed, 26 insertions(+), 16 deletions(-) diff --git a/compose-postgres.yml b/compose-postgres.yml index 5553d5ef..fe074d44 100644 --- a/compose-postgres.yml +++ b/compose-postgres.yml @@ -12,7 +12,7 @@ services: volumes: - postgres_data:/var/lib/postgresql/data - ./postgres/init-create-empty-databases.sh:/docker-entrypoint-initdb.d/init-create-empty-databases.sh - - ./settings/scripts:/scripts + - ./scripts:/scripts - ${INPUT_DIR:-/data}:/data healthcheck: test: ["CMD-SHELL", "pg_isready -U postgres"] diff --git a/docs/docs/create-db-from-scratch.md b/docs/docs/create-db-from-scratch.md index 7bfb6bd3..d17e8da1 100644 --- a/docs/docs/create-db-from-scratch.md +++ b/docs/docs/create-db-from-scratch.md @@ -7,7 +7,7 @@ You can download all required files directly by running: pip install gdown ``` ```shell -./settings/scripts/download_{dataset}.sh {data_folder} +./scripts/download_{dataset}.sh {data_folder} ``` where `{dataset}` can be either `clearscope_e3`, `cadets_e3`, `theia_e3`, `clearscope_e5`, `cadets_e5` or `theia_e5` and `{data_folder}` is the absolute path to the output folder where all raw files will be downloaded. @@ -26,14 +26,18 @@ sudo docker compose exec pids bash 4. Convert the DARPA files ```shell -./settings/scripts/uncompress_darpa_files.sh /data/ +./scripts/uncompress_darpa_files.sh /data/ ``` > [!NOTE] > This may take multiple hours depending on the dataset. ### Optional configurations -- optionally, if using a specific postgres database instead of the postgres docker, update the connection config by setting `DATABASE_DEFAULT_CONFIG` within `pidsmaker/config.py`. 
+- optionally, if using a specific postgres database instead of the postgres docker, pass the details as command line arguments to the python scripts + - `--database_host`: the host machine where the database is located (default: `postgres`) + - `--database_user`: the database user to connect to the database (default: `postgres`) + - `--database_password`: the password for the database user (default: `postgres`) + - `--database_port`: the port number for Postgres (default: `5432`) - optionaly, if you want to change the output folder where generated files are stored, update accordingly the volume by uncommenting `./artifacts:/home/artifacts` in `compose.yml`. diff --git a/pidsmaker/config/pipeline.py b/pidsmaker/config/pipeline.py index f51aafa3..6e646c0f 100644 --- a/pidsmaker/config/pipeline.py +++ b/pidsmaker/config/pipeline.py @@ -24,17 +24,9 @@ Arg, ) -DEFAULT_ROOT_ARTIFACT_DIR = "./artifacts/" # Destination folder (in the container) for generated files. Will be created if doesn't exist. ROOT_PROJECT_PATH = pathlib.Path(__file__).parent.parent.parent.resolve() ROOT_GROUND_TRUTH_DIR = os.path.join(ROOT_PROJECT_PATH, "Ground_Truth/") - -DATABASE_DEFAULT_CONFIG = { - "host": "localhost", # Host machine where the db is located - "user": "postgres", # Database user - "password": "postgres", # The password to the database user - "port": "5432", # The port number for Postgres -} # ================================================================================ @@ -43,7 +35,7 @@ def get_default_cfg(args): Inits the shared cfg object with default configurations. """ cfg = CN() - cfg._artifact_dir = args.artifact_dir_in_container or DEFAULT_ROOT_ARTIFACT_DIR + cfg._artifact_dir = args.artifact_dir cfg._test_mode = args.test_mode cfg._debug = not args.wandb @@ -64,8 +56,10 @@ def get_default_cfg(args): # Database: we simply create variables for all configurations described in the dict cfg.database = CN() - for attr, value in DATABASE_DEFAULT_CONFIG.items(): - setattr(cfg.database, attr, value) + cfg.database.host = args.database_host + cfg.database.user = args.database_user + cfg.database.password = args.database_password + cfg.database.port = args.database_port # Dataset: we simply create variables for all configurations described in the dict set_dataset_cfg(cfg, args.dataset) @@ -139,9 +133,21 @@ def get_runtime_required_args(return_unknown_args=False, args=None): parser.add_argument( "--tuning_file_path", default="", help="If set, use the given YML path for tuning" ) + parser.add_argument( + "--database_host", default="postgres", help="Host machine where the db is located" + ) + parser.add_argument( + "--database_user", default="postgres", help="Database user to connect to the database" + ) + parser.add_argument( + "--database_password", default="postgres", help="The password to the database user" + ) + parser.add_argument( + "--database_port", default="5432", help="The port number for Postgres (default: 5432)" + ) parser.add_argument("--sweep_id", default="", help="ID of a wandb sweep for multi-agent runs") parser.add_argument( - "--artifact_dir_in_container", default="", help="ID of a wandb sweep for multi-agent runs" + "--artifact_dir", default="./artifacts/", help="Destination folder for generated files" ) parser.add_argument( "--test_mode", From 0ad5a09c746d1a420ec13692c5c57188eab9d756 Mon Sep 17 00:00:00 2001 From: Lorenzo Guerra Date: Mon, 4 Aug 2025 16:08:17 +0200 Subject: [PATCH 11/33] addition of scripts to setup a Singularity postgres container for cluster environments. 
Moving all scripts to the ./scripts/ folder and deletion of ./settings/scripts/ folder --- Makefile => scripts/Makefile | 4 ++-- {settings/scripts => scripts}/create_database.sh | 0 {settings/scripts => scripts}/download_cadets_e3.sh | 0 {settings/scripts => scripts}/download_cadets_e5.sh | 0 {settings/scripts => scripts}/download_clearscope_e3.sh | 0 {settings/scripts => scripts}/download_clearscope_e5.sh | 0 {settings/scripts => scripts}/download_theia_e3.sh | 0 {settings/scripts => scripts}/download_theia_e5.sh | 0 {settings/scripts => scripts}/e3_tools.sh | 0 {settings/scripts => scripts}/load_dumps.sh | 0 postgres-start.sh => scripts/postgres-start.sh | 6 +++--- postgres-status.sh => scripts/postgres-status.sh | 0 postgres-stop.sh => scripts/postgres-stop.sh | 0 {settings/scripts => scripts}/uncompress_darpa_files.sh | 0 14 files changed, 5 insertions(+), 5 deletions(-) rename Makefile => scripts/Makefile (93%) rename {settings/scripts => scripts}/create_database.sh (100%) rename {settings/scripts => scripts}/download_cadets_e3.sh (100%) rename {settings/scripts => scripts}/download_cadets_e5.sh (100%) rename {settings/scripts => scripts}/download_clearscope_e3.sh (100%) rename {settings/scripts => scripts}/download_clearscope_e5.sh (100%) rename {settings/scripts => scripts}/download_theia_e3.sh (100%) rename {settings/scripts => scripts}/download_theia_e5.sh (100%) rename {settings/scripts => scripts}/e3_tools.sh (100%) rename {settings/scripts => scripts}/load_dumps.sh (100%) rename postgres-start.sh => scripts/postgres-start.sh (95%) rename postgres-status.sh => scripts/postgres-status.sh (100%) rename postgres-stop.sh => scripts/postgres-stop.sh (100%) rename {settings/scripts => scripts}/uncompress_darpa_files.sh (100%) diff --git a/Makefile b/scripts/Makefile similarity index 93% rename from Makefile rename to scripts/Makefile index bb8ab71c..b395519b 100644 --- a/Makefile +++ b/scripts/Makefile @@ -39,11 +39,11 @@ app-run: up load-dumps: up @echo "Loading database dumps from inside container..." 
- @if [ -f "./settings/scripts/load_dumps.sh" ]; then \ + @if [ -f "./load_dumps.sh" ]; then \ echo "Found load_dumps.sh, executing inside container..."; \ singularity exec instance://postgres_instance /scripts/load_dumps.sh; \ else \ - echo "Error: ./settings/scripts/load_dumps.sh not found"; \ + echo "Error: ./load_dumps.sh not found"; \ exit 1; \ fi diff --git a/settings/scripts/create_database.sh b/scripts/create_database.sh similarity index 100% rename from settings/scripts/create_database.sh rename to scripts/create_database.sh diff --git a/settings/scripts/download_cadets_e3.sh b/scripts/download_cadets_e3.sh similarity index 100% rename from settings/scripts/download_cadets_e3.sh rename to scripts/download_cadets_e3.sh diff --git a/settings/scripts/download_cadets_e5.sh b/scripts/download_cadets_e5.sh similarity index 100% rename from settings/scripts/download_cadets_e5.sh rename to scripts/download_cadets_e5.sh diff --git a/settings/scripts/download_clearscope_e3.sh b/scripts/download_clearscope_e3.sh similarity index 100% rename from settings/scripts/download_clearscope_e3.sh rename to scripts/download_clearscope_e3.sh diff --git a/settings/scripts/download_clearscope_e5.sh b/scripts/download_clearscope_e5.sh similarity index 100% rename from settings/scripts/download_clearscope_e5.sh rename to scripts/download_clearscope_e5.sh diff --git a/settings/scripts/download_theia_e3.sh b/scripts/download_theia_e3.sh similarity index 100% rename from settings/scripts/download_theia_e3.sh rename to scripts/download_theia_e3.sh diff --git a/settings/scripts/download_theia_e5.sh b/scripts/download_theia_e5.sh similarity index 100% rename from settings/scripts/download_theia_e5.sh rename to scripts/download_theia_e5.sh diff --git a/settings/scripts/e3_tools.sh b/scripts/e3_tools.sh similarity index 100% rename from settings/scripts/e3_tools.sh rename to scripts/e3_tools.sh diff --git a/settings/scripts/load_dumps.sh b/scripts/load_dumps.sh similarity index 100% rename from settings/scripts/load_dumps.sh rename to scripts/load_dumps.sh diff --git a/postgres-start.sh b/scripts/postgres-start.sh similarity index 95% rename from postgres-start.sh rename to scripts/postgres-start.sh index 1a5eb78b..7d00cd8e 100755 --- a/postgres-start.sh +++ b/scripts/postgres-start.sh @@ -74,10 +74,10 @@ else echo -e "${YELLOW}Warning: ./postgres/init-create-empty-databases.sh not found, skipping${NC}" fi -if [ -d "./settings/scripts" ]; then - BIND_MOUNTS="$BIND_MOUNTS --bind ./settings/scripts:/scripts" +if [ -d "./scripts" ]; then + BIND_MOUNTS="$BIND_MOUNTS --bind ./scripts:/scripts" else - echo -e "${YELLOW}Warning: ./settings/scripts directory not found, skipping${NC}" + echo -e "${YELLOW}Warning: ./scripts directory not found, skipping${NC}" fi # Always bind INPUT_DIR diff --git a/postgres-status.sh b/scripts/postgres-status.sh similarity index 100% rename from postgres-status.sh rename to scripts/postgres-status.sh diff --git a/postgres-stop.sh b/scripts/postgres-stop.sh similarity index 100% rename from postgres-stop.sh rename to scripts/postgres-stop.sh diff --git a/settings/scripts/uncompress_darpa_files.sh b/scripts/uncompress_darpa_files.sh similarity index 100% rename from settings/scripts/uncompress_darpa_files.sh rename to scripts/uncompress_darpa_files.sh From 6c211fbd358501a2deedb877a6291bd6b5e27284 Mon Sep 17 00:00:00 2001 From: Lorenzo Guerra Date: Mon, 4 Aug 2025 16:44:25 +0200 Subject: [PATCH 12/33] Removal of separate calls to set_seed for each task. 
The first call sets the seed for the rest of the execution and avoids a seed reset to the default value. The value was previously always set to 0 and no other seeds were allowed by the code. Finally, torch.use_deterministic_algorithms automatically overwrites torch.backends.cudnn.deterministic and torch.manual_seed already calls internally torch.cuca.manual_seed_all --- config/flash.yml | 2 -- config/kairos.yml | 1 - config/magic.yml | 1 - config/nodlink.yml | 2 -- config/orthrus.yml | 2 -- config/rcaid.yml | 2 -- config/threatrace.yml | 1 - config/tuned_components/tuned_alacarte.yml | 1 - config/tuned_components/tuned_doc2vec.yml | 1 - config/tuned_components/tuned_fasttext.yml | 1 - config/tuned_components/tuned_flash.yml | 1 - config/tuned_components/tuned_word2vec.yml | 1 - docs/scripts/args/args_detection.md | 1 - docs/scripts/args/args_featurization.md | 1 - environment.yaml | 1 + pidsmaker/config/config.py | 2 -- pidsmaker/config/pipeline.py | 1 + pidsmaker/detection/graph_preprocessing.py | 3 +- .../training_methods/inference_loop.py | 3 -- .../training_methods/training_loop.py | 4 +-- pidsmaker/featurization/feat_training.py | 4 --- .../feat_training_alacarte.py | 36 ++++++------------- .../feat_training_doc2vec.py | 3 +- .../feat_training_fasttext.py | 4 +-- .../feat_training_trw.py | 4 +-- .../feat_training_word2vec.py | 4 +-- pidsmaker/preprocessing/transformation.py | 2 -- pidsmaker/utils/utils.py | 17 ++++----- scripts/run.sh | 2 +- 29 files changed, 27 insertions(+), 81 deletions(-) diff --git a/config/flash.yml b/config/flash.yml index 42c8e340..303fa479 100644 --- a/config/flash.yml +++ b/config/flash.yml @@ -20,7 +20,6 @@ featurization: epochs: 10 # 300 training_split: train used_method: flash - use_seed: True flash: min_count: 1 workers: 15 @@ -41,7 +40,6 @@ detection: used_method: none gnn_training: used_method: default - use_seed: True deterministic: False num_epochs: 12 patience: 3 diff --git a/config/kairos.yml b/config/kairos.yml index e21386ea..e1131495 100644 --- a/config/kairos.yml +++ b/config/kairos.yml @@ -45,7 +45,6 @@ detection: used_method: none gnn_training: used_method: default - use_seed: True deterministic: False num_epochs: 12 patience: 3 diff --git a/config/magic.yml b/config/magic.yml index d857f350..6b3fc6c2 100644 --- a/config/magic.yml +++ b/config/magic.yml @@ -34,7 +34,6 @@ detection: used_method: none gnn_training: used_method: default - use_seed: True deterministic: False num_epochs: 12 patience: 3 diff --git a/config/nodlink.yml b/config/nodlink.yml index 046ee648..52cacd9d 100644 --- a/config/nodlink.yml +++ b/config/nodlink.yml @@ -18,7 +18,6 @@ featurization: feat_training: epochs: 100 emb_dim: 256 - use_seed: True training_split: train used_method: fasttext fasttext: @@ -45,7 +44,6 @@ detection: used_method: none gnn_training: used_method: default - use_seed: True deterministic: False num_epochs: 12 patience: 3 diff --git a/config/orthrus.yml b/config/orthrus.yml index 22506f89..3b3d045a 100644 --- a/config/orthrus.yml +++ b/config/orthrus.yml @@ -21,7 +21,6 @@ featurization: emb_dim: 128 epochs: 50 training_split: all - use_seed: True used_method: word2vec word2vec: alpha: 0.025 @@ -58,7 +57,6 @@ detection: used_method: none gnn_training: used_method: default - use_seed: True deterministic: False num_epochs: 12 patience: 3 diff --git a/config/rcaid.yml b/config/rcaid.yml index aff722c0..dff311d1 100644 --- a/config/rcaid.yml +++ b/config/rcaid.yml @@ -20,7 +20,6 @@ featurization: feat_training: epochs: 5 emb_dim: 128 - use_seed: True 
training_split: all used_method: doc2vec doc2vec: @@ -43,7 +42,6 @@ detection: used_method: none gnn_training: used_method: default - use_seed: True deterministic: False num_epochs: 12 patience: 3 diff --git a/config/threatrace.yml b/config/threatrace.yml index 6d064fe0..809267ab 100644 --- a/config/threatrace.yml +++ b/config/threatrace.yml @@ -34,7 +34,6 @@ detection: used_method: none gnn_training: used_method: default - use_seed: True deterministic: False num_epochs: 12 patience: 3 diff --git a/config/tuned_components/tuned_alacarte.yml b/config/tuned_components/tuned_alacarte.yml index 958e1002..2048c55a 100644 --- a/config/tuned_components/tuned_alacarte.yml +++ b/config/tuned_components/tuned_alacarte.yml @@ -2,7 +2,6 @@ featurization: feat_training: epochs: 10 emb_dim: 128 - use_seed: True training_split: train used_method: alacarte alacarte: diff --git a/config/tuned_components/tuned_doc2vec.yml b/config/tuned_components/tuned_doc2vec.yml index 4824ddec..93d4d8dd 100644 --- a/config/tuned_components/tuned_doc2vec.yml +++ b/config/tuned_components/tuned_doc2vec.yml @@ -2,7 +2,6 @@ featurization: feat_training: epochs: 10 emb_dim: 128 - use_seed: True training_split: train used_method: doc2vec doc2vec: diff --git a/config/tuned_components/tuned_fasttext.yml b/config/tuned_components/tuned_fasttext.yml index feb351fe..0e2a1b32 100644 --- a/config/tuned_components/tuned_fasttext.yml +++ b/config/tuned_components/tuned_fasttext.yml @@ -2,7 +2,6 @@ featurization: feat_training: epochs: 10 emb_dim: 256 - use_seed: True training_split: train used_method: fasttext fasttext: diff --git a/config/tuned_components/tuned_flash.yml b/config/tuned_components/tuned_flash.yml index 01d1b57e..7c058644 100644 --- a/config/tuned_components/tuned_flash.yml +++ b/config/tuned_components/tuned_flash.yml @@ -4,7 +4,6 @@ featurization: epochs: 10 training_split: train used_method: flash - use_seed: True flash: min_count: 1 workers: 10 diff --git a/config/tuned_components/tuned_word2vec.yml b/config/tuned_components/tuned_word2vec.yml index 5f38ae1f..b8bb8696 100644 --- a/config/tuned_components/tuned_word2vec.yml +++ b/config/tuned_components/tuned_word2vec.yml @@ -3,7 +3,6 @@ featurization: epochs: 10 emb_dim: 128 training_split: train - use_seed: True used_method: word2vec word2vec: alpha: 0.025 diff --git a/docs/scripts/args/args_detection.md b/docs/scripts/args/args_detection.md index 2f888bd0..676ba1cf 100644 --- a/docs/scripts/args/args_detection.md +++ b/docs/scripts/args/args_detection.md @@ -45,7 +45,6 @@
  • gnn_training
      -
-    • use_seed: bool
    • deterministic: bool (19)
    • num_epochs: int
    • patience: int
    • diff --git a/docs/scripts/args/args_featurization.md b/docs/scripts/args/args_featurization.md index c404f12a..5bd8a81c 100644 --- a/docs/scripts/args/args_featurization.md +++ b/docs/scripts/args/args_featurization.md @@ -5,7 +5,6 @@
      • emb_dim: int (1)
      • epochs: int (2)
      •
-      • use_seed: bool
      • training_split: str (3)
      • multi_dataset_training: bool (4)
      • used_method: str (5)
      • diff --git a/environment.yaml b/environment.yaml index 7d0678d1..5563e2ce 100644 --- a/environment.yaml +++ b/environment.yaml @@ -88,6 +88,7 @@ dependencies: - gitdb==4.0.12 - gitpython==3.1.44 - graphviz==0.20.1 + - h5py==3.14.0 - identify==2.6.12 - idna==3.10 - igraph==0.11.5 diff --git a/pidsmaker/config/config.py b/pidsmaker/config/config.py index 8c356c87..aa2f8d1d 100644 --- a/pidsmaker/config/config.py +++ b/pidsmaker/config/config.py @@ -646,7 +646,6 @@ def __init__(self, type, vals: list = None, desc: str = None): int, desc="Epochs to train the embedding method. Arg not used by some methods.", ), - "use_seed": Arg(bool), "training_split": Arg( str, vals=OR(["train", "all"]), @@ -782,7 +781,6 @@ def __init__(self, type, vals: list = None, desc: str = None): }, }, "gnn_training": { - "use_seed": Arg(bool), "deterministic": Arg( bool, desc="Whether to force PyTorch to use deterministic algorithms.", diff --git a/pidsmaker/config/pipeline.py b/pidsmaker/config/pipeline.py index 6e646c0f..4d6291c6 100644 --- a/pidsmaker/config/pipeline.py +++ b/pidsmaker/config/pipeline.py @@ -146,6 +146,7 @@ def get_runtime_required_args(return_unknown_args=False, args=None): "--database_port", default="5432", help="The port number for Postgres (default: 5432)" ) parser.add_argument("--sweep_id", default="", help="ID of a wandb sweep for multi-agent runs") + parser.add_argument("--seed", default=0, help="Random seed for reproducibility") parser.add_argument( "--artifact_dir", default="./artifacts/", help="Destination folder for generated files" ) diff --git a/pidsmaker/detection/graph_preprocessing.py b/pidsmaker/detection/graph_preprocessing.py index 26ba214a..cc31954f 100644 --- a/pidsmaker/detection/graph_preprocessing.py +++ b/pidsmaker/detection/graph_preprocessing.py @@ -3,7 +3,7 @@ import torch from pidsmaker.utils.data_utils import load_all_datasets -from pidsmaker.utils.utils import get_device, log, log_start, set_seed +from pidsmaker.utils.utils import get_device, log, log_start def get_preprocessed_graphs(cfg): @@ -22,7 +22,6 @@ def get_preprocessed_graphs(cfg): def main(cfg): - set_seed(cfg) log_start(__file__) if cfg.detection.graph_preprocessing.save_on_disk: diff --git a/pidsmaker/detection/training_methods/inference_loop.py b/pidsmaker/detection/training_methods/inference_loop.py index 0847a97e..b67142c6 100644 --- a/pidsmaker/detection/training_methods/inference_loop.py +++ b/pidsmaker/detection/training_methods/inference_loop.py @@ -15,7 +15,6 @@ log, log_tqdm, ns_time_to_datetime_US, - set_seed, ) @@ -264,8 +263,6 @@ def test_node_level( def main(cfg, model, val_data, test_data, epoch, split, logging=True): - set_seed(cfg) - if split == "all": splits = [(val_data, "val"), (test_data, "test")] elif split == "val": diff --git a/pidsmaker/detection/training_methods/training_loop.py b/pidsmaker/detection/training_methods/training_loop.py index 06129ea5..8464e1b8 100644 --- a/pidsmaker/detection/training_methods/training_loop.py +++ b/pidsmaker/detection/training_methods/training_loop.py @@ -12,14 +12,12 @@ optimizer_factory, optimizer_few_shot_factory, ) -from pidsmaker.utils.utils import get_device, log, log_start, log_tqdm, set_seed +from pidsmaker.utils.utils import get_device, log, log_start, log_tqdm from . 
import inference_loop def main(cfg): - set_seed(cfg) - log_start(__file__) device = get_device(cfg) use_cuda = device == torch.device("cuda") diff --git a/pidsmaker/featurization/feat_training.py b/pidsmaker/featurization/feat_training.py index c10374b8..deb145ab 100644 --- a/pidsmaker/featurization/feat_training.py +++ b/pidsmaker/featurization/feat_training.py @@ -1,5 +1,3 @@ -from pidsmaker.utils.utils import set_seed - from .feat_training_methods import ( build_trw, feat_training_alacarte, @@ -13,8 +11,6 @@ def main(cfg): - set_seed(cfg) - method = cfg.featurization.feat_training.used_method.strip() if method == "alacarte": build_random_walks.main(cfg) diff --git a/pidsmaker/featurization/feat_training_methods/feat_training_alacarte.py b/pidsmaker/featurization/feat_training_methods/feat_training_alacarte.py index ad8b4d69..b427b729 100644 --- a/pidsmaker/featurization/feat_training_methods/feat_training_alacarte.py +++ b/pidsmaker/featurization/feat_training_methods/feat_training_alacarte.py @@ -444,8 +444,6 @@ def feat_training_for_one_split( num_workers = cfg.featurization.feat_training.alacarte.num_workers compute_loss = cfg.featurization.feat_training.alacarte.compute_loss add_paths = cfg.featurization.feat_training.alacarte.add_paths - use_seed = cfg.featurization.feat_training.use_seed - SEED = 0 log_dir = out_dir @@ -485,29 +483,17 @@ def feat_training_for_one_split( # Training using Word2Vec if needed # ===-----------------------------------------------------------------------=== if model_input is None: - if use_seed: - model = Word2Vec( - paths, - vector_size=emb_dim, - window=window_size, - min_count=min_count, - sg=use_skip_gram, - workers=num_workers, - epochs=epochs, - compute_loss=compute_loss, - seed=SEED, - ) - else: - model = Word2Vec( - paths, - vector_size=emb_dim, - window=window_size, - min_count=min_count, - sg=use_skip_gram, - workers=num_workers, - epochs=epochs, - compute_loss=compute_loss, - ) + model = Word2Vec( + paths, + vector_size=emb_dim, + window=window_size, + min_count=min_count, + sg=use_skip_gram, + workers=num_workers, + epochs=epochs, + compute_loss=compute_loss, + seed=cfg.seed, + ) else: log("Loading existing model from: {}".format(model_input)) model = Word2Vec.load(model_input) diff --git a/pidsmaker/featurization/feat_training_methods/feat_training_doc2vec.py b/pidsmaker/featurization/feat_training_methods/feat_training_doc2vec.py index 2d0b35c3..3fb67883 100644 --- a/pidsmaker/featurization/feat_training_methods/feat_training_doc2vec.py +++ b/pidsmaker/featurization/feat_training_methods/feat_training_doc2vec.py @@ -18,9 +18,8 @@ def doc2vec( alpha: float, dm: int = 1, ): - SEED = 0 model = Doc2Vec( - vector_size=emb_dim, alpha=alpha, min_count=1, dm=dm, compute_loss=True, seed=SEED + vector_size=emb_dim, alpha=alpha, min_count=1, dm=dm, compute_loss=True, seed=cfg.seed ) model.build_vocab(tagged_data) diff --git a/pidsmaker/featurization/feat_training_methods/feat_training_fasttext.py b/pidsmaker/featurization/feat_training_methods/feat_training_fasttext.py index 07575efa..b368bd98 100644 --- a/pidsmaker/featurization/feat_training_methods/feat_training_fasttext.py +++ b/pidsmaker/featurization/feat_training_methods/feat_training_fasttext.py @@ -23,8 +23,6 @@ def train_fasttext(corpus, cfg): min_count = cfg.featurization.feat_training.fasttext.min_count num_workers = cfg.featurization.feat_training.fasttext.num_workers negative = cfg.featurization.feat_training.fasttext.negative - use_seed = cfg.featurization.feat_training.use_seed - SEED 
= 0 use_pretrained_fb_model = cfg.featurization.feat_training.fasttext.use_pretrained_fb_model @@ -44,7 +42,7 @@ def train_fasttext(corpus, cfg): alpha=alpha, window=window_size, negative=negative, - seed=SEED, + seed=cfg.seed, ) model.build_vocab(corpus, update=use_pretrained_fb_model) diff --git a/pidsmaker/featurization/feat_training_methods/feat_training_trw.py b/pidsmaker/featurization/feat_training_methods/feat_training_trw.py index d8ba7dc0..e5a45f6f 100644 --- a/pidsmaker/featurization/feat_training_methods/feat_training_trw.py +++ b/pidsmaker/featurization/feat_training_methods/feat_training_trw.py @@ -41,8 +41,6 @@ def train_word2vec(corpus, model_save_path, cfg): epochs = cfg.featurization.feat_training.epochs compute_loss = cfg.featurization.feat_training.temporal_rw.compute_loss negative = cfg.featurization.feat_training.temporal_rw.negative - use_seed = cfg.featurization.feat_training.use_seed - SEED = 0 model = Word2Vec( corpus, @@ -54,7 +52,7 @@ def train_word2vec(corpus, model_save_path, cfg): epochs=1, compute_loss=compute_loss, negative=negative, - seed=SEED, + seed=cfg.seed, ) epoch_loss = model.get_latest_training_loss() diff --git a/pidsmaker/featurization/feat_training_methods/feat_training_word2vec.py b/pidsmaker/featurization/feat_training_methods/feat_training_word2vec.py index 42216b79..08c7a751 100644 --- a/pidsmaker/featurization/feat_training_methods/feat_training_word2vec.py +++ b/pidsmaker/featurization/feat_training_methods/feat_training_word2vec.py @@ -16,8 +16,6 @@ def train_word2vec(corpus, cfg, model_save_path): epochs = cfg.featurization.feat_training.epochs compute_loss = cfg.featurization.feat_training.word2vec.compute_loss negative = cfg.featurization.feat_training.word2vec.negative - use_seed = cfg.featurization.feat_training.use_seed - SEED = 0 model = Word2Vec( corpus, @@ -30,7 +28,7 @@ def train_word2vec(corpus, cfg, model_save_path): epochs=1, compute_loss=compute_loss, negative=negative, - seed=SEED, + seed=cfg.seed, ) epoch_loss = model.get_latest_training_loss() diff --git a/pidsmaker/preprocessing/transformation.py b/pidsmaker/preprocessing/transformation.py index a6d7d43d..ccf2bb7d 100644 --- a/pidsmaker/preprocessing/transformation.py +++ b/pidsmaker/preprocessing/transformation.py @@ -16,7 +16,6 @@ load_graphs_for_days, log_start, log_tqdm, - set_seed, ) @@ -110,7 +109,6 @@ def main_from_config(cfg): def main(cfg): - set_seed(cfg) log_start(__file__) multi_datasets = get_multi_datasets(cfg) diff --git a/pidsmaker/utils/utils.py b/pidsmaker/utils/utils.py index 731b90ce..8932f4a7 100644 --- a/pidsmaker/utils/utils.py +++ b/pidsmaker/utils/utils.py @@ -652,16 +652,13 @@ def log_helper(label, dataset): def set_seed(cfg): - if cfg.detection.gnn_training.use_seed: - seed = 0 - random.seed(seed) - np.random.seed(seed) - - torch.manual_seed(seed) - torch.cuda.manual_seed_all(seed) - torch.backends.cudnn.deterministic = True - torch.backends.cudnn.benchmark = False - + seed = cfg.seed + os.environ["PYTHONHASHSEED"] = str(seed) + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.backends.cudnn.benchmark = False + if cfg.detection.gnn_training.deterministic: torch.use_deterministic_algorithms(True, warn_only=True) diff --git a/scripts/run.sh b/scripts/run.sh index a9a8c5af..4d2ddc55 100755 --- a/scripts/run.sh +++ b/scripts/run.sh @@ -15,4 +15,4 @@ for arg in "$@"; do done # Execute the Python script with the passed arguments -PYTHONHASHSEED=0 nohup python ../pidsmaker/main.py $args --wandb & +nohup python 
../pidsmaker/main.py $args --wandb & From f478cc02e51aa673363f770a80a97877f5349213 Mon Sep 17 00:00:00 2001 From: Lorenzo Guerra Date: Mon, 4 Aug 2025 17:04:44 +0200 Subject: [PATCH 13/33] Minor fixes --- .gitignore | 1 + pidsmaker/config/pipeline.py | 5 +++-- pidsmaker/main.py | 13 ++++--------- 3 files changed, 8 insertions(+), 11 deletions(-) diff --git a/.gitignore b/.gitignore index a84235d0..86a7b927 100644 --- a/.gitignore +++ b/.gitignore @@ -179,3 +179,4 @@ postgres_lock/ # Postgres directories postgres_config/ postgres_run/ +postgres_log/ diff --git a/pidsmaker/config/pipeline.py b/pidsmaker/config/pipeline.py index 4d6291c6..7dffc84c 100644 --- a/pidsmaker/config/pipeline.py +++ b/pidsmaker/config/pipeline.py @@ -106,7 +106,7 @@ def get_runtime_required_args(return_unknown_args=False, args=None): ) parser.add_argument("--wandb", action="store_true", help="Whether to submit logs to wandb") parser.add_argument( - "--project", type=str, default="", help="Name of the wandb project (optional)" + "--project", type=str, default="PIDSMaker", help="Name of the wandb project" ) parser.add_argument("--exp", type=str, default="", help="Name of the experiment") parser.add_argument( @@ -457,7 +457,8 @@ def get_yml_cfg(args): # Inits with default configurations cfg = get_default_cfg(args) - + # Set seed for reproducibility + cfg.seed = args.seed # Checks that all configurations are valid and merge yml file to cfg yml_file = get_yml_file(args.model) merge_cfg_and_check_syntax(cfg, yml_file) diff --git a/pidsmaker/main.py b/pidsmaker/main.py index 561c59c3..4e961710 100644 --- a/pidsmaker/main.py +++ b/pidsmaker/main.py @@ -276,25 +276,20 @@ def run_pipeline_from_sweep(cfg): ) tags = args.tags.split(",") if args.tags != "" else [args.model] - if args.project != "": - project = args.project - else: - project = "PIDSMaker" + cfg = get_yml_cfg(args) wandb.init( mode=("online" if (args.wandb and args.tuning_mode == "none") else "disabled"), - project=project, + project=args.project, name=exp_name, tags=tags, + config=clean_cfg_for_log(cfg), ) if len(unknown_args) > 0: raise argparse.ArgumentTypeError(f"Unknown args {unknown_args}") - cfg = get_yml_cfg(args) - wandb.config.update(clean_cfg_for_log(cfg)) - - main(cfg, project=project, exp=exp_name, sweep_id=args.sweep_id) + main(cfg, project=args.project, exp=exp_name, sweep_id=args.sweep_id) wandb.finish() From 8ee45ed69c00c341b9d59f7412ed8a87d6371e8f Mon Sep 17 00:00:00 2001 From: Lorenzo Guerra Date: Mon, 4 Aug 2025 17:26:10 +0200 Subject: [PATCH 14/33] added README for running using Singularity --- docs/docs/docs/docs/introduction.md | 0 docs/docs/singularity_install.md | 39 ++++++++++++++++++++ environment.yaml => scripts/environment.yaml | 0 scripts/load_dumps.sh | 16 +++----- 4 files changed, 44 insertions(+), 11 deletions(-) create mode 100644 docs/docs/docs/docs/introduction.md create mode 100644 docs/docs/singularity_install.md rename environment.yaml => scripts/environment.yaml (100%) diff --git a/docs/docs/docs/docs/introduction.md b/docs/docs/docs/docs/introduction.md new file mode 100644 index 00000000..e69de29b diff --git a/docs/docs/singularity_install.md b/docs/docs/singularity_install.md new file mode 100644 index 00000000..c54fb620 --- /dev/null +++ b/docs/docs/singularity_install.md @@ -0,0 +1,39 @@ +# Install Framework using Singularity + +For quick installation on environments where Docker is not available (such as HPC clusters), you can use Singularity. This guide assumes Singularity is already installed on your system. 
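Once the steps below are complete, the framework can be pointed at the Singularity-managed PostgreSQL instance through the database command line flags; a minimal example, assuming the instance listens on `localhost:5432` with the default `postgres` user and password (adjust these values to your setup):

```bash
# SYSTEM and DATASET are placeholders for the chosen system and dataset names
python pidsmaker/main.py SYSTEM DATASET \
    --database_host localhost \
    --database_port 5432 \
    --database_user postgres \
    --database_password postgres
```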
+ +## Setup Process + +### 1. Database Setup +The Makefile in `./scripts/Makefile` provides easy environment setup: + +```bash +make full-setup +``` + +This command will: +- Download and run a PostgreSQL container through Singularity +- Load database dumps by executing the `load_dumps.sh` script + +### 2. Container Management +Once the database is ready: +- Stop the container: `make down` +- Start it again: `make up` + +### 3. Dependencies Installation +Install all required dependencies using conda: + +```bash +conda env create -f ./scripts/environment.yml +conda activate pids +``` + +## Running the Framework + +Once both the database and conda environment are ready, run the framework with: + +```bash +python pidsmaker/main.py SYSTEM DATASET +``` + +For more details, see the [introduction](introduction.md). \ No newline at end of file diff --git a/environment.yaml b/scripts/environment.yaml similarity index 100% rename from environment.yaml rename to scripts/environment.yaml diff --git a/scripts/load_dumps.sh b/scripts/load_dumps.sh index b476e228..da642ee7 100755 --- a/scripts/load_dumps.sh +++ b/scripts/load_dumps.sh @@ -1,11 +1,10 @@ #!/bin/bash -set -e # Exit on any error +set -e echo "Starting database dump restoration..." for dump_file in /data/*.dump; do - # Check if any dump files exist if [ ! -f "$dump_file" ]; then echo "No .dump files found in /data/ directory" break @@ -15,21 +14,18 @@ for dump_file in /data/*.dump; do echo "Processing $dump_file -> database '$db_name'" - # Check if database already exists and has data if psql -U postgres -h localhost -p 5432 -lqt | cut -d \| -f 1 | grep -qw "$db_name"; then echo "Database '$db_name' already exists. Checking if it has data..." - # Count tables in the database table_count=$(psql -U postgres -h localhost -p 5432 -d "$db_name" -t -c "SELECT COUNT(*) FROM information_schema.tables WHERE table_schema = 'public';" 2>/dev/null || echo "0") if [ "$table_count" -gt 0 ]; then - echo "✓ Database '$db_name' already has $table_count tables. Skipping restoration." + echo "Database '$db_name' already has $table_count tables. Skipping restoration." continue else echo "Database '$db_name' exists but is empty. Proceeding with restoration..." fi else - # Create database if it doesn't exist echo "Creating database '$db_name'..." psql -U postgres -h localhost -p 5432 -c "CREATE DATABASE \"$db_name\";" 2>/dev/null || { echo "Warning: Could not create database '$db_name' (may already exist)" @@ -40,20 +36,18 @@ for dump_file in /data/*.dump; do # Use --clean --if-exists to handle existing objects gracefully if pg_restore -U postgres -h localhost -p 5432 --clean --if-exists --no-owner --no-privileges -d "$db_name" "$dump_file" 2>/dev/null; then - echo "✓ Successfully restored $dump_file" + echo "Successfully restored $dump_file" - # Verify restoration final_table_count=$(psql -U postgres -h localhost -p 5432 -d "$db_name" -t -c "SELECT COUNT(*) FROM information_schema.tables WHERE table_schema = 'public';" 2>/dev/null || echo "0") echo " Database '$db_name' now has $final_table_count tables" else - echo "✗ Warning: pg_restore reported errors for $dump_file (this may be normal for some dump formats)" + echo "Warning: pg_restore reported errors for $dump_file (this may be normal for some dump formats)" fi echo "" done -echo "Database dump restoration completed!" 
+echo "Database dump restoration completed" -# Show summary of all databases echo "Summary of available databases:" psql -U postgres -h localhost -p 5432 -c "\l" | grep -E "^\s+[a-zA-Z]" | head -20 From 326eb1f5b6877393335e7650ae7217f4f4d255e1 Mon Sep 17 00:00:00 2001 From: Lorenzo Guerra Date: Tue, 19 Aug 2025 00:01:51 +0200 Subject: [PATCH 15/33] added comment to clarify that all code for "few_shot" is currently unused --- pidsmaker/factory.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pidsmaker/factory.py b/pidsmaker/factory.py index d1536d23..aa0a25cd 100644 --- a/pidsmaker/factory.py +++ b/pidsmaker/factory.py @@ -467,6 +467,7 @@ def objective_factory(cfg, in_dim, graph_reindexer, device, objective_cfg=None): raise ValueError(f"Invalid objective {objective}") # We wrap objectives into this class to calculate some metrics on validation set easily + # This is useful only if use_few_shot is True is_edge_type_prediction = objective_cfg.used_methods.strip() == "predict_edge_type" objectives = [ ValidationWrapper( From 5e162b11652c8ddaa968e98833b85a12c8af6cfc Mon Sep 17 00:00:00 2001 From: Lorenzo Guerra Date: Tue, 19 Aug 2025 11:43:00 +0200 Subject: [PATCH 16/33] Removed useless code: edge_list is always None, so it's always redefined as edge_df --- pidsmaker/detection/training_methods/inference_loop.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/pidsmaker/detection/training_methods/inference_loop.py b/pidsmaker/detection/training_methods/inference_loop.py index b67142c6..1a0ece38 100644 --- a/pidsmaker/detection/training_methods/inference_loop.py +++ b/pidsmaker/detection/training_methods/inference_loop.py @@ -29,7 +29,6 @@ def test_edge_level( ): model.eval() - edge_list = None start_time = data.t[0] all_losses = [] @@ -62,21 +61,17 @@ def test_edge_level( "edge_type": edge_types.astype(int), } ) - if edge_list is None: - edge_list = edge_df - else: - edge_list = pd.concat([edge_list, edge_df]) # Here is a checkpoint, which records all edge losses in the current time window time_interval = ( - ns_time_to_datetime_US(start_time) + "~" + ns_time_to_datetime_US(edge_list["time"].max()) + ns_time_to_datetime_US(start_time) + "~" + ns_time_to_datetime_US(edge_df["time"].max()) ) logs_dir = os.path.join(cfg.detection.gnn_training._edge_losses_dir, split, model_epoch_file) os.makedirs(logs_dir, exist_ok=True) csv_file = os.path.join(logs_dir, time_interval + ".csv") - edge_list.to_csv(csv_file, sep=",", header=True, index=False, encoding="utf-8") + edge_df.to_csv(csv_file, sep=",", header=True, index=False, encoding="utf-8") return all_losses From 135fdeffda9b24c8fd33dd56a39c31c75fad480c Mon Sep 17 00:00:00 2001 From: Lorenzo Guerra Date: Tue, 19 Aug 2025 13:39:49 +0200 Subject: [PATCH 17/33] removed unused variable --- pidsmaker/factory.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pidsmaker/factory.py b/pidsmaker/factory.py index aa0a25cd..cc52ec35 100644 --- a/pidsmaker/factory.py +++ b/pidsmaker/factory.py @@ -81,8 +81,6 @@ def encoder_factory(cfg, msg_dim, in_dim, device, max_node_num, graph_reindexer) if use_tgn: in_dim = tgn_memory_dim - original_edge_dim = edge_dim - for method in map( lambda x: x.strip(), cfg.detection.gnn_training.encoder.used_methods.replace("-", ",").split(","), From 60516c52f9a0cefd1114140984be3a04e71f4aa9 Mon Sep 17 00:00:00 2001 From: Lorenzo Guerra Date: Thu, 21 Aug 2025 18:36:25 +0200 Subject: [PATCH 18/33] revert: set PYTHONHASHSEED before launching Python interpreter again --- pidsmaker/utils/utils.py | 1 - 
scripts/run.sh | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/pidsmaker/utils/utils.py b/pidsmaker/utils/utils.py index 8932f4a7..13654ded 100644 --- a/pidsmaker/utils/utils.py +++ b/pidsmaker/utils/utils.py @@ -653,7 +653,6 @@ def log_helper(label, dataset): def set_seed(cfg): seed = cfg.seed - os.environ["PYTHONHASHSEED"] = str(seed) random.seed(seed) np.random.seed(seed) torch.manual_seed(seed) diff --git a/scripts/run.sh b/scripts/run.sh index 4d2ddc55..4e3be119 100755 --- a/scripts/run.sh +++ b/scripts/run.sh @@ -15,4 +15,4 @@ for arg in "$@"; do done # Execute the Python script with the passed arguments -nohup python ../pidsmaker/main.py $args --wandb & +PYTHONHASHSEED=0 nohup python ../pidsmaker/main.py $args --wandb & \ No newline at end of file From 17f00d9026eab52974f30ad177fbc651f113f002 Mon Sep 17 00:00:00 2001 From: Lorenzo Guerra Date: Wed, 27 Aug 2025 10:24:14 +0200 Subject: [PATCH 19/33] fix seed argument type --- pidsmaker/config/pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pidsmaker/config/pipeline.py b/pidsmaker/config/pipeline.py index 7dffc84c..a4ce1e4f 100644 --- a/pidsmaker/config/pipeline.py +++ b/pidsmaker/config/pipeline.py @@ -146,7 +146,7 @@ def get_runtime_required_args(return_unknown_args=False, args=None): "--database_port", default="5432", help="The port number for Postgres (default: 5432)" ) parser.add_argument("--sweep_id", default="", help="ID of a wandb sweep for multi-agent runs") - parser.add_argument("--seed", default=0, help="Random seed for reproducibility") + parser.add_argument("--seed", type=int, default=0, help="Random seed for reproducibility") parser.add_argument( "--artifact_dir", default="./artifacts/", help="Destination folder for generated files" ) From 7fbdea504e50bbd274de51727010a32073683f97 Mon Sep 17 00:00:00 2001 From: Lorenzo Guerra Date: Wed, 27 Aug 2025 12:44:40 +0200 Subject: [PATCH 20/33] remove deprecated punkt in favor of punkt_tab and keep the tokenizer data in the project folder --- .gitignore | 3 +++ pidsmaker/utils/utils.py | 3 ++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 86a7b927..375cde6d 100644 --- a/.gitignore +++ b/.gitignore @@ -180,3 +180,6 @@ postgres_lock/ postgres_config/ postgres_run/ postgres_log/ + +# tokenizer data +nltk_data/ diff --git a/pidsmaker/utils/utils.py b/pidsmaker/utils/utils.py index 13654ded..ace84aad 100644 --- a/pidsmaker/utils/utils.py +++ b/pidsmaker/utils/utils.py @@ -18,7 +18,8 @@ from nltk.tokenize import word_tokenize from tqdm import tqdm -nltk.download("punkt", quiet=True) +nltk.download("punkt_tab", quiet=True, download_dir="./nltk_data") +nltk.data.path.append("./nltk_data") from pidsmaker.config import update_cfg_for_multi_dataset From 1c7a39364f24db5a3a3c1496b6daae8dec050f5f Mon Sep 17 00:00:00 2001 From: Lorenzo Guerra Date: Wed, 27 Aug 2025 15:59:20 +0200 Subject: [PATCH 21/33] code formatting with ruff --- .../evaluation_methods/node_evaluation.py | 110 +++++------------- pidsmaker/tgn.py | 4 +- pidsmaker/utils/data_utils.py | 2 +- 3 files changed, 31 insertions(+), 85 deletions(-) diff --git a/pidsmaker/detection/evaluation_methods/node_evaluation.py b/pidsmaker/detection/evaluation_methods/node_evaluation.py index a8b85702..42e21ab0 100644 --- a/pidsmaker/detection/evaluation_methods/node_evaluation.py +++ b/pidsmaker/detection/evaluation_methods/node_evaluation.py @@ -37,7 +37,7 @@ def get_node_predictions(val_tw_path, test_tw_path, cfg, **kwargs): log(f"Loading data from 
{test_tw_path}...") threshold_method = cfg.detection.evaluation.node_evaluation.threshold_method - if threshold_method == "magic": # data leaking by using test data + if threshold_method == "magic": # data leaking by using test data thr = get_threshold(test_tw_path, threshold_method) else: thr = get_threshold(val_tw_path, threshold_method) @@ -48,9 +48,7 @@ def get_node_predictions(val_tw_path, test_tw_path, cfg, **kwargs): node_to_max_loss = defaultdict(int) filelist = listdir_sorted(test_tw_path) - for tw, file in enumerate( - log_tqdm(sorted(filelist), desc="Compute labels") - ): + for tw, file in enumerate(log_tqdm(sorted(filelist), desc="Compute labels")): file = os.path.join(test_tw_path, file) df = pd.read_csv(file).to_dict(orient="records") for line in df: @@ -74,9 +72,7 @@ def get_node_predictions(val_tw_path, test_tw_path, cfg, **kwargs): # For plotting the scores of seen and unseen nodes graph_dir = cfg.preprocessing.transformation._graphs_dir - train_set_paths = get_all_files_from_folders( - graph_dir, cfg.dataset.train_files - ) + train_set_paths = get_all_files_from_folders(graph_dir, cfg.dataset.train_files) train_node_set = set() for train_path in train_set_paths: @@ -91,9 +87,7 @@ def get_node_predictions(val_tw_path, test_tw_path, cfg, **kwargs): ) results[node_id]["score"] = pred_score - results[node_id]["tw_with_max_loss"] = node_to_max_loss_tw.get( - node_id, -1 - ) + results[node_id]["tw_with_max_loss"] = node_to_max_loss_tw.get(node_id, -1) results[node_id]["y_true"] = int(node_id in ground_truth_nids) results[node_id]["is_seen"] = int(str(node_id) in train_node_set) @@ -126,9 +120,7 @@ def get_node_predictions_node_level(val_tw_path, test_tw_path, cfg, **kwargs): node_to_max_loss = defaultdict(int) filelist = listdir_sorted(test_tw_path) - for tw, file in enumerate( - log_tqdm(sorted(filelist), desc="Compute labels") - ): + for tw, file in enumerate(log_tqdm(sorted(filelist), desc="Compute labels")): file = os.path.join(test_tw_path, file) df = pd.read_csv(file).to_dict(orient="records") for line in df: @@ -139,13 +131,9 @@ def get_node_predictions_node_level(val_tw_path, test_tw_path, cfg, **kwargs): node_to_values[node]["tw"].append(tw) if "threatrace_score" in line: - node_to_values[node]["threatrace_score"].append( - line["threatrace_score"] - ) + node_to_values[node]["threatrace_score"].append(line["threatrace_score"]) if "correct_pred" in line: - node_to_values[node]["correct_pred"].append( - line["correct_pred"] - ) + node_to_values[node]["correct_pred"].append(line["correct_pred"]) if "flash_score" in line: node_to_values[node]["flash_score"].append(line["flash_score"]) if "magic_score" in line: @@ -157,9 +145,7 @@ def get_node_predictions_node_level(val_tw_path, test_tw_path, cfg, **kwargs): # For plotting the scores of seen and unseen nodes graph_dir = cfg.preprocessing.transformation._graphs_dir - train_set_paths = get_all_files_from_folders( - graph_dir, cfg.dataset.train_files - ) + train_set_paths = get_all_files_from_folders(graph_dir, cfg.dataset.train_files) train_node_set = set() for train_path in train_set_paths: @@ -172,10 +158,7 @@ def get_node_predictions_node_level(val_tw_path, test_tw_path, cfg, **kwargs): threatrace_label = 0 flash_label = 0 detected_tw = None - if ( - cfg.detection.evaluation.node_evaluation.threshold_method - == "threatrace" - ): + if cfg.detection.evaluation.node_evaluation.threshold_method == "threatrace": max_score = 0 pred_score = max(losses["threatrace_score"]) @@ -187,9 +170,7 @@ def 
get_node_predictions_node_level(val_tw_path, test_tw_path, cfg, **kwargs): max_score = score detected_tw = tw - elif ( - cfg.detection.evaluation.node_evaluation.threshold_method == "flash" - ): + elif cfg.detection.evaluation.node_evaluation.threshold_method == "flash": max_score = 0 pred_score = max(losses["flash_score"]) @@ -201,9 +182,7 @@ def get_node_predictions_node_level(val_tw_path, test_tw_path, cfg, **kwargs): max_score = score detected_tw = tw - elif ( - cfg.detection.evaluation.node_evaluation.threshold_method == "magic" - ): + elif cfg.detection.evaluation.node_evaluation.threshold_method == "magic": max_score = 0 pred_score = max(losses["magic_score"]) @@ -220,9 +199,7 @@ def get_node_predictions_node_level(val_tw_path, test_tw_path, cfg, **kwargs): ) results[node_id]["score"] = pred_score - results[node_id]["tw_with_max_loss"] = node_to_max_loss_tw.get( - node_id, -1 - ) + results[node_id]["tw_with_max_loss"] = node_to_max_loss_tw.get(node_id, -1) results[node_id]["y_true"] = int(node_id in ground_truth_nids) results[node_id]["is_seen"] = int(str(node_id) in train_node_set) @@ -230,8 +207,7 @@ def get_node_predictions_node_level(val_tw_path, test_tw_path, cfg, **kwargs): detected_tw = detected_tw or node_to_max_loss_tw.get(node_id, None) if detected_tw is not None: results[node_id]["time_range"] = [ - datetime_to_ns_time_US_handle_nano(tw) - for tw in filelist[detected_tw].split("~") + datetime_to_ns_time_US_handle_nano(tw) for tw in filelist[detected_tw].split("~") ] else: results[node_id]["time_range"] = None @@ -239,15 +215,9 @@ def get_node_predictions_node_level(val_tw_path, test_tw_path, cfg, **kwargs): if use_kmeans: # in this mode, we add the label after results[node_id]["y_hat"] = 0 else: - if ( - cfg.detection.evaluation.node_evaluation.threshold_method - == "threatrace" - ): + if cfg.detection.evaluation.node_evaluation.threshold_method == "threatrace": results[node_id]["y_hat"] = threatrace_label - elif ( - cfg.detection.evaluation.node_evaluation.threshold_method - == "flash" - ): + elif cfg.detection.evaluation.node_evaluation.threshold_method == "flash": results[node_id]["y_hat"] = flash_label else: results[node_id]["y_hat"] = int(pred_score > thr) @@ -263,11 +233,7 @@ def get_node_predictions_node_level(val_tw_path, test_tw_path, cfg, **kwargs): def analyze_false_positives( y_truth, y_preds, pred_scores, max_val_loss_tw, nodes, tw_to_malicious_nodes ): - fp_indices = [ - i - for i, (true, pred) in enumerate(zip(y_truth, y_preds)) - if pred and not true - ] + fp_indices = [i for i, (true, pred) in enumerate(zip(y_truth, y_preds)) if pred and not true] malicious_tws = set(tw_to_malicious_nodes.keys()) num_fps_in_malicious_tw = 0 @@ -276,9 +242,7 @@ def analyze_false_positives( num_fps_in_malicious_tw += int(is_in_malicious_tw) fp_in_malicious_tw_ratio = ( - num_fps_in_malicious_tw / len(fp_indices) - if len(fp_indices) > 0 - else float("nan") + num_fps_in_malicious_tw / len(fp_indices) if len(fp_indices) > 0 else float("nan") ) return fp_in_malicious_tw_ratio @@ -296,15 +260,11 @@ def main( else: get_preds_fn = get_node_predictions - results, thr = get_preds_fn( - cfg=cfg, val_tw_path=val_tw_path, test_tw_path=test_tw_path - ) + results, thr = get_preds_fn(cfg=cfg, val_tw_path=val_tw_path, test_tw_path=test_tw_path) # save results for future checking os.makedirs(cfg.detection.evaluation._results_dir, exist_ok=True) - results_save_dir = os.path.join( - cfg.detection.evaluation._results_dir, "results.pth" - ) + results_save_dir = 
os.path.join(cfg.detection.evaluation._results_dir, "results.pth") torch.save(results, results_save_dir) log(f"Resutls saved to {results_save_dir}") @@ -318,15 +278,9 @@ def main( ) # average detection precision scores_img_file = os.path.join(out_dir, f"scores_{model_epoch_dir}.png") # simple_scores_img_file = os.path.join(out_dir, f"simple_scores_{model_epoch_dir}.png") - neat_scores_img_file = os.path.join( - out_dir, f"neat_scores_{model_epoch_dir}.svg" - ) - seen_score_img_file = os.path.join( - out_dir, f"seen_score_{model_epoch_dir}.png" - ) - discrim_img_file = os.path.join( - out_dir, f"discrim_curve_{model_epoch_dir}.png" - ) + neat_scores_img_file = os.path.join(out_dir, f"neat_scores_{model_epoch_dir}.svg") + seen_score_img_file = os.path.join(out_dir, f"seen_score_{model_epoch_dir}.png") + discrim_img_file = os.path.join(out_dir, f"discrim_curve_{model_epoch_dir}.png") attack_to_GPs = get_GP_of_each_attack(cfg) attack_to_TPs = defaultdict(int) @@ -370,13 +324,9 @@ def main( adp_score = plot_detected_attacks_vs_precision( pred_scores, nodes, node2attacks, y_truth, adp_img_file ) - discrim_scores = compute_discrimination_score( - pred_scores, nodes, node2attacks, y_truth - ) + discrim_scores = compute_discrimination_score(pred_scores, nodes, node2attacks, y_truth) plot_discrimination_metric(pred_scores, y_truth, discrim_img_file) - discrim_tp = compute_discrimination_tp( - pred_scores, nodes, node2attacks, y_truth - ) + discrim_tp = compute_discrimination_tp(pred_scores, nodes, node2attacks, y_truth) # plot_simple_scores(pred_scores, y_truth, simple_scores_img_file) plot_scores_with_paths_node_level( pred_scores, @@ -389,9 +339,7 @@ def main( cfg, thr, ) - plot_scores_neat( - pred_scores, y_truth, nodes, node2attacks, neat_scores_img_file, thr - ) + plot_scores_neat(pred_scores, y_truth, nodes, node2attacks, neat_scores_img_file, thr) # plot_score_seen(pred_scores, is_seen, seen_score_img_file) stats = classifier_evaluation(y_truth, y_preds, pred_scores) @@ -412,9 +360,7 @@ def main( tps_in_atts.append((att, tps)) stats["percent_detected_attacks"] = ( - round(len(attack_to_TPs) / len(attack_to_GPs), 2) - if len(attack_to_GPs) > 0 - else 0 + round(len(attack_to_TPs) / len(attack_to_GPs), 2) if len(attack_to_GPs) > 0 else 0 ) fps, tps, precision, recall = get_metrics_if_all_attacks_detected( @@ -430,9 +376,7 @@ def main( for k, v in discrim_scores.items(): stats[k] = round(v, 4) - attack2tps = get_detected_tps_node_level( - pred_scores, nodes, node2attacks, y_truth, cfg - ) + attack2tps = get_detected_tps_node_level(pred_scores, nodes, node2attacks, y_truth, cfg) for attack, detected_tps in attack2tps.items(): stats[f"tps_{attack}"] = str(detected_tps) diff --git a/pidsmaker/tgn.py b/pidsmaker/tgn.py index 1436dac8..80bdf811 100644 --- a/pidsmaker/tgn.py +++ b/pidsmaker/tgn.py @@ -392,7 +392,9 @@ def insert(self, src: Tensor, dst: Tensor): # Compute cumulative start indices cum_edge_counts = torch.cat([torch.tensor([0], device=nodes.device), edge_counts.cumsum(0)]) - local_slots = torch.arange(nodes.size(0), device=nodes.device) - cum_edge_counts[self._assoc[nodes]] + local_slots = ( + torch.arange(nodes.size(0), device=nodes.device) - cum_edge_counts[self._assoc[nodes]] + ) dense_id = local_slots + (self._assoc[nodes] * temp_size) # Initialize dense tensors with temporary size diff --git a/pidsmaker/utils/data_utils.py b/pidsmaker/utils/data_utils.py index aefa25a0..0b914a76 100644 --- a/pidsmaker/utils/data_utils.py +++ b/pidsmaker/utils/data_utils.py @@ -704,7 +704,7 @@ def 
inter_batching(dataset, method): ): batch = data_list[i : i + bs] data = collate(CollatableTemporalData, data_list=batch)[0] - + use_tgn = "tgn" in cfg.detection.gnn_training.encoder.used_methods if cfg._debug and use_tgn: debug_test_batching(batch, data, cfg) From 66680ed4aa7a766ab4070cbc1dca4ec50a91d976 Mon Sep 17 00:00:00 2001 From: Lorenzo Guerra Date: Wed, 27 Aug 2025 17:20:23 +0200 Subject: [PATCH 22/33] restore previous formatting for config.py --- pidsmaker/config/config.py | 160 ++++++++----------------------------- 1 file changed, 33 insertions(+), 127 deletions(-) diff --git a/pidsmaker/config/config.py b/pidsmaker/config/config.py index aa2f8d1d..8ad8a567 100644 --- a/pidsmaker/config/config.py +++ b/pidsmaker/config/config.py @@ -81,11 +81,7 @@ "E5-CADETS/node_Nginx_Drakon_APT_17.csv", ], "attack_to_time_window": [ - [ - "E5-CADETS/node_Nginx_Drakon_APT.csv", - "2019-05-16 09:31:00", - "2019-05-16 10:12:00", - ], + ["E5-CADETS/node_Nginx_Drakon_APT.csv", "2019-05-16 09:31:00", "2019-05-16 10:12:00"], [ "E5-CADETS/node_Nginx_Drakon_APT_17.csv", "2019-05-17 10:15:00", @@ -121,22 +117,10 @@ "E3-CADETS/node_Nginx_Backdoor_13.csv", ], "attack_to_time_window": [ - [ - "E3-CADETS/node_Nginx_Backdoor_06.csv", - "2018-04-06 11:20:00", - "2018-04-06 12:09:00", - ], + ["E3-CADETS/node_Nginx_Backdoor_06.csv", "2018-04-06 11:20:00", "2018-04-06 12:09:00"], # ["E3-CADETS/node_Nginx_Backdoor_11.csv" , '2018-04-11 15:07:00', '2018-04-11 15:16:00'], - [ - "E3-CADETS/node_Nginx_Backdoor_12.csv", - "2018-04-12 13:59:00", - "2018-04-12 14:39:00", - ], - [ - "E3-CADETS/node_Nginx_Backdoor_13.csv", - "2018-04-13 09:03:00", - "2018-04-13 09:16:00", - ], + ["E3-CADETS/node_Nginx_Backdoor_12.csv", "2018-04-12 13:59:00", "2018-04-12 14:39:00"], + ["E3-CADETS/node_Nginx_Backdoor_13.csv", "2018-04-13 09:03:00", "2018-04-13 09:16:00"], ], }, "CLEARSCOPE_E5": { @@ -147,13 +131,7 @@ "num_edge_types": 10, "year_month": "2019-05", "start_end_day_range": (8, 18), - "train_files": [ - "graph_8", - "graph_9", - "graph_10", - "graph_11", - "graph_12", - ], + "train_files": ["graph_8", "graph_9", "graph_10", "graph_11", "graph_12"], "val_files": ["graph_13"], "test_files": ["graph_14", "graph_15", "graph_17"], "unused_files": ["graph_16"], @@ -231,11 +209,7 @@ "h201/node_h201_0923.csv", ], "attack_to_time_window": [ - [ - "h201/node_h201_0923.csv", - "2019-09-23 11:23:00", - "2019-09-23 13:25:00", - ], + ["h201/node_h201_0923.csv", "2019-09-23 11:23:00", "2019-09-23 13:25:00"], ], }, "optc_h501": { @@ -254,11 +228,7 @@ "h501/node_h501_0924.csv", ], "attack_to_time_window": [ - [ - "h501/node_h501_0924.csv", - "2019-09-24 10:28:00", - "2019-09-24 15:29:00", - ], + ["h501/node_h501_0924.csv", "2019-09-24 10:28:00", "2019-09-24 15:29:00"], ], }, "optc_h051": { @@ -277,11 +247,7 @@ "h051/node_h051_0925.csv", ], "attack_to_time_window": [ - [ - "h051/node_h051_0925.csv", - "2019-09-25 10:29:00", - "2019-09-25 14:25:00", - ], + ["h051/node_h051_0925.csv", "2019-09-25 10:29:00", "2019-09-25 14:25:00"], ], }, } @@ -435,8 +401,7 @@ def __init__(self, type, vals: list = None, desc: str = None): output size matching the downstream objective (e.g. edge type prediction involves predicting 10 edge types, so the output of the decoder should be 10).", ), "src_dst_projection_coef": Arg( - int, - desc="Multiplier of input neurons to project src and dst nodes.", + int, desc="Multiplier of input neurons to project src and dst nodes." 
), }, "node_mlp": { @@ -472,9 +437,7 @@ def __init__(self, type, vals: list = None, desc: str = None): # Prediction-based "predict_edge_type": { "decoder": Arg( - str, - vals=OR(list(DECODERS_CFG.keys())), - desc="Decoder used before computing loss.", + str, vals=OR(list(DECODERS_CFG.keys())), desc="Decoder used before computing loss." ), **DECODERS_CFG, "balanced_loss": Arg(bool), @@ -482,9 +445,7 @@ def __init__(self, type, vals: list = None, desc: str = None): }, "predict_node_type": { "decoder": Arg( - str, - vals=OR(list(DECODERS_CFG.keys())), - desc="Decoder used before computing loss.", + str, vals=OR(list(DECODERS_CFG.keys())), desc="Decoder used before computing loss." ), **DECODERS_CFG, "balanced_loss": Arg(bool), @@ -492,26 +453,20 @@ def __init__(self, type, vals: list = None, desc: str = None): "predict_masked_struct": { "loss": Arg(str, vals=OR(PRED_LOSSES)), "decoder": Arg( - str, - vals=OR(list(DECODERS_CFG.keys())), - desc="Decoder used before computing loss.", + str, vals=OR(list(DECODERS_CFG.keys())), desc="Decoder used before computing loss." ), **DECODERS_CFG, "balanced_loss": Arg(bool), }, "detect_edge_few_shot": { "decoder": Arg( - str, - vals=OR(list(DECODERS_CFG.keys())), - desc="Decoder used before computing loss.", + str, vals=OR(list(DECODERS_CFG.keys())), desc="Decoder used before computing loss." ), **DECODERS_CFG, }, "predict_edge_contrastive": { "decoder": Arg( - str, - vals=OR(list(DECODERS_CFG.keys())), - desc="Decoder used before computing loss.", + str, vals=OR(list(DECODERS_CFG.keys())), desc="Decoder used before computing loss." ), **DECODERS_CFG, "inner_product": { @@ -522,27 +477,21 @@ def __init__(self, type, vals: list = None, desc: str = None): "reconstruct_node_features": { "loss": Arg(str, vals=OR(RECON_LOSSES)), "decoder": Arg( - str, - vals=OR(list(DECODERS_CFG.keys())), - desc="Decoder used before computing loss.", + str, vals=OR(list(DECODERS_CFG.keys())), desc="Decoder used before computing loss." ), **DECODERS_CFG, }, "reconstruct_node_embeddings": { "loss": Arg(str, vals=OR(RECON_LOSSES)), "decoder": Arg( - str, - vals=OR(list(DECODERS_CFG.keys())), - desc="Decoder used before computing loss.", + str, vals=OR(list(DECODERS_CFG.keys())), desc="Decoder used before computing loss." ), **DECODERS_CFG, }, "reconstruct_edge_embeddings": { "loss": Arg(str, vals=OR(RECON_LOSSES)), "decoder": Arg( - str, - vals=OR(list(DECODERS_CFG.keys())), - desc="Decoder used before computing loss.", + str, vals=OR(list(DECODERS_CFG.keys())), desc="Decoder used before computing loss." ), **DECODERS_CFG, }, @@ -550,9 +499,7 @@ def __init__(self, type, vals: list = None, desc: str = None): "loss": Arg(str, vals=OR(RECON_LOSSES)), "mask_rate": Arg(float), "decoder": Arg( - str, - vals=OR(list(DECODERS_CFG.keys())), - desc="Decoder used before computing loss.", + str, vals=OR(list(DECODERS_CFG.keys())), desc="Decoder used before computing loss." ), **DECODERS_CFG, }, @@ -567,23 +514,14 @@ def __init__(self, type, vals: list = None, desc: str = None): }, } -THRESHOLD_METHODS = [ - "max_val_loss", - "mean_val_loss", - "threatrace", - "magic", - "flash", - "nodlink", -] +THRESHOLD_METHODS = ["max_val_loss", "mean_val_loss", "threatrace", "magic", "flash", "nodlink"] # --- Tasks, subtasks, and argument configurations --- TASK_ARGS = { "preprocessing": { "build_graphs": { "used_method": Arg( - str, - vals=OR(["default", "magic"]), - desc="The method to build time window graphs.", + str, vals=OR(["default", "magic"]), desc="The method to build time window graphs." 
), "use_all_files": Arg(bool), "mimicry_edge_num": Arg(int), @@ -593,8 +531,7 @@ def __init__(self, type, vals: list = None, desc: str = None): ), "use_hashed_label": Arg(bool, desc="Whether to hash the textual features."), "fuse_edge": Arg( - bool, - desc="Whether to fuse duplicate sequential edges into a single edge.", + bool, desc="Whether to fuse duplicate sequential edges into a single edge." ), "node_label_features": { "subject": Arg( @@ -609,9 +546,7 @@ def __init__(self, type, vals: list = None, desc: str = None): ), "netflow": Arg( str, - vals=AND( - ["type", "remote_ip", "remote_port"], - ), + vals=AND(["type", "remote_ip", "remote_port"]), desc="Which features use for netflow nodes. Features will be concatenated.", ), }, @@ -643,8 +578,7 @@ def __init__(self, type, vals: list = None, desc: str = None): desc="Size of the text embedding. Arg not used by some featurization methods that do not build embeddings.", ), "epochs": Arg( - int, - desc="Epochs to train the embedding method. Arg not used by some methods.", + int, desc="Epochs to train the embedding method. Arg not used by some methods." ), "training_split": Arg( str, @@ -682,22 +616,13 @@ def __init__(self, type, vals: list = None, desc: str = None): ), "edge_features": Arg( str, - vals=AND( - [ - "edge_type", - "edge_type_triplet", - "msg", - "time_encoding", - "none", - ] - ), + vals=AND(["edge_type", "edge_type_triplet", "msg", "time_encoding", "none"]), desc="Edge features to used during GNN training. `edge_type` refers to the system call type, `edge_type_triplet` \ considers a same edge type as a new type if source or destination node types are different, `msg` is the message vector \ used in the TGN, `time_encoding` encodes temporal order of events with their timestamps in the TGN, `none` uses no features.", ), "multi_dataset_training": Arg( - bool, - desc="Whether the GNN should be trained on all datasets in `multi_dataset`.", + bool, desc="Whether the GNN should be trained on all datasets in `multi_dataset`." ), "fix_buggy_graph_reindexer": Arg( bool, @@ -740,8 +665,7 @@ def __init__(self, type, vals: list = None, desc: str = None): }, "tgn_last_neighbor": { "tgn_neighbor_size": Arg( - int, - desc="Number of last neighbors to store for each node.", + int, desc="Number of last neighbors to store for each node." ), "tgn_neighbor_n_hop": Arg( int, @@ -782,26 +706,19 @@ def __init__(self, type, vals: list = None, desc: str = None): }, "gnn_training": { "deterministic": Arg( - bool, - desc="Whether to force PyTorch to use deterministic algorithms.", + bool, desc="Whether to force PyTorch to use deterministic algorithms." ), "num_epochs": Arg(int), "patience": Arg(int), "lr": Arg(float), "weight_decay": Arg(float), - "node_hid_dim": Arg( - int, - desc="Number of neurons in the middle layers of the encoder.", - ), + "node_hid_dim": Arg(int, desc="Number of neurons in the middle layers of the encoder."), "node_out_dim": Arg(int, desc="Number of neurons in the last layer of the encoder."), "grad_accumulation": Arg( - int, - desc="Number of epochs to gather gradients before backprop.", + int, desc="Number of epochs to gather gradients before backprop." ), "inference_device": Arg( - str, - vals=OR(["cpu", "cuda"]), - desc="Device used during testing.", + str, vals=OR(["cpu", "cuda"]), desc="Device used during testing." 
), "used_method": Arg(str, vals=OR(["default"]), desc="Which training pipeline use."), "encoder": { @@ -859,12 +776,10 @@ def __init__(self, type, vals: list = None, desc: str = None): desc="Whether to consider the loss of destination nodes when computing the node-level scores (maximum loss of a node).", ), "use_kmeans": Arg( - bool, - desc="Whether to cluster nodes after thresholding as done in Orthrus", + bool, desc="Whether to cluster nodes after thresholding as done in Orthrus" ), "kmeans_top_K": Arg( - int, - desc="Number of top-score nodes selected before clustering.", + int, desc="Number of top-score nodes selected before clustering." ), }, "tw_evaluation": { @@ -920,16 +835,7 @@ def __init__(self, type, vals: list = None, desc: str = None): ), "depimpact": { "used_method": Arg( - str, - vals=OR( - [ - "component", - "shortest_path", - "1-hop", - "2-hop", - "3-hop", - ] - ), + str, vals=OR(["component", "shortest_path", "1-hop", "2-hop", "3-hop"]) ), "score_method": Arg(str, vals=OR(["degree", "recon_loss", "degree_recon"])), "workers": Arg(int), From 0a48403808e5713993d9a418ceaf42e30f637bf3 Mon Sep 17 00:00:00 2001 From: tristan Date: Wed, 22 Oct 2025 20:37:10 -0700 Subject: [PATCH 23/33] Revert "Removal of separate calls to set_seed for each task. This reverts commit 6c211fbd358501a2deedb877a6291bd6b5e27284. --- config/flash.yml | 2 ++ config/kairos.yml | 1 + config/magic.yml | 1 + config/nodlink.yml | 2 ++ config/orthrus.yml | 2 ++ config/rcaid.yml | 2 ++ config/threatrace.yml | 1 + config/tuned_components/tuned_alacarte.yml | 1 + config/tuned_components/tuned_doc2vec.yml | 1 + config/tuned_components/tuned_fasttext.yml | 1 + config/tuned_components/tuned_flash.yml | 1 + config/tuned_components/tuned_word2vec.yml | 1 + docs/scripts/args/args_detection.md | 1 + docs/scripts/args/args_featurization.md | 1 + pidsmaker/config/config.py | 2 ++ pidsmaker/config/pipeline.py | 1 - pidsmaker/detection/graph_preprocessing.py | 3 +- .../training_methods/inference_loop.py | 3 ++ .../training_methods/training_loop.py | 4 ++- pidsmaker/featurization/feat_training.py | 4 +++ .../feat_training_alacarte.py | 36 +++++++++++++------ .../feat_training_doc2vec.py | 3 +- .../feat_training_fasttext.py | 4 ++- .../feat_training_trw.py | 4 ++- .../feat_training_word2vec.py | 4 ++- pidsmaker/preprocessing/transformation.py | 2 ++ pidsmaker/utils/utils.py | 14 +++++--- scripts/run.sh | 2 +- 28 files changed, 80 insertions(+), 24 deletions(-) diff --git a/config/flash.yml b/config/flash.yml index 303fa479..42c8e340 100644 --- a/config/flash.yml +++ b/config/flash.yml @@ -20,6 +20,7 @@ featurization: epochs: 10 # 300 training_split: train used_method: flash + use_seed: True flash: min_count: 1 workers: 15 @@ -40,6 +41,7 @@ detection: used_method: none gnn_training: used_method: default + use_seed: True deterministic: False num_epochs: 12 patience: 3 diff --git a/config/kairos.yml b/config/kairos.yml index e1131495..e21386ea 100644 --- a/config/kairos.yml +++ b/config/kairos.yml @@ -45,6 +45,7 @@ detection: used_method: none gnn_training: used_method: default + use_seed: True deterministic: False num_epochs: 12 patience: 3 diff --git a/config/magic.yml b/config/magic.yml index 6b3fc6c2..d857f350 100644 --- a/config/magic.yml +++ b/config/magic.yml @@ -34,6 +34,7 @@ detection: used_method: none gnn_training: used_method: default + use_seed: True deterministic: False num_epochs: 12 patience: 3 diff --git a/config/nodlink.yml b/config/nodlink.yml index 52cacd9d..046ee648 100644 --- a/config/nodlink.yml +++ 
b/config/nodlink.yml @@ -18,6 +18,7 @@ featurization: feat_training: epochs: 100 emb_dim: 256 + use_seed: True training_split: train used_method: fasttext fasttext: @@ -44,6 +45,7 @@ detection: used_method: none gnn_training: used_method: default + use_seed: True deterministic: False num_epochs: 12 patience: 3 diff --git a/config/orthrus.yml b/config/orthrus.yml index 3b3d045a..22506f89 100644 --- a/config/orthrus.yml +++ b/config/orthrus.yml @@ -21,6 +21,7 @@ featurization: emb_dim: 128 epochs: 50 training_split: all + use_seed: True used_method: word2vec word2vec: alpha: 0.025 @@ -57,6 +58,7 @@ detection: used_method: none gnn_training: used_method: default + use_seed: True deterministic: False num_epochs: 12 patience: 3 diff --git a/config/rcaid.yml b/config/rcaid.yml index dff311d1..aff722c0 100644 --- a/config/rcaid.yml +++ b/config/rcaid.yml @@ -20,6 +20,7 @@ featurization: feat_training: epochs: 5 emb_dim: 128 + use_seed: True training_split: all used_method: doc2vec doc2vec: @@ -42,6 +43,7 @@ detection: used_method: none gnn_training: used_method: default + use_seed: True deterministic: False num_epochs: 12 patience: 3 diff --git a/config/threatrace.yml b/config/threatrace.yml index 809267ab..6d064fe0 100644 --- a/config/threatrace.yml +++ b/config/threatrace.yml @@ -34,6 +34,7 @@ detection: used_method: none gnn_training: used_method: default + use_seed: True deterministic: False num_epochs: 12 patience: 3 diff --git a/config/tuned_components/tuned_alacarte.yml b/config/tuned_components/tuned_alacarte.yml index 2048c55a..958e1002 100644 --- a/config/tuned_components/tuned_alacarte.yml +++ b/config/tuned_components/tuned_alacarte.yml @@ -2,6 +2,7 @@ featurization: feat_training: epochs: 10 emb_dim: 128 + use_seed: True training_split: train used_method: alacarte alacarte: diff --git a/config/tuned_components/tuned_doc2vec.yml b/config/tuned_components/tuned_doc2vec.yml index 93d4d8dd..4824ddec 100644 --- a/config/tuned_components/tuned_doc2vec.yml +++ b/config/tuned_components/tuned_doc2vec.yml @@ -2,6 +2,7 @@ featurization: feat_training: epochs: 10 emb_dim: 128 + use_seed: True training_split: train used_method: doc2vec doc2vec: diff --git a/config/tuned_components/tuned_fasttext.yml b/config/tuned_components/tuned_fasttext.yml index 0e2a1b32..feb351fe 100644 --- a/config/tuned_components/tuned_fasttext.yml +++ b/config/tuned_components/tuned_fasttext.yml @@ -2,6 +2,7 @@ featurization: feat_training: epochs: 10 emb_dim: 256 + use_seed: True training_split: train used_method: fasttext fasttext: diff --git a/config/tuned_components/tuned_flash.yml b/config/tuned_components/tuned_flash.yml index 7c058644..01d1b57e 100644 --- a/config/tuned_components/tuned_flash.yml +++ b/config/tuned_components/tuned_flash.yml @@ -4,6 +4,7 @@ featurization: epochs: 10 training_split: train used_method: flash + use_seed: True flash: min_count: 1 workers: 10 diff --git a/config/tuned_components/tuned_word2vec.yml b/config/tuned_components/tuned_word2vec.yml index b8bb8696..5f38ae1f 100644 --- a/config/tuned_components/tuned_word2vec.yml +++ b/config/tuned_components/tuned_word2vec.yml @@ -3,6 +3,7 @@ featurization: epochs: 10 emb_dim: 128 training_split: train + use_seed: True used_method: word2vec word2vec: alpha: 0.025 diff --git a/docs/scripts/args/args_detection.md b/docs/scripts/args/args_detection.md index 676ba1cf..2f888bd0 100644 --- a/docs/scripts/args/args_detection.md +++ b/docs/scripts/args/args_detection.md @@ -45,6 +45,7 @@
      • gnn_training
+        • use_seed: bool
        • deterministic: bool (19)
        • num_epochs: int
        • patience: int
diff --git a/docs/scripts/args/args_featurization.md b/docs/scripts/args/args_featurization.md index 5bd8a81c..c404f12a 100644 --- a/docs/scripts/args/args_featurization.md +++ b/docs/scripts/args/args_featurization.md @@ -5,6 +5,7 @@
          • emb_dim: int (1)
          • epochs: int (2)
+          • use_seed: bool
          • training_split: str (3)
          • multi_dataset_training: bool (4)
          • used_method: str (5)
          • diff --git a/pidsmaker/config/config.py b/pidsmaker/config/config.py index 8ad8a567..b80d3f85 100644 --- a/pidsmaker/config/config.py +++ b/pidsmaker/config/config.py @@ -580,6 +580,7 @@ def __init__(self, type, vals: list = None, desc: str = None): "epochs": Arg( int, desc="Epochs to train the embedding method. Arg not used by some methods." ), + "use_seed": Arg(bool), "training_split": Arg( str, vals=OR(["train", "all"]), @@ -705,6 +706,7 @@ def __init__(self, type, vals: list = None, desc: str = None): }, }, "gnn_training": { + "use_seed": Arg(bool), "deterministic": Arg( bool, desc="Whether to force PyTorch to use deterministic algorithms." ), diff --git a/pidsmaker/config/pipeline.py b/pidsmaker/config/pipeline.py index a4ce1e4f..86e74bc6 100644 --- a/pidsmaker/config/pipeline.py +++ b/pidsmaker/config/pipeline.py @@ -146,7 +146,6 @@ def get_runtime_required_args(return_unknown_args=False, args=None): "--database_port", default="5432", help="The port number for Postgres (default: 5432)" ) parser.add_argument("--sweep_id", default="", help="ID of a wandb sweep for multi-agent runs") - parser.add_argument("--seed", type=int, default=0, help="Random seed for reproducibility") parser.add_argument( "--artifact_dir", default="./artifacts/", help="Destination folder for generated files" ) diff --git a/pidsmaker/detection/graph_preprocessing.py b/pidsmaker/detection/graph_preprocessing.py index cc31954f..26ba214a 100644 --- a/pidsmaker/detection/graph_preprocessing.py +++ b/pidsmaker/detection/graph_preprocessing.py @@ -3,7 +3,7 @@ import torch from pidsmaker.utils.data_utils import load_all_datasets -from pidsmaker.utils.utils import get_device, log, log_start +from pidsmaker.utils.utils import get_device, log, log_start, set_seed def get_preprocessed_graphs(cfg): @@ -22,6 +22,7 @@ def get_preprocessed_graphs(cfg): def main(cfg): + set_seed(cfg) log_start(__file__) if cfg.detection.graph_preprocessing.save_on_disk: diff --git a/pidsmaker/detection/training_methods/inference_loop.py b/pidsmaker/detection/training_methods/inference_loop.py index 1a0ece38..a84328ee 100644 --- a/pidsmaker/detection/training_methods/inference_loop.py +++ b/pidsmaker/detection/training_methods/inference_loop.py @@ -15,6 +15,7 @@ log, log_tqdm, ns_time_to_datetime_US, + set_seed, ) @@ -258,6 +259,8 @@ def test_node_level( def main(cfg, model, val_data, test_data, epoch, split, logging=True): + set_seed(cfg) + if split == "all": splits = [(val_data, "val"), (test_data, "test")] elif split == "val": diff --git a/pidsmaker/detection/training_methods/training_loop.py b/pidsmaker/detection/training_methods/training_loop.py index 8464e1b8..06129ea5 100644 --- a/pidsmaker/detection/training_methods/training_loop.py +++ b/pidsmaker/detection/training_methods/training_loop.py @@ -12,12 +12,14 @@ optimizer_factory, optimizer_few_shot_factory, ) -from pidsmaker.utils.utils import get_device, log, log_start, log_tqdm +from pidsmaker.utils.utils import get_device, log, log_start, log_tqdm, set_seed from . 
import inference_loop def main(cfg): + set_seed(cfg) + log_start(__file__) device = get_device(cfg) use_cuda = device == torch.device("cuda") diff --git a/pidsmaker/featurization/feat_training.py b/pidsmaker/featurization/feat_training.py index deb145ab..c10374b8 100644 --- a/pidsmaker/featurization/feat_training.py +++ b/pidsmaker/featurization/feat_training.py @@ -1,3 +1,5 @@ +from pidsmaker.utils.utils import set_seed + from .feat_training_methods import ( build_trw, feat_training_alacarte, @@ -11,6 +13,8 @@ def main(cfg): + set_seed(cfg) + method = cfg.featurization.feat_training.used_method.strip() if method == "alacarte": build_random_walks.main(cfg) diff --git a/pidsmaker/featurization/feat_training_methods/feat_training_alacarte.py b/pidsmaker/featurization/feat_training_methods/feat_training_alacarte.py index b427b729..ad8b4d69 100644 --- a/pidsmaker/featurization/feat_training_methods/feat_training_alacarte.py +++ b/pidsmaker/featurization/feat_training_methods/feat_training_alacarte.py @@ -444,6 +444,8 @@ def feat_training_for_one_split( num_workers = cfg.featurization.feat_training.alacarte.num_workers compute_loss = cfg.featurization.feat_training.alacarte.compute_loss add_paths = cfg.featurization.feat_training.alacarte.add_paths + use_seed = cfg.featurization.feat_training.use_seed + SEED = 0 log_dir = out_dir @@ -483,17 +485,29 @@ def feat_training_for_one_split( # Training using Word2Vec if needed # ===-----------------------------------------------------------------------=== if model_input is None: - model = Word2Vec( - paths, - vector_size=emb_dim, - window=window_size, - min_count=min_count, - sg=use_skip_gram, - workers=num_workers, - epochs=epochs, - compute_loss=compute_loss, - seed=cfg.seed, - ) + if use_seed: + model = Word2Vec( + paths, + vector_size=emb_dim, + window=window_size, + min_count=min_count, + sg=use_skip_gram, + workers=num_workers, + epochs=epochs, + compute_loss=compute_loss, + seed=SEED, + ) + else: + model = Word2Vec( + paths, + vector_size=emb_dim, + window=window_size, + min_count=min_count, + sg=use_skip_gram, + workers=num_workers, + epochs=epochs, + compute_loss=compute_loss, + ) else: log("Loading existing model from: {}".format(model_input)) model = Word2Vec.load(model_input) diff --git a/pidsmaker/featurization/feat_training_methods/feat_training_doc2vec.py b/pidsmaker/featurization/feat_training_methods/feat_training_doc2vec.py index 3fb67883..2d0b35c3 100644 --- a/pidsmaker/featurization/feat_training_methods/feat_training_doc2vec.py +++ b/pidsmaker/featurization/feat_training_methods/feat_training_doc2vec.py @@ -18,8 +18,9 @@ def doc2vec( alpha: float, dm: int = 1, ): + SEED = 0 model = Doc2Vec( - vector_size=emb_dim, alpha=alpha, min_count=1, dm=dm, compute_loss=True, seed=cfg.seed + vector_size=emb_dim, alpha=alpha, min_count=1, dm=dm, compute_loss=True, seed=SEED ) model.build_vocab(tagged_data) diff --git a/pidsmaker/featurization/feat_training_methods/feat_training_fasttext.py b/pidsmaker/featurization/feat_training_methods/feat_training_fasttext.py index b368bd98..07575efa 100644 --- a/pidsmaker/featurization/feat_training_methods/feat_training_fasttext.py +++ b/pidsmaker/featurization/feat_training_methods/feat_training_fasttext.py @@ -23,6 +23,8 @@ def train_fasttext(corpus, cfg): min_count = cfg.featurization.feat_training.fasttext.min_count num_workers = cfg.featurization.feat_training.fasttext.num_workers negative = cfg.featurization.feat_training.fasttext.negative + use_seed = cfg.featurization.feat_training.use_seed + SEED 
= 0 use_pretrained_fb_model = cfg.featurization.feat_training.fasttext.use_pretrained_fb_model @@ -42,7 +44,7 @@ def train_fasttext(corpus, cfg): alpha=alpha, window=window_size, negative=negative, - seed=cfg.seed, + seed=SEED, ) model.build_vocab(corpus, update=use_pretrained_fb_model) diff --git a/pidsmaker/featurization/feat_training_methods/feat_training_trw.py b/pidsmaker/featurization/feat_training_methods/feat_training_trw.py index e5a45f6f..d8ba7dc0 100644 --- a/pidsmaker/featurization/feat_training_methods/feat_training_trw.py +++ b/pidsmaker/featurization/feat_training_methods/feat_training_trw.py @@ -41,6 +41,8 @@ def train_word2vec(corpus, model_save_path, cfg): epochs = cfg.featurization.feat_training.epochs compute_loss = cfg.featurization.feat_training.temporal_rw.compute_loss negative = cfg.featurization.feat_training.temporal_rw.negative + use_seed = cfg.featurization.feat_training.use_seed + SEED = 0 model = Word2Vec( corpus, @@ -52,7 +54,7 @@ def train_word2vec(corpus, model_save_path, cfg): epochs=1, compute_loss=compute_loss, negative=negative, - seed=cfg.seed, + seed=SEED, ) epoch_loss = model.get_latest_training_loss() diff --git a/pidsmaker/featurization/feat_training_methods/feat_training_word2vec.py b/pidsmaker/featurization/feat_training_methods/feat_training_word2vec.py index 08c7a751..42216b79 100644 --- a/pidsmaker/featurization/feat_training_methods/feat_training_word2vec.py +++ b/pidsmaker/featurization/feat_training_methods/feat_training_word2vec.py @@ -16,6 +16,8 @@ def train_word2vec(corpus, cfg, model_save_path): epochs = cfg.featurization.feat_training.epochs compute_loss = cfg.featurization.feat_training.word2vec.compute_loss negative = cfg.featurization.feat_training.word2vec.negative + use_seed = cfg.featurization.feat_training.use_seed + SEED = 0 model = Word2Vec( corpus, @@ -28,7 +30,7 @@ def train_word2vec(corpus, cfg, model_save_path): epochs=1, compute_loss=compute_loss, negative=negative, - seed=cfg.seed, + seed=SEED, ) epoch_loss = model.get_latest_training_loss() diff --git a/pidsmaker/preprocessing/transformation.py b/pidsmaker/preprocessing/transformation.py index ccf2bb7d..a6d7d43d 100644 --- a/pidsmaker/preprocessing/transformation.py +++ b/pidsmaker/preprocessing/transformation.py @@ -16,6 +16,7 @@ load_graphs_for_days, log_start, log_tqdm, + set_seed, ) @@ -109,6 +110,7 @@ def main_from_config(cfg): def main(cfg): + set_seed(cfg) log_start(__file__) multi_datasets = get_multi_datasets(cfg) diff --git a/pidsmaker/utils/utils.py b/pidsmaker/utils/utils.py index ace84aad..c5f1ec3e 100644 --- a/pidsmaker/utils/utils.py +++ b/pidsmaker/utils/utils.py @@ -653,11 +653,15 @@ def log_helper(label, dataset): def set_seed(cfg): - seed = cfg.seed - random.seed(seed) - np.random.seed(seed) - torch.manual_seed(seed) - torch.backends.cudnn.benchmark = False + if cfg.detection.gnn_training.use_seed: + seed = 0 + random.seed(seed) + np.random.seed(seed) + + torch.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + torch.backends.cudnn.deterministic = True + torch.backends.cudnn.benchmark = False if cfg.detection.gnn_training.deterministic: torch.use_deterministic_algorithms(True, warn_only=True) diff --git a/scripts/run.sh b/scripts/run.sh index 4e3be119..a9a8c5af 100755 --- a/scripts/run.sh +++ b/scripts/run.sh @@ -15,4 +15,4 @@ for arg in "$@"; do done # Execute the Python script with the passed arguments -PYTHONHASHSEED=0 nohup python ../pidsmaker/main.py $args --wandb & \ No newline at end of file +PYTHONHASHSEED=0 nohup python 
../pidsmaker/main.py $args --wandb & From 9f7d8a6c5a91df5e00cef99418f6e4b98c41d86b Mon Sep 17 00:00:00 2001 From: tristan Date: Wed, 22 Oct 2025 20:50:17 -0700 Subject: [PATCH 24/33] revert ValueError or functional tests fail --- pidsmaker/config/pipeline.py | 3 +-- .../detection/evaluation_methods/evaluation_utils.py | 8 ++++++-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/pidsmaker/config/pipeline.py b/pidsmaker/config/pipeline.py index 86e74bc6..1fd42d71 100644 --- a/pidsmaker/config/pipeline.py +++ b/pidsmaker/config/pipeline.py @@ -456,8 +456,7 @@ def get_yml_cfg(args): # Inits with default configurations cfg = get_default_cfg(args) - # Set seed for reproducibility - cfg.seed = args.seed + # Checks that all configurations are valid and merge yml file to cfg yml_file = get_yml_file(args.model) merge_cfg_and_check_syntax(cfg, yml_file) diff --git a/pidsmaker/detection/evaluation_methods/evaluation_utils.py b/pidsmaker/detection/evaluation_methods/evaluation_utils.py index 947a0f86..c9683d35 100644 --- a/pidsmaker/detection/evaluation_methods/evaluation_utils.py +++ b/pidsmaker/detection/evaluation_methods/evaluation_utils.py @@ -36,8 +36,12 @@ def classifier_evaluation(y_test, y_test_pred, scores): - if not sum(y_test) > 0: - raise ValueError("Cannot evaluate: no positive labels in test set") + labels_exist = sum(y_test) > 0 + if labels_exist: + tn, fp, fn, tp = confusion_matrix(y_test, y_test_pred).ravel() + else: + log("WARNING: Computing confusion matrix failed.") + tn, fp, fn, tp = 1, 1, 1, 1 # only to not break tests tn, fp, fn, tp = confusion_matrix(y_test, y_test_pred).ravel() eps = 1e-12 From 7c825fe0731411ba7e6c1587c5cebe4821a5e83c Mon Sep 17 00:00:00 2001 From: tristan Date: Wed, 22 Oct 2025 20:52:33 -0700 Subject: [PATCH 25/33] minor revert to still support logging args when using sweeps --- pidsmaker/main.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pidsmaker/main.py b/pidsmaker/main.py index 4e961710..1d5e9273 100644 --- a/pidsmaker/main.py +++ b/pidsmaker/main.py @@ -276,19 +276,19 @@ def run_pipeline_from_sweep(cfg): ) tags = args.tags.split(",") if args.tags != "" else [args.model] - cfg = get_yml_cfg(args) - wandb.init( mode=("online" if (args.wandb and args.tuning_mode == "none") else "disabled"), project=args.project, name=exp_name, tags=tags, - config=clean_cfg_for_log(cfg), ) if len(unknown_args) > 0: raise argparse.ArgumentTypeError(f"Unknown args {unknown_args}") + cfg = get_yml_cfg(args) + wandb.config.update(clean_cfg_for_log(cfg)) + main(cfg, project=args.project, exp=exp_name, sweep_id=args.sweep_id) wandb.finish() From b42bc608763d5c30e6fc3b616d8c3e97114d9161 Mon Sep 17 00:00:00 2001 From: tristan Date: Thu, 23 Oct 2025 04:16:29 +0000 Subject: [PATCH 26/33] fix tests --- pidsmaker/main.py | 11 ----------- pidsmaker/utils/utils.py | 3 +-- tests/test_framework.py | 3 +-- 3 files changed, 2 insertions(+), 15 deletions(-) diff --git a/pidsmaker/main.py b/pidsmaker/main.py index 1d5e9273..1ebcf4e5 100644 --- a/pidsmaker/main.py +++ b/pidsmaker/main.py @@ -5,9 +5,7 @@ import time from collections import defaultdict -import networkx as nx import torch -import torch_geometric import wandb from pidsmaker.config import ( @@ -45,17 +43,8 @@ from pidsmaker.triage import ( tracing, ) -from pidsmaker.utils.data_utils import CollatableTemporalData from pidsmaker.utils.utils import log, remove_underscore_keys, set_seed -torch.serialization.add_safe_globals( - [ - nx.classes.multidigraph.MultiDiGraph, - CollatableTemporalData, - 
torch_geometric.data.storage.GlobalStorage, - ] -) - def get_task_to_module(cfg): return { diff --git a/pidsmaker/utils/utils.py b/pidsmaker/utils/utils.py index c5f1ec3e..529fb546 100644 --- a/pidsmaker/utils/utils.py +++ b/pidsmaker/utils/utils.py @@ -18,8 +18,7 @@ from nltk.tokenize import word_tokenize from tqdm import tqdm -nltk.download("punkt_tab", quiet=True, download_dir="./nltk_data") -nltk.data.path.append("./nltk_data") +nltk.download("punkt", quiet=True) from pidsmaker.config import update_cfg_for_multi_dataset diff --git a/tests/test_framework.py b/tests/test_framework.py index 9a9e8a78..167af7e1 100644 --- a/tests/test_framework.py +++ b/tests/test_framework.py @@ -7,13 +7,12 @@ from pidsmaker import main from pidsmaker.config import ( - DEFAULT_ROOT_ARTIFACT_DIR, ENCODERS_CFG, get_runtime_required_args, get_yml_cfg, ) -TESTS_ARTIFACT_DIR = os.path.join(DEFAULT_ROOT_ARTIFACT_DIR, "tests/") +TESTS_ARTIFACT_DIR = os.path.join("./artifacts/", "tests/") def prepare_cfg( From d806a0a364ac78e66e29cae9330219cd9d232bf0 Mon Sep 17 00:00:00 2001 From: tristan Date: Thu, 23 Oct 2025 04:21:11 +0000 Subject: [PATCH 27/33] revert artifact_dir to /home/artifacts to map to Docker volume => ${ARTIFACTS_DIR:-/artifacts}:/home/artifacts --- pidsmaker/config/pipeline.py | 2 +- tests/test_framework.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pidsmaker/config/pipeline.py b/pidsmaker/config/pipeline.py index 1fd42d71..c17785ab 100644 --- a/pidsmaker/config/pipeline.py +++ b/pidsmaker/config/pipeline.py @@ -147,7 +147,7 @@ def get_runtime_required_args(return_unknown_args=False, args=None): ) parser.add_argument("--sweep_id", default="", help="ID of a wandb sweep for multi-agent runs") parser.add_argument( - "--artifact_dir", default="./artifacts/", help="Destination folder for generated files" + "--artifact_dir", default="/home/artifacts/", help="Destination folder for generated files" ) parser.add_argument( "--test_mode", diff --git a/tests/test_framework.py b/tests/test_framework.py index 167af7e1..e81645a1 100644 --- a/tests/test_framework.py +++ b/tests/test_framework.py @@ -12,7 +12,7 @@ get_yml_cfg, ) -TESTS_ARTIFACT_DIR = os.path.join("./artifacts/", "tests/") +TESTS_ARTIFACT_DIR = os.path.join("/home/artifacts/", "tests/") def prepare_cfg( From 156ac9c172b8edb1ef1d205baddc265045bb6e76 Mon Sep 17 00:00:00 2001 From: tristan Date: Thu, 23 Oct 2025 04:23:31 +0000 Subject: [PATCH 28/33] hotfix in the Docker install (not related directly to this PR but need to be merged anyway) --- Dockerfile | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/Dockerfile b/Dockerfile index 226bf401..82af74ea 100644 --- a/Dockerfile +++ b/Dockerfile @@ -43,8 +43,7 @@ RUN conda install -y psycopg2 tqdm && \ wandb==0.16.6 chardet==5.2.0 nltk==3.8.1 igraph==0.11.5 \ cairocffi==1.7.0 wget==3.2 -RUN conda install -y pytorch==1.13.1 torchvision==0.14.1 torchaudio==0.13.1 \ - pytorch-cuda=11.7 -c pytorch -c nvidia +RUN pip install torch==1.13.1+cu117 torchvision==0.14.1+cu117 torchaudio==0.13.1 --extra-index-url https://download.pytorch.org/whl/cu117 RUN pip install torch_geometric==2.5.3 --no-cache-dir && \ pip install pyg_lib==0.2.0 torch_scatter==2.1.1 torch_sparse==0.6.17 \ From b6ee3f019b54d6bcfd0c8fbe7fca65a84ec511f5 Mon Sep 17 00:00:00 2001 From: tristan Date: Thu, 23 Oct 2025 04:34:11 +0000 Subject: [PATCH 29/33] fix tests finally --- pidsmaker/detection/evaluation_methods/evaluation_utils.py | 1 - 1 file changed, 1 deletion(-) diff --git 
a/pidsmaker/detection/evaluation_methods/evaluation_utils.py b/pidsmaker/detection/evaluation_methods/evaluation_utils.py index c9683d35..a8e7cc86 100644 --- a/pidsmaker/detection/evaluation_methods/evaluation_utils.py +++ b/pidsmaker/detection/evaluation_methods/evaluation_utils.py @@ -42,7 +42,6 @@ def classifier_evaluation(y_test, y_test_pred, scores): else: log("WARNING: Computing confusion matrix failed.") tn, fp, fn, tp = 1, 1, 1, 1 # only to not break tests - tn, fp, fn, tp = confusion_matrix(y_test, y_test_pred).ravel() eps = 1e-12 fpr = fp / (fp + tn + eps) From 3f685ac3674e441da5a6372ec11f237807d5dac4 Mon Sep 17 00:00:00 2001 From: Lorenzo Guerra Date: Thu, 23 Oct 2025 15:46:35 +0200 Subject: [PATCH 30/33] Simplify environment.yaml and make it perfectly match the docker environment --- scripts/environment.yaml | 225 ++++++++------------------------------- 1 file changed, 43 insertions(+), 182 deletions(-) diff --git a/scripts/environment.yaml b/scripts/environment.yaml index 5563e2ce..83acfd12 100644 --- a/scripts/environment.yaml +++ b/scripts/environment.yaml @@ -1,185 +1,46 @@ name: pids channels: - - conda-forge +- conda-forge dependencies: - - _libgcc_mutex=0.1=conda_forge - - _openmp_mutex=4.5=3_kmp_llvm - - bzip2=1.0.8=h4bc722e_7 - - ca-certificates=2025.6.15=hbd8a1cb_0 - - certifi=2025.6.15=pyhd8ed1ab_0 - - colorama=0.4.6=pyhd8ed1ab_1 - - cyrus-sasl=2.1.28=hd9c7081_0 - - icu=75.1=he02047a_0 - - keyutils=1.6.1=h166bdaf_0 - - krb5=1.21.3=h659f571_0 - - lcms2=2.17=h717163a_0 - - ld_impl_linux-64=2.43=h1423503_5 - - lerc=4.0.0=h0aef613_1 - - libdeflate=1.24=h86f0d12_0 - - libedit=3.1.20250104=pl5321h7949ede_0 - - libexpat=2.7.0=h5888daf_0 - - libffi=3.4.6=h2dba641_1 - - libfreetype=2.13.3=ha770c72_1 - - libfreetype6=2.13.3=h48d6fc4_1 - - libgcc=15.1.0=h767d61c_3 - - libgcc-ng=15.1.0=h69a702a_3 - - libgomp=15.1.0=h767d61c_3 - - libjpeg-turbo=3.1.0=hb9d3cd8_0 - - liblzma=5.8.1=hb9d3cd8_2 - - libnsl=2.0.1=hb9d3cd8_1 - - libntlm=1.8=hb9d3cd8_0 - - libpng=1.6.49=h943b412_0 - - libpq=17.5=h27ae623_0 - - libsqlite=3.50.1=h6cd9bfd_7 - - libstdcxx=15.1.0=h8f9b012_3 - - libstdcxx-ng=15.1.0=h4852527_3 - - libtiff=4.7.0=hf01ce69_5 - - libuuid=2.38.1=h0b41bf4_0 - - libwebp-base=1.5.0=h851e524_0 - - libxcb=1.17.0=h8a09558_0 - - libxcrypt=4.4.36=hd590300_1 - - libzlib=1.3.1=hb9d3cd8_2 - - llvm-openmp=20.1.7=h024ca30_0 - - ncurses=6.5=h2d0b736_3 - - openjpeg=2.5.3=h5fbd93e_0 - - openldap=2.6.10=he970967_0 - - openssl=3.5.0=h7b32b05_1 - - pillow=11.2.1=py39h15c0740_0 - - pip=25.1.1=pyh8b19718_0 - - psycopg2=2.9.10=py39h2bc273e_1 - - pthread-stubs=0.4=hb9d3cd8_1002 - - python=3.9.23=hc30ae73_0_cpython - - python_abi=3.9=7_cp39 - - readline=8.2=h8c095d6_2 - - tk=8.6.13=noxft_hd72426e_102 - - tqdm=4.67.1=pyhd8ed1ab_1 - - wheel=0.45.1=pyhd8ed1ab_1 - - xorg-libxau=1.0.12=hb9d3cd8_0 - - xorg-libxdmcp=1.1.5=hb9d3cd8_0 - - zstd=1.5.7=hb8e6e7a_2 - - pip: - - aiohappyeyeballs==2.6.1 - - aiohttp==3.12.13 - - aiosignal==1.3.2 - - appdirs==1.4.4 - - async-timeout==5.0.1 - - attrs==25.3.0 - - babel==2.17.0 - - backrefs==5.9 - - beautifulsoup4==4.13.4 - - cairocffi==1.7.0 - - cfgv==3.4.0 - - chardet==5.2.0 - - charset-normalizer==3.4.2 - - click==8.1.8 - - contourpy==1.3.0 - - coverage==7.9.1 - - cycler==0.12.1 - - distlib==0.3.9 - - docker-pycreds==0.4.0 - - exceptiongroup==1.3.0 - - filelock==3.18.0 - - fonttools==4.58.4 - - frozenlist==1.7.0 - - fsspec==2025.5.1 - - gdown==5.2.0 - - gensim==4.3.1 - - ghp-import==2.1.0 - - gitdb==4.0.12 - - gitpython==3.1.44 - - graphviz==0.20.1 - - h5py==3.14.0 - - 
identify==2.6.12 - - idna==3.10 - - igraph==0.11.5 - - importlib-metadata==8.7.0 - - importlib-resources==6.5.2 - - iniconfig==2.1.0 - - jinja2==3.1.6 - - joblib==1.5.1 - - kiwisolver==1.4.7 - - markdown==3.8.2 - - markupsafe==3.0.2 - - matplotlib==3.8.4 - - mergedeep==1.3.4 - - mkdocs==1.6.1 - - mkdocs-get-deps==0.2.0 - - mkdocs-glightbox==0.4.0 - - mkdocs-material==9.6.12 - - mkdocs-material-extensions==1.3.1 - - mpmath==1.3.0 - - multidict==6.5.1 - - networkx==2.8.7 - - nltk==3.8.1 - - nodeenv==1.9.1 - - numpy==1.26.4 - - nvidia-cublas-cu12==12.6.4.1 - - nvidia-cuda-cupti-cu12==12.6.80 - - nvidia-cuda-nvrtc-cu12==12.6.77 - - nvidia-cuda-runtime-cu12==12.6.77 - - nvidia-cudnn-cu12==9.5.1.17 - - nvidia-cufft-cu12==11.3.0.4 - - nvidia-cufile-cu12==1.11.1.6 - - nvidia-curand-cu12==10.3.7.77 - - nvidia-cusolver-cu12==11.7.1.2 - - nvidia-cusparse-cu12==12.5.4.2 - - nvidia-cusparselt-cu12==0.6.3 - - nvidia-nccl-cu12==2.26.2 - - nvidia-nvjitlink-cu12==12.6.85 - - nvidia-nvtx-cu12==12.6.77 - - packaging==25.0 - - paginate==0.5.7 - - pandas==2.2.2 - - pathspec==0.12.1 - - platformdirs==4.3.8 - - pluggy==1.6.0 - - pre-commit==4.2.0 - - propcache==0.3.2 - - protobuf==4.25.8 - - psutil==7.0.0 - - pyg-lib==0.4.0+pt27cu126 - - pygments==2.19.2 - - pymdown-extensions==10.16 - - pyparsing==3.2.3 - - pytest==8.3.5 - - pytest-cov==6.1.1 - - python-dateutil==2.9.0.post0 - - pytz==2024.1 - - pyyaml==6.0.2 - - pyyaml-env-tag==1.1 - - regex==2024.11.6 - - requests==2.32.4 - - scikit-learn==1.2.0 - - scipy==1.10.1 - - sentry-sdk==2.31.0 - - setproctitle==1.3.6 - - setuptools==61.0.0 - - six==1.17.0 - - smart-open==7.1.0 - - smmap==5.0.2 - - soupsieve==2.7 - - sympy==1.14.0 - - texttable==1.7.0 - - threadpoolctl==3.6.0 - - tomli==2.2.1 - - torch==2.7.1 - - torch-cluster==1.6.3+pt27cu126 - - torch-geometric==2.5.3 - - torch-scatter==2.1.2+pt27cu126 - - torch-sparse==0.6.18+pt27cu126 - - torch-spline-conv==1.2.2+pt27cu126 - - torchaudio==2.7.1 - - torchvision==0.22.1 - - triton==3.3.1 - - typing-extensions==4.14.0 - - tzdata==2025.2 - - urllib3==2.5.0 - - virtualenv==20.31.2 - - wandb==0.16.6 - - watchdog==6.0.0 - - wget==3.2 - - wrapt==1.17.2 - - xxhash==3.2.0 - - yacs==0.1.8 - - yarl==1.20.1 - - zipp==3.23.0 +- pip=25.1.1 +- python=3.9.23 +- python_abi=3.9 +- psycopg2 +- tqdm +- pip: + - --extra-index-url https://download.pytorch.org/whl/cu117 + - -f https://data.pyg.org/whl/torch-1.13.0+cu117.html + - torch==1.13.1+cu117 + - torchvision==0.14.1+cu117 + - torchaudio==0.13.1 + - scikit-learn==1.2.0 + - networkx==2.8.7 + - xxhash==3.2.0 + - graphviz==0.20.1 + - psutil + - scipy==1.10.1 + - matplotlib==3.8.4 + - wandb==0.16.6 + - chardet==5.2.0 + - nltk==3.8.1 + - igraph==0.11.5 + - cairocffi==1.7.0 + - wget==3.2 + - torch_geometric==2.5.3 + - pyg_lib==0.2.0 + - torch_scatter==2.1.1 + - torch_sparse==0.6.17 + - torch_cluster==1.6.1 + - torch_spline_conv==1.2.2 + - gensim==4.3.1 + - pytz==2024.1 + - pandas==2.2.2 + - yacs==0.1.8 + - numpy==1.26.4 + - gdown==5.2.0 + - pytest==8.3.5 + - pytest-cov==6.1.1 + - pre-commit==4.2.0 + - setuptools==61.0 + - mkdocs-material==9.6.12 + - mkdocs-glightbox==0.4.0 From 0ef0f8078b738a79bedb4f544a575f46805dc417 Mon Sep 17 00:00:00 2001 From: Lorenzo Guerra Date: Thu, 23 Oct 2025 16:27:50 +0200 Subject: [PATCH 31/33] Add support for Apptainer (new name for Singularity) as well --- .gitignore | 9 +++-- scripts/Makefile | 18 ++++++--- scripts/postgres-start.sh | 75 ++++++++++++++++++++++---------------- scripts/postgres-status.sh | 24 ++++++++---- scripts/postgres-stop.sh | 24 
++++++++---- 5 files changed, 95 insertions(+), 55 deletions(-) diff --git a/.gitignore b/.gitignore index 375cde6d..0832ad6b 100644 --- a/.gitignore +++ b/.gitignore @@ -177,9 +177,10 @@ postgres_lock/ *.def # Postgres directories -postgres_config/ -postgres_run/ -postgres_log/ +postgres_config +postgres_run +postgres_log +postgres_data # tokenizer data -nltk_data/ +nltk_data diff --git a/scripts/Makefile b/scripts/Makefile index b395519b..387a3948 100644 --- a/scripts/Makefile +++ b/scripts/Makefile @@ -1,4 +1,4 @@ -# Makefile for Singularity PostgreSQL management +# Makefile for Singularity/Apptainer PostgreSQL management .PHONY: up down status load-dumps full-setup logs clean help @@ -25,11 +25,18 @@ reset: clean up app-build: @echo "Building PIDSMaker container..." - @singularity build pidsmaker.sif pidsmaker.def || echo "Build failed - check if you have fakeroot access" + @if command -v apptainer &> /dev/null; then \ + apptainer build pidsmaker.sif pidsmaker.def || echo "Build failed - check if you have fakeroot access"; \ + elif command -v singularity &> /dev/null; then \ + singularity build pidsmaker.sif pidsmaker.def || echo "Build failed - check if you have fakeroot access"; \ + else \ + echo "ERROR: Neither apptainer nor singularity found"; exit 1; \ + fi app-run: up @echo "Running PIDSMaker application..." - @singularity run --nv \ + @CONTAINER_CMD=$$(command -v apptainer &> /dev/null && echo "apptainer" || echo "singularity"); \ + $$CONTAINER_CMD run --nv \ --env DB_HOST=localhost \ --env DOCKER_PORT=5432 \ --env DB_USER=postgres \ @@ -39,9 +46,10 @@ app-run: up load-dumps: up @echo "Loading database dumps from inside container..." - @if [ -f "./load_dumps.sh" ]; then \ + @CONTAINER_CMD=$$(command -v apptainer &> /dev/null && echo "apptainer" || echo "singularity"); \ + if [ -f "./load_dumps.sh" ]; then \ echo "Found load_dumps.sh, executing inside container..."; \ - singularity exec instance://postgres_instance /scripts/load_dumps.sh; \ + $$CONTAINER_CMD exec instance://postgres_instance /scripts/load_dumps.sh; \ else \ echo "Error: ./load_dumps.sh not found"; \ exit 1; \ diff --git a/scripts/postgres-start.sh b/scripts/postgres-start.sh index 7d00cd8e..03d75cbf 100755 --- a/scripts/postgres-start.sh +++ b/scripts/postgres-start.sh @@ -1,9 +1,26 @@ #!/bin/bash -# PostgreSQL startup script for Singularity +# PostgreSQL startup script for Singularity/Apptainer set -e +# Detect which container runtime is available +if command -v apptainer &> /dev/null; then + CONTAINER_CMD="apptainer" + export APPTAINER_TMPDIR="${TMPDIR:-/tmp}/apptainer-${USER}" + export APPTAINER_CACHEDIR="${HOME}/.apptainer/cache" + export APPTAINER_SESSIONDIR="${TMPDIR:-/tmp}/apptainer-sessions-${USER}" + mkdir -p "$APPTAINER_TMPDIR" "$APPTAINER_CACHEDIR" "$APPTAINER_SESSIONDIR" +elif command -v singularity &> /dev/null; then + CONTAINER_CMD="singularity" + export SINGULARITY_TMPDIR="${TMPDIR:-/tmp}/singularity-${USER}" + export SINGULARITY_CACHEDIR="${HOME}/.singularity/cache" + mkdir -p "$SINGULARITY_TMPDIR" "$SINGULARITY_CACHEDIR" +else + echo "ERROR: Neither apptainer nor singularity found in PATH" + exit 1 +fi + # Configuration POSTGRES_IMAGE="postgres.sif" POSTGRES_INSTANCE="postgres_instance" @@ -17,12 +34,12 @@ GREEN='\033[0;32m' YELLOW='\033[1;33m' NC='\033[0m' # No Color -echo -e "${YELLOW}Starting PostgreSQL with Singularity...${NC}" +echo -e "${YELLOW}Starting PostgreSQL with ${CONTAINER_CMD}...${NC}" # Check if postgres.sif exists if [ ! 
-f "$POSTGRES_IMAGE" ]; then echo -e "${YELLOW}PostgreSQL image not found. Pulling from Docker Hub...${NC}" - singularity pull $POSTGRES_IMAGE docker://postgres:17 + $CONTAINER_CMD pull $POSTGRES_IMAGE docker://postgres:17 fi # Create necessary directories @@ -37,48 +54,42 @@ if [ ! -d "$INPUT_DIR" ]; then fi # Check if instance already exists -if singularity instance list | grep -q "$POSTGRES_INSTANCE"; then +if $CONTAINER_CMD instance list | grep -q "$POSTGRES_INSTANCE"; then echo -e "${YELLOW}PostgreSQL instance $POSTGRES_INSTANCE already exists${NC}" # Check if it's responsive - if singularity exec instance://$POSTGRES_INSTANCE pg_isready -h localhost -U postgres > /dev/null 2>&1; then + if $CONTAINER_CMD exec instance://$POSTGRES_INSTANCE pg_isready -h localhost -U postgres > /dev/null 2>&1; then echo -e "${GREEN}PostgreSQL instance is already running and responsive${NC}" exit 0 else echo -e "${YELLOW}Instance exists but not responsive, stopping it...${NC}" - singularity instance stop $POSTGRES_INSTANCE + $CONTAINER_CMD instance stop $POSTGRES_INSTANCE sleep 2 fi fi # Check if any other postgres processes are running -if pgrep -f "singularity.*postgres" > /dev/null; then +if pgrep -f "${CONTAINER_CMD}.*postgres" > /dev/null; then echo -e "${YELLOW}Other PostgreSQL processes detected, cleaning up...${NC}" - pkill -f "singularity.*postgres" || true + pkill -f "${CONTAINER_CMD}.*postgres" || true sleep 2 fi -# Set environment variables -export SINGULARITYENV_POSTGRES_PASSWORD=postgres -export SINGULARITYENV_POSTGRES_USER=postgres -export SINGULARITYENV_POSTGRES_DB=postgres +# Set environment variables (works for both singularity and apptainer) +if [ "$CONTAINER_CMD" = "apptainer" ]; then + export APPTAINERENV_POSTGRES_PASSWORD=postgres + export APPTAINERENV_POSTGRES_USER=postgres + export APPTAINERENV_POSTGRES_DB=postgres +else + export SINGULARITYENV_POSTGRES_PASSWORD=postgres + export SINGULARITYENV_POSTGRES_USER=postgres + export SINGULARITYENV_POSTGRES_DB=postgres +fi # Prepare bind mounts - only bind if files/directories exist BIND_MOUNTS="--bind $DATA_DIR:/var/lib/postgresql/data" BIND_MOUNTS="$BIND_MOUNTS --bind $RUN_DIR:/var/run/postgresql" BIND_MOUNTS="$BIND_MOUNTS --bind $LOG_DIR:/var/log" - -# Add optional bind mounts if they exist -if [ -f "./postgres/init-create-empty-databases.sh" ]; then - BIND_MOUNTS="$BIND_MOUNTS --bind ./postgres/init-create-empty-databases.sh:/docker-entrypoint-initdb.d/init-create-empty-databases.sh" -else - echo -e "${YELLOW}Warning: ./postgres/init-create-empty-databases.sh not found, skipping${NC}" -fi - -if [ -d "./scripts" ]; then - BIND_MOUNTS="$BIND_MOUNTS --bind ./scripts:/scripts" -else - echo -e "${YELLOW}Warning: ./scripts directory not found, skipping${NC}" -fi +BIND_MOUNTS="$BIND_MOUNTS --bind ./:/scripts" # Always bind INPUT_DIR BIND_MOUNTS="$BIND_MOUNTS --bind $INPUT_DIR:/data" @@ -91,14 +102,14 @@ fi echo -e "${YELLOW}Starting PostgreSQL instance...${NC}" echo -e "${YELLOW}Using INPUT_DIR: $INPUT_DIR${NC}" -singularity instance start $BIND_MOUNTS $POSTGRES_IMAGE $POSTGRES_INSTANCE +$CONTAINER_CMD instance start $BIND_MOUNTS $POSTGRES_IMAGE $POSTGRES_INSTANCE # Start PostgreSQL inside the instance echo -e "${YELLOW}Starting PostgreSQL server inside instance...${NC}" -singularity exec instance://$POSTGRES_INSTANCE bash -c "docker-entrypoint.sh postgres &" +$CONTAINER_CMD exec instance://$POSTGRES_INSTANCE bash -c "docker-entrypoint.sh postgres &" # Get the PID of the instance (optional, for compatibility) -INSTANCE_PID=$(pgrep -f 
"singularity.*$POSTGRES_INSTANCE" | head -1) +INSTANCE_PID=$(pgrep -f "${CONTAINER_CMD}.*$POSTGRES_INSTANCE" | head -1) if [ -n "$INSTANCE_PID" ]; then echo $INSTANCE_PID > postgres.pid fi @@ -106,9 +117,9 @@ fi # Wait for PostgreSQL to be ready echo -e "${YELLOW}Waiting for PostgreSQL to start...${NC}" for i in {1..30}; do - if singularity exec instance://$POSTGRES_INSTANCE pg_isready -h localhost -U postgres > /dev/null 2>&1; then + if $CONTAINER_CMD exec instance://$POSTGRES_INSTANCE pg_isready -h localhost -U postgres > /dev/null 2>&1; then echo -e "${GREEN}PostgreSQL is ready!${NC}" - echo -e "${GREEN}Connection: singularity exec instance://$POSTGRES_INSTANCE psql -h localhost -U postgres${NC}" + echo -e "${GREEN}Connection: $CONTAINER_CMD exec instance://$POSTGRES_INSTANCE psql -h localhost -U postgres${NC}" echo -e "${GREEN}Instance: $POSTGRES_INSTANCE${NC}" exit 0 fi @@ -117,5 +128,5 @@ for i in {1..30}; do done echo -e "${RED}PostgreSQL failed to start within 60 seconds${NC}" -singularity instance stop $POSTGRES_INSTANCE 2>/dev/null || true -exit 1 \ No newline at end of file +$CONTAINER_CMD instance stop $POSTGRES_INSTANCE 2>/dev/null || true +exit 1 diff --git a/scripts/postgres-status.sh b/scripts/postgres-status.sh index e3669616..8be6b04e 100755 --- a/scripts/postgres-status.sh +++ b/scripts/postgres-status.sh @@ -1,6 +1,16 @@ #!/bin/bash -# PostgreSQL status script for Singularity +# PostgreSQL status script for Singularity/Apptainer + +# Detect which container runtime is available +if command -v apptainer &> /dev/null; then + CONTAINER_CMD="apptainer" +elif command -v singularity &> /dev/null; then + CONTAINER_CMD="singularity" +else + echo "ERROR: Neither apptainer nor singularity found in PATH" + exit 1 +fi # Colors for output RED='\033[0;31m' @@ -31,20 +41,20 @@ if [ ! -f postgres.sif ]; then exit 1 fi -if singularity exec postgres.sif pg_isready -h localhost -U postgres > /dev/null 2>&1; then +if $CONTAINER_CMD exec postgres.sif pg_isready -h localhost -U postgres > /dev/null 2>&1; then echo -e "${GREEN}✓ PostgreSQL is accepting connections${NC}" - echo -e "${GREEN} Connection: singularity exec postgres.sif psql -h localhost -U postgres${NC}" + echo -e "${GREEN} Connection: ${CONTAINER_CMD} exec postgres.sif psql -h localhost -U postgres${NC}" POSTGRES_RUNNING=true # Show database list echo -e "${YELLOW}Databases:${NC}" - singularity exec postgres.sif psql -h localhost -U postgres -c "\l" 2>/dev/null | \ + $CONTAINER_CMD exec postgres.sif psql -h localhost -U postgres -c "\l" 2>/dev/null | \ grep -v template | grep -v "^-" | grep -v "^(" | grep -v "Name.*Owner" | \ grep -v "^\s*$" | head -10 # Show PostgreSQL version echo -e "${YELLOW}Version:${NC}" - singularity exec postgres.sif psql -h localhost -U postgres -c "SELECT version();" -t 2>/dev/null | head -1 + $CONTAINER_CMD exec postgres.sif psql -h localhost -U postgres -c "SELECT version();" -t 2>/dev/null | head -1 else echo -e "${RED}✗ PostgreSQL is not accepting connections${NC}" @@ -53,10 +63,10 @@ fi # Method 3: Check process list as fallback if [ "$POSTGRES_RUNNING" = false ]; then # Check for any postgres-related processes with more flexible patterns - if pgrep -f "postgres" > /dev/null || pgrep -f "singularity.*postgres" > /dev/null; then + if pgrep -f "postgres" > /dev/null || pgrep -f "${CONTAINER_CMD}.*postgres" > /dev/null; then echo -e "${YELLOW}! 
Found postgres-related process but cannot connect${NC}" echo -e "${YELLOW} Process list:${NC}" - ps aux | grep -E "(postgres|singularity)" | grep -v grep | head -5 + ps aux | grep -E "(postgres|${CONTAINER_CMD})" | grep -v grep | head -5 else echo -e "${RED}✗ No PostgreSQL processes found${NC}" fi diff --git a/scripts/postgres-stop.sh b/scripts/postgres-stop.sh index 729e8e25..6455df4e 100755 --- a/scripts/postgres-stop.sh +++ b/scripts/postgres-stop.sh @@ -1,6 +1,16 @@ #!/bin/bash -# PostgreSQL shutdown script for Singularity +# PostgreSQL shutdown script for Singularity/Apptainer + +# Detect which container runtime is available +if command -v apptainer &> /dev/null; then + CONTAINER_CMD="apptainer" +elif command -v singularity &> /dev/null; then + CONTAINER_CMD="singularity" +else + echo "ERROR: Neither apptainer nor singularity found in PATH" + exit 1 +fi # Colors for output RED='\033[0;31m' @@ -12,10 +22,10 @@ echo -e "${YELLOW}Stopping PostgreSQL...${NC}" STOPPED=false -# Method 1: Stop Singularity instance if it exists -if singularity instance list | grep -q "postgres_instance"; then - echo -e "${YELLOW}Stopping Singularity instance: postgres_instance${NC}" - singularity instance stop postgres_instance +# Method 1: Stop instance if it exists +if $CONTAINER_CMD instance list | grep -q "postgres_instance"; then + echo -e "${YELLOW}Stopping ${CONTAINER_CMD} instance: postgres_instance${NC}" + $CONTAINER_CMD instance stop postgres_instance STOPPED=true fi @@ -44,8 +54,8 @@ if [ -f postgres.pid ]; then rm postgres.pid fi -# Method 3: Fallback - kill any singularity postgres processes -if pkill -f "singularity.*postgres"; then +# Method 3: Fallback - kill any container postgres processes +if pkill -f "${CONTAINER_CMD}.*postgres"; then echo -e "${YELLOW}Killed remaining PostgreSQL processes${NC}" STOPPED=true fi From 4d52f2e3b18cd3d94df9a81b852537fa47bc0ae7 Mon Sep 17 00:00:00 2001 From: Lorenzo Guerra Date: Thu, 23 Oct 2025 17:07:48 +0200 Subject: [PATCH 32/33] =?UTF-8?q?Important=20fix:=20Velox=E2=80=99s=20Line?= =?UTF-8?q?arEncoder=20now=20correctly=20receives=20a=20tuple=20of=20the?= =?UTF-8?q?=20x=5Fsrc=20and=20x=5Fdst=20input=20tensors.=20Previously,=20x?= =?UTF-8?q?=5Fis=5Ftuple=20was=20always=20False=20by=20default,=20causing?= =?UTF-8?q?=20x=5Fsrc=20and=20x=5Fdst=20to=20be=20mixed=20up=20by=20torch.?= =?UTF-8?q?scatter.=20This=20led=20to=20the=20node=20pairs=20being=20merge?= =?UTF-8?q?d,=20resulting=20in=20a=20single=20tensor=20of=20shape=20(N,=20?= =?UTF-8?q?d)=20inside=20LinearEncoder=20instead=20of=20two=20separate=20t?= =?UTF-8?q?ensors.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- config/default.yml | 1 + config/velox.yml | 1 + pidsmaker/config/config.py | 1 + pidsmaker/encoders/linear_encoder.py | 3 ++- pidsmaker/model.py | 13 ++++++++----- pidsmaker/utils/data_utils.py | 20 +++++++++++--------- 6 files changed, 24 insertions(+), 15 deletions(-) diff --git a/config/default.yml b/config/default.yml index 544bd512..09910c33 100644 --- a/config/default.yml +++ b/config/default.yml @@ -92,6 +92,7 @@ detection: activation: prelu custom_mlp: architecture_str: none + x_is_tuple: False decoder: predict_edge_type: decoder: edge_mlp diff --git a/config/velox.yml b/config/velox.yml index 372f0b23..98892957 100644 --- a/config/velox.yml +++ b/config/velox.yml @@ -9,3 +9,4 @@ detection: gnn_training: encoder: used_methods: none # uses only a linear layer + x_is_tuple: True diff --git a/pidsmaker/config/config.py 
b/pidsmaker/config/config.py index b80d3f85..d0b05caa 100644 --- a/pidsmaker/config/config.py +++ b/pidsmaker/config/config.py @@ -730,6 +730,7 @@ def __init__(self, type, vals: list = None, desc: str = None): vals=AND(list(ENCODERS_CFG.keys())), desc="First part of the neural network. Usually GNN encoders to capture complex patterns.", ), + "x_is_tuple": Arg(bool), **ENCODERS_CFG, }, "decoder": { diff --git a/pidsmaker/encoders/linear_encoder.py b/pidsmaker/encoders/linear_encoder.py index 5714993e..2f5dbe10 100644 --- a/pidsmaker/encoders/linear_encoder.py +++ b/pidsmaker/encoders/linear_encoder.py @@ -8,7 +8,8 @@ def __init__(self, in_dim, out_dim, dropout=0.0): self.dropout = nn.Dropout(dropout) def forward(self, x, *args, **kwargs): - if isinstance(x, tuple): + # Handle both tuples and lists (PyG batching may convert tuples to lists) + if isinstance(x, (tuple, list)): h = self.dropout(self.lin1(x[0])), self.dropout(self.lin1(x[1])) else: h = self.dropout(self.lin1(x)) diff --git a/pidsmaker/model.py b/pidsmaker/model.py index 0a09eb5f..9eb7a54a 100644 --- a/pidsmaker/model.py +++ b/pidsmaker/model.py @@ -129,11 +129,14 @@ def gather_h(self, batch, res): h_dst = res.get("h_dst", None) if None in [h_src, h_dst]: - h_src, h_dst = ( - (h[batch.edge_index[0]], h[batch.edge_index[1]]) - if isinstance(h, torch.Tensor) - else h - ) + if isinstance(h, torch.Tensor): + # h is a single tensor with node embeddings - index by edge_index + h_src, h_dst = h[batch.edge_index[0]], h[batch.edge_index[1]] + elif isinstance(h, (tuple, list)): + # h is (h_src_nodes, h_dst_nodes) with separate node embeddings - index each + h_src, h_dst = h[0][batch.edge_index[0]], h[1][batch.edge_index[1]] + else: + h_src, h_dst = h return h, h_src, h_dst diff --git a/pidsmaker/utils/data_utils.py b/pidsmaker/utils/data_utils.py index 0b914a76..16dfa835 100644 --- a/pidsmaker/utils/data_utils.py +++ b/pidsmaker/utils/data_utils.py @@ -522,7 +522,13 @@ def run_reindexing_preprocessing(datasets, graph_reindexer, device, cfg): log_dataset_stats(datasets) # By default we only have x_src and x_dst of shape (E, d), here we create x of shape (N, d) use_tgn = "tgn" in cfg.detection.gnn_training.encoder.used_methods - reindex_graphs(datasets, graph_reindexer, device, use_tgn) + reindex_graphs( + datasets, + graph_reindexer, + device, + use_tgn, + x_is_tuple=cfg.detection.gnn_training.encoder.x_is_tuple, + ) return datasets @@ -782,7 +788,7 @@ def node_features_reshape(self, edge_index, x_src, x_dst, max_num_node=None, x_i scatter(x_dst, edge_index[1], out=output, dim=0, reduce="mean") x_dst_result = output.clone() - return x_src_result[:max_num_node], x_dst_result[:max_num_node] + return (x_src_result[:max_num_node], x_dst_result[:max_num_node]) else: if self.fix_buggy_graph_reindexer: output = output.clone() @@ -811,15 +817,11 @@ def reindex_graph(self, data, x_is_tuple=False, use_tgn=False): data.edge_index, data.x_src, data.x_dst, x_is_tuple=x_is_tuple ) data.original_n_id = n_id + data.x = x if not use_tgn: data.src, data.dst = edge_index[0], edge_index[1] - if x_is_tuple: - data.x_src, data.x_dst = x - else: - data.x = x - data.node_type, *_ = self._reindex_graph( data.edge_index, data.node_type_src, data.node_type_dst, x_is_tuple=False ) @@ -912,10 +914,10 @@ def load_model(model, path: str, cfg, map_location=None): return model -def reindex_graphs(datasets, graph_reindexer, device, use_tgn): +def reindex_graphs(datasets, graph_reindexer, device, use_tgn, x_is_tuple=False): for dataset in datasets: for data_list in dataset: 
for batch in log_tqdm(data_list, desc="Reindexing graphs"): batch.to(device) - graph_reindexer.reindex_graph(batch, use_tgn=use_tgn) + graph_reindexer.reindex_graph(batch, use_tgn=use_tgn, x_is_tuple=x_is_tuple) batch.to("cpu") From b68ef86eb649d8588a6f22609cdd394e5cadc814 Mon Sep 17 00:00:00 2001 From: tristan Date: Fri, 24 Oct 2025 00:37:44 +0000 Subject: [PATCH 33/33] add missing desc --- pidsmaker/config/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pidsmaker/config/config.py b/pidsmaker/config/config.py index d0b05caa..15c27b49 100644 --- a/pidsmaker/config/config.py +++ b/pidsmaker/config/config.py @@ -730,7 +730,7 @@ def __init__(self, type, vals: list = None, desc: str = None): vals=AND(list(ENCODERS_CFG.keys())), desc="First part of the neural network. Usually GNN encoders to capture complex patterns.", ), - "x_is_tuple": Arg(bool), + "x_is_tuple": Arg(bool, desc="Whether to consider nodes differently when being source or destination."), **ENCODERS_CFG, }, "decoder": {
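
The following minimal sketch is not part of the patch series; it only illustrates the tuple-aware encoder behavior that PATCH 32/33 describe for Velox's LinearEncoder (separate projection of `x_src` and `x_dst` when `x_is_tuple` is enabled, instead of a single merged `(N, d)` tensor). The class name `TupleAwareLinearEncoder` and the tensor shapes are assumptions for demonstration, not the project's exact API.

```python
import torch
import torch.nn as nn


class TupleAwareLinearEncoder(nn.Module):
    """Simplified sketch: accepts either a single node-feature tensor or an
    (x_src, x_dst) pair and projects each part independently."""

    def __init__(self, in_dim: int, out_dim: int, dropout: float = 0.0):
        super().__init__()
        self.lin1 = nn.Linear(in_dim, out_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # PyG batching may convert tuples to lists, so accept both.
        if isinstance(x, (tuple, list)):
            return self.dropout(self.lin1(x[0])), self.dropout(self.lin1(x[1]))
        return self.dropout(self.lin1(x))


# Illustrative usage: source/destination features stay separate when passed as a pair.
enc = TupleAwareLinearEncoder(in_dim=8, out_dim=4)
x_src, x_dst = torch.randn(10, 8), torch.randn(10, 8)
h_src, h_dst = enc((x_src, x_dst))   # two (10, 4) tensors, not merged
h = enc(torch.randn(10, 8))          # single (10, 4) tensor
```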