nf-core · dbaku42 · Apr 29, 2026 · May 4, 2026 · May 4, 2026 · May 4, 2026
diff --git a/modules/nf-core/custom/clustering/environment.yml b/modules/nf-core/custom/clustering/environment.yml
@@ -0,0 +1,11 @@
+---
+# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json
+channels:
+  - conda-forge
+  - bioconda
+dependencies:
+  - conda-forge::numpy=2.4.4
+  - conda-forge::pandas=3.0.3
+  - conda-forge::python=3.12.13
+  - conda-forge::pyyaml=6.0.3 # imported by templates/clustering.py to write versions.yml
+  - conda-forge::scikit-learn=1.8.0 # supplies the KMeans and DBSCAN estimators used by the template
diff --git a/modules/nf-core/custom/clustering/main.nf b/modules/nf-core/custom/clustering/main.nf
@@ -0,0 +1,42 @@
+process CUSTOM_CLUSTERING {
+    tag "$meta.id"
+    label 'process_medium'
+
+    conda "${moduleDir}/environment.yml"
+    container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ?
+        'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/a3/a37807bdaf3edad30a2b212962b6af381bc10381a80c40efb2bb07f6ee43032f/data' :
+        'community.wave.seqera.io/library/numpy_pandas_python_pyyaml_scikit-learn:c500ceb82d3d7606' }"
+
+    input:
+    tuple val(meta), path(eigenvec) // header row must begin with IID or FID IID, otherwise the template raises ValueError
+    val algorithm // 'kmeans' or 'dbscan'; anything else makes the template raise ValueError
+    val n_clusters // converted with int() by the template and handed to KMeans
+    val dbscan_eps // converted with float() by the template and handed to DBSCAN
+    val dbscan_min_samples // converted with int() by the template and handed to DBSCAN
+
+    output:
+    tuple val(meta), path("*.clusters.csv")        , emit: clusters
+    tuple val(meta), path("*.clustering_info.json"), emit: info
+    path "versions.yml"                            , emit: versions, topic: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    template 'clustering.py' // the Python template writes both result files and versions.yml itself
+
+    stub:
+    def prefix = task.ext.prefix ?: "${meta.id}" // same fallback the template uses for its file prefix
+    """
+    touch ${prefix}.clusters.csv
+    touch ${prefix}.clustering_info.json
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        python: \$(python3 --version | sed 's/Python //')
+        numpy: \$(python3 -c "from importlib.metadata import version; print(version('numpy'))")
+        pandas: \$(python3 -c "from importlib.metadata import version; print(version('pandas'))")
+        scikit-learn: \$(python3 -c "from importlib.metadata import version; print(version('scikit-learn'))")
+    END_VERSIONS
+    """
+}
diff --git a/modules/nf-core/custom/clustering/meta.yml b/modules/nf-core/custom/clustering/meta.yml
@@ -0,0 +1,77 @@
+name: "CUSTOM_CLUSTERING"
+description: "Performs KMeans or DBSCAN clustering on principal components from PLINK2
+  --pca"
+keywords:
+  - clustering
+  - pca
+  - kmeans
+  - dbscan
+  - principal-components
+tools:
+  - "scikit-learn":
+      description: "Machine learning library for clustering"
+      homepage: "https://scikit-learn.org/"
+      documentation: "https://scikit-learn.org/stable/modules/clustering.html"
+      licence:
+        - "BSD-3-Clause"
+      identifier: "" # intentionally empty; no identifier is recorded for this tool
+input:
+  - - meta:
+        type: map
+        description: |
+          Groovy Map containing sample information
+          e.g. `[ id:'sample1' ]`
+    - eigenvec:
+        type: file
+        description: PLINK2 .eigenvec file generated by --pca
+        pattern: "*.eigenvec"
+        ontologies: []
+  - algorithm: # compared verbatim against "kmeans" / "dbscan" in templates/clustering.py
+      type: string
+      description: Clustering algorithm to use (kmeans or dbscan)
+  - n_clusters: # converted with int() in the template
+      type: integer
+      description: Number of clusters for KMeans
+  - dbscan_eps: # converted with float() in the template
+      type: float
+      description: Epsilon parameter for DBSCAN
+  - dbscan_min_samples: # converted with int() in the template
+      type: integer
+      description: Minimum samples parameter for DBSCAN
+output:
+  clusters:
+    - - meta:
+          type: map
+          description: Groovy Map containing sample information
+      - "*.clusters.csv":
+          type: file
+          description: CSV file with sample_id and assigned cluster
+          pattern: "*.clusters.csv"
+          ontologies:
+            - edam: http://edamontology.org/format_3752 # CSV
+  info:
+    - - meta:
+          type: map
+          description: Groovy Map containing sample information
+      - "*.clustering_info.json":
+          type: file
+          description: JSON file with clustering parameters and statistics
+          pattern: "*.clustering_info.json"
+          ontologies:
+            - edam: http://edamontology.org/format_3464 # JSON
+  versions:
+    - "versions.yml":
+        type: file
+        description: File containing software versions
+        pattern: "versions.yml"
+        ontologies:
+          - edam: http://edamontology.org/format_3750 # YAML
+topics:
+  versions:
+    - versions.yml:
+        type: file # main.nf publishes versions.yml to this topic as a path, not a string value
+        description: File containing software versions
+authors:
+  - "@dbaku42"
+maintainers:
+  - "@dbaku42"
diff --git a/modules/nf-core/custom/clustering/templates/clustering.py b/modules/nf-core/custom/clustering/templates/clustering.py
@@ -0,0 +1,83 @@
+#!/usr/bin/env python3
+
+import json
+import platform
+
+import numpy as np
+import pandas as pd
+import sklearn
+import yaml
+from sklearn.cluster import DBSCAN, KMeans
+
+
+def parse_eigenvec(path):
+    """Parse a PLINK2 .eigenvec file into (sample_ids: pd.Series, pcs: np.ndarray).
+
+    Accepts the FID/IID and IID-only header layouts PLINK2 emits, plus the
+    leading '#' on the header line. Sample IDs are read from the IID column.
+    """
+    df = pd.read_csv(path, sep=r"\\s+", engine="python")
+    df.columns = [c.lstrip("#") for c in df.columns]
+    cols_upper = [c.upper() for c in df.columns]
+    if cols_upper[:2] == ["FID", "IID"]:
+        id_cols = df.columns[:2]
+    elif cols_upper[:1] == ["IID"]:
+        id_cols = df.columns[:1]
+    else:
+        raise ValueError(f"eigenvec file missing IID header: {list(df.columns)}")
+    sample_ids = df["IID"].astype(str)
+    pcs = df.drop(columns=id_cols).to_numpy(dtype=float)
+    return sample_ids, pcs
+
+
+def main():
+    eigenvec = "$eigenvec"
+    algorithm = "$algorithm"
+    n_clusters = int("$n_clusters")
+    dbscan_eps = float("$dbscan_eps")
+    dbscan_min_samples = int("$dbscan_min_samples")
+    prefix = "${task.ext.prefix ?: meta.id}"
+
+    sample_ids, x = parse_eigenvec(eigenvec)
+
+    if algorithm == "kmeans":
+        model = KMeans(n_clusters=n_clusters, init="random", n_init=100, random_state=42)
+        labels = model.fit_predict(x)
+        info = {
+            "algorithm": "kmeans",
+            "k": n_clusters,
+            "inertia": float(model.inertia_),
+        }
+    elif algorithm == "dbscan":
+        model = DBSCAN(eps=dbscan_eps, min_samples=dbscan_min_samples)
+        labels = model.fit_predict(x)
+        info = {
+            "algorithm": "dbscan",
+            "eps": dbscan_eps,
+            "min_samples": dbscan_min_samples,
+            "n_clusters_found": len(set(labels) - {-1}),
+            "n_noise": int(np.sum(labels == -1)),
+        }
+    else:
+        raise ValueError(f"Unknown algorithm '{algorithm}' (expected 'kmeans' or 'dbscan')")
+
+    info |= {"n_samples": int(x.shape[0]), "n_features": int(x.shape[1])}
+
+    pd.DataFrame({"sample_id": sample_ids, "cluster": labels}).to_csv(f"{prefix}.clusters.csv", index=False)
+    with open(f"{prefix}.clustering_info.json", "w") as fh:
+        json.dump(info, fh, indent=2)
+
+    versions = {
+        "${task.process}": {
+            "python": platform.python_version(),
+            "pandas": pd.__version__,
+            "numpy": np.__version__,
+            "scikit-learn": sklearn.__version__,
+        }
+    }
+    with open("versions.yml", "w") as fh:
+        yaml.dump(versions, fh, default_flow_style=False, sort_keys=False)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/modules/nf-core/custom/clustering/tests/data/test.eigenvec b/modules/nf-core/custom/clustering/tests/data/test.eigenvec
@@ -0,0 +1,6 @@
+#FID	IID	PC1	PC2	PC3
+0	sample01	0.1234	0.5678	0.9012
+0	sample02	-0.2345	0.6789	-0.0123
+0	sample03	0.3456	-0.7890	0.1234
+0	sample04	-0.4567	0.8901	-0.2345
+0	sample05	0.5678	-0.9012	0.3456
diff --git a/modules/nf-core/custom/clustering/tests/main.nf.test b/modules/nf-core/custom/clustering/tests/main.nf.test
@@ -0,0 +1,85 @@
+nextflow_process {
+    name "Test Process CUSTOM_CLUSTERING"
+    script "../main.nf"
+    process "CUSTOM_CLUSTERING"
+
+    tag "modules"
+    tag "modules_nfcore"
+    tag "custom"
+    tag "custom/clustering"
+
+    test("clustering - eigenvec") { // KMeans with k=3 on the 5-sample fixture in tests/data/test.eigenvec
+        when {
+            process {
+                """
+                input[0] = [ [id:'test'], file("${projectDir}/modules/nf-core/custom/clustering/tests/data/test.eigenvec", checkIfExists: true) ]
+                input[1] = 'kmeans'
+                input[2] = 3
+                input[3] = 0.5
+                input[4] = 5
+                """
+            }
+        }
+        then {
+            // KMeans inertia varies by a few ULPs across CPU instruction sets
+            // (BLAS reduction order), so parse the JSON and round it for the
+            // snapshot. Production output keeps full precision.
+            def info_data = new groovy.json.JsonSlurper().parse(file(process.out.info[0][1])) // [0][1]: first emission, JSON path element of the tuple
+            info_data.inertia = (info_data.inertia as Double).round(4)
+
+            assertAll(
+                { assert process.success },
+                { assert snapshot(
+                    process.out.clusters,
+                    info_data,
+                    process.out.versions
+                ).match() }
+            )
+        }
+    }
+
+    test("clustering - eigenvec - dbscan") { // DBSCAN with eps=0.5 and min_samples=2; input[2] is not passed to the DBSCAN model
+        when {
+            process {
+                """
+                input[0] = [ [id:'test'], file("${projectDir}/modules/nf-core/custom/clustering/tests/data/test.eigenvec", checkIfExists: true) ]
+                input[1] = 'dbscan'
+                input[2] = 3
+                input[3] = 0.5
+                input[4] = 2
+                """
+            }
+        }
+        then {
+            assertAll(
+                { assert process.success },
+                { assert snapshot(
+                    process.out.clusters,
+                    process.out.info, // no rounding step here: the DBSCAN info JSON is snapshotted as written
+                    process.out.versions
+                ).match() }
+            )
+        }
+    }
+
+    test("clustering - eigenvec - stub") {
+        options "-stub" // runs the stub block of main.nf, which only touches empty result files
+        when {
+            process {
+                """
+                input[0] = [ [id:'test'], file("${projectDir}/modules/nf-core/custom/clustering/tests/data/test.eigenvec", checkIfExists: true) ]
+                input[1] = 'kmeans'
+                input[2] = 3
+                input[3] = 0.5
+                input[4] = 5
+                """
+            }
+        }
+        then {
+            assertAll(
+                { assert process.success },
+                { assert snapshot(process.out).match() }
+            )
+        }
+    }
+}