Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
46 commits
Select commit Hold shift + click to select a range
9abde15
New modules: cluster_metrics + cluster_viz
dbaku42 Apr 29, 2026
0320a41
Move custom clustering modules under custom
dbaku42 May 4, 2026
e424338
Move custom clustering modules under custom
dbaku42 May 4, 2026
670bc33
Fix custom clustering module lint
dbaku42 May 4, 2026
3a26119
Fix custom clustering module lint and snapshots
dbaku42 May 4, 2026
e4877e4
Address review comments for clustering custom modules
dbaku42 May 6, 2026
7e19b22
Fix custom clustering module metadata
dbaku42 May 6, 2026
fdb1be4
Add Dockerfile for custom/clustermetrics and custom/clustervisualiation
dbaku42 May 7, 2026
d3a382b
Add container directive for custom/clustermetrics and clustervisualia…
dbaku42 May 7, 2026
89287db
Update modules/nf-core/custom/clustermetrics/main.nf
dbaku42 May 8, 2026
d5ac668
Update modules/nf-core/custom/clustermetrics/main.nf
dbaku42 May 8, 2026
55e868a
Update modules/nf-core/custom/clustermetrics/main.nf
dbaku42 May 8, 2026
e492edb
Update modules/nf-core/custom/clustermetrics/main.nf
dbaku42 May 8, 2026
b8529f6
Update modules/nf-core/custom/clustervisualiation/templates/cluster_v…
dbaku42 May 8, 2026
12bb00d
Update modules/nf-core/custom/clustervisualiation/templates/cluster_v…
dbaku42 May 8, 2026
a4a01a3
Update modules/nf-core/custom/clustervisualiation/templates/cluster_v…
dbaku42 May 8, 2026
060eb5d
fix: use template for cluster visualization module
dbaku42 May 8, 2026
d48a224
style: clean cluster visualization module main
dbaku42 May 8, 2026
2d85309
fix: address reviewer feedback for cluster modules
dbaku42 May 8, 2026
387cbeb
fix: address pinin4fjords follow-up review - template escaping, drop …
dbaku42 May 11, 2026
4ed380b
feat(custom/clustervisualization): add UMAP and t-SNE cluster visuali…
dbaku42 May 11, 2026
4fe91e7
fix: apply ruff formatting to cluster_viz.py template
dbaku42 May 11, 2026
aaded95
Merge branch 'master' into add-cluster-metrics-viz
dbaku42 May 11, 2026
0980208
fix: align clustermetrics and clustervisualization envs and containers
dbaku42 May 11, 2026
f240623
fix: use docker:// prefix for singularity container to enable OCI con…
dbaku42 May 12, 2026
c46c26c
fix(custom/clustervisualization): set NUMBA_CACHE_DIR and MPLCONFIGDI…
dbaku42 May 12, 2026
ad39971
fix(custom/clustervisualization): move NUMBA_CACHE_DIR fix before any…
dbaku42 May 12, 2026
c8fec15
Apply suggestion from @pinin4fjords
dbaku42 May 12, 2026
3634466
Apply suggestion from @pinin4fjords
dbaku42 May 12, 2026
0ca17c9
Merge branch 'master' into add-cluster-metrics-viz
dbaku42 May 12, 2026
6a7ac45
Prek and script fix
dbaku42 May 12, 2026
cddb5a8
Fixed pandas series problem in cluster_metrics.py
dbaku42 May 13, 2026
4a72291
fix: escape \n in f-strings for Groovy template compatibility
dbaku42 May 13, 2026
2bf8afa
Format CUSTOM_CLUSTERMETRICS template with ruff
dbaku42 May 13, 2026
fd15709
fix(clustermetrics,clustervisualization): update nf-test snapshots an…
dbaku42 May 13, 2026
792373b
Merge branch 'master' into add-cluster-metrics-viz
dbaku42 May 13, 2026
6114ebd
fix environment conflict
dbaku42 May 13, 2026
8348609
feat: add custom clustering and metrics modules
dbaku42 May 13, 2026
f1915bd
Merge branch 'master' into add-cluster-metrics-viz
dbaku42 May 13, 2026
ac9a4ba
Clean up cluster modules and fix CI
pinin4fjords May 14, 2026
ea89bb2
Use yaml.dump for versions.yml in cluster modules
pinin4fjords May 14, 2026
51a747b
Fix portability of CUSTOM_CLUSTERING and CUSTOM_CLUSTERVISUALIZATION CI
pinin4fjords May 14, 2026
39baf33
Tidy cluster module templates
pinin4fjords May 14, 2026
465c8e4
Revert .gitignore drift unrelated to the PR
pinin4fjords May 14, 2026
9870458
Address findings from fresh review pass
pinin4fjords May 14, 2026
3f5a5fa
Align modules with nf-core spec: ext.args + dot-separated outputs
pinin4fjords May 14, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions modules/nf-core/custom/clustering/environment.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
---
# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json
# Conda environment for the CUSTOM_CLUSTERING module (referenced from main.nf
# via the `conda` directive).
channels:
- conda-forge
- bioconda
# Exact version pins. numpy, pandas and scikit-learn are imported by
# templates/clustering.py; pyyaml is what that template uses to write
# versions.yml. The same package set appears in the container name in main.nf.
dependencies:
- conda-forge::numpy=2.4.4
- conda-forge::pandas=3.0.3
- conda-forge::python=3.12.13
- conda-forge::pyyaml=6.0.3
- conda-forge::scikit-learn=1.8.0
42 changes: 42 additions & 0 deletions modules/nf-core/custom/clustering/main.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
process CUSTOM_CLUSTERING {
    // Clusters the principal components of a PLINK2 .eigenvec file with KMeans
    // or DBSCAN. The work itself is done by templates/clustering.py.
    tag "$meta.id"
    label 'process_medium'

    conda "${moduleDir}/environment.yml"
    // Singularity/Apptainer use the HTTPS image blob unless
    // ext.singularity_pull_docker_container is set; every other engine uses the
    // community.wave.seqera.io image.
    container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ?
        'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/a3/a37807bdaf3edad30a2b212962b6af381bc10381a80c40efb2bb07f6ee43032f/data' :
        'community.wave.seqera.io/library/numpy_pandas_python_pyyaml_scikit-learn:c500ceb82d3d7606' }"

    input:
    tuple val(meta), path(eigenvec) // sample map and the PLINK2 .eigenvec file
    val algorithm                   // 'kmeans' or 'dbscan'
    val n_clusters                  // number of clusters, used by KMeans
    val dbscan_eps                  // eps, used by DBSCAN
    val dbscan_min_samples          // min_samples, used by DBSCAN

    output:
    tuple val(meta), path("*.clusters.csv")        , emit: clusters
    tuple val(meta), path("*.clustering_info.json"), emit: info
    path "versions.yml"                            , emit: versions, topic: versions

    when:
    task.ext.when == null || task.ext.when

    script:
    template 'clustering.py'

    stub:
    // Creates empty placeholders for both outputs. The tool versions are
    // indented beneath the process key so versions.yml has the same
    // process -> tool -> version nesting that the template writes.
    def prefix = task.ext.prefix ?: "${meta.id}"
    """
    touch ${prefix}.clusters.csv
    touch ${prefix}.clustering_info.json

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        python: \$(python3 --version | sed 's/Python //')
        numpy: \$(python3 -c "from importlib.metadata import version; print(version('numpy'))")
        pandas: \$(python3 -c "from importlib.metadata import version; print(version('pandas'))")
        scikit-learn: \$(python3 -c "from importlib.metadata import version; print(version('scikit-learn'))")
    END_VERSIONS
    """
}
77 changes: 77 additions & 0 deletions modules/nf-core/custom/clustering/meta.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
name: "CUSTOM_CLUSTERING"
description: "Performs KMeans or DBSCAN clustering on principal components from PLINK2
--pca"
keywords:
- clustering
- pca
- kmeans
- dbscan
- principal-components
tools:
- "scikit-learn":
description: "Machine learning library for clustering"
homepage: "https://scikit-learn.org/"
documentation: "https://scikit-learn.org/stable/modules/clustering.html"
licence:
- "BSD-3-Clause"
identifier: ""
input:
- - meta:
type: map
description: |
Groovy Map containing sample information
e.g. `[ id:'sample1' ]`
- eigenvec:
type: file
description: PLINK2 .eigenvec file generated by --pca
pattern: "*.eigenvec"
ontologies: []
- algorithm:
type: string
description: Clustering algorithm to use (kmeans or dbscan)
- n_clusters:
type: integer
description: Number of clusters for KMeans
- dbscan_eps:
type: float
description: Epsilon parameter for DBSCAN
- dbscan_min_samples:
type: integer
description: Minimum samples parameter for DBSCAN
output:
clusters:
- - meta:
type: map
description: Groovy Map containing sample information
- "*.clusters.csv":
type: file
description: CSV file with sample_id and assigned cluster
pattern: "*.clusters.csv"
ontologies:
- edam: http://edamontology.org/format_3752
info:
- - meta:
type: map
description: Groovy Map containing sample information
- "*.clustering_info.json":
type: file
description: JSON file with clustering parameters and statistics
pattern: "*.clustering_info.json"
ontologies:
- edam: http://edamontology.org/format_3464
versions:
- "versions.yml":
type: file
description: File containing software versions
pattern: "versions.yml"
ontologies:
- edam: http://edamontology.org/format_3750
topics:
versions:
- versions.yml:
type: file
description: File containing software versions
authors:
- "@dbaku42"
maintainers:
- "@dbaku42"
83 changes: 83 additions & 0 deletions modules/nf-core/custom/clustering/templates/clustering.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
#!/usr/bin/env python3

import json
import platform

import numpy as np
import pandas as pd
import sklearn
import yaml
from sklearn.cluster import DBSCAN, KMeans


def parse_eigenvec(path):
    """Parse a PLINK2 .eigenvec file into (sample_ids: pd.Series, pcs: np.ndarray).

    Accepts the FID/IID and IID-only header layouts PLINK2 emits, plus the
    leading '#' on the header line. Header names are matched
    case-insensitively, and sample IDs are read from the IID column however
    it is capitalised in the file.

    Args:
        path: Path to the delimited .eigenvec file.

    Returns:
        A tuple of the IID values as a string Series and a float array holding
        every non-ID column, one row per sample.

    Raises:
        ValueError: If the header does not start with FID/IID or with IID.
    """
    # The doubled backslash is deliberate: this file is a Nextflow template and
    # the escape is expected to collapse to a single backslash when rendered,
    # giving a whitespace separator.
    df = pd.read_csv(path, sep=r"\\s+", engine="python")
    df.columns = [c.lstrip("#") for c in df.columns]
    cols_upper = [c.upper() for c in df.columns]
    if cols_upper[:2] == ["FID", "IID"]:
        id_cols = df.columns[:2]
    elif cols_upper[:1] == ["IID"]:
        id_cols = df.columns[:1]
    else:
        raise ValueError(f"eigenvec file missing IID header: {list(df.columns)}")
    # IID is always the last identifier column. Looking it up through the label
    # found above keeps the lookup consistent with the case-insensitive check;
    # a literal "IID" key would raise KeyError for a lower-case header.
    iid_col = id_cols[-1]
    sample_ids = df[iid_col].astype(str)
    pcs = df.drop(columns=id_cols).to_numpy(dtype=float)
    return sample_ids, pcs


def main():
    """Cluster the samples of a PLINK2 .eigenvec file and write the module outputs.

    The quoted placeholders below are filled in by Nextflow when it renders
    this template. Three files are written to the working directory:
    <prefix>.clusters.csv (sample_id and cluster label per sample),
    <prefix>.clustering_info.json (parameters and summary statistics) and
    versions.yml (versions of Python and the libraries used).

    Raises:
        ValueError: If the algorithm is neither 'kmeans' nor 'dbscan', or if a
            parameter required by the selected algorithm is not a valid number.
    """
    eigenvec = "$eigenvec"
    algorithm = "$algorithm"
    prefix = "${task.ext.prefix ?: meta.id}"
    # Held as text and converted only inside the branch that uses them, so a
    # DBSCAN run is not aborted by an empty or null KMeans setting, and a
    # KMeans run is not aborted by empty DBSCAN settings.
    n_clusters_raw = "$n_clusters"
    dbscan_eps_raw = "$dbscan_eps"
    dbscan_min_samples_raw = "$dbscan_min_samples"

    sample_ids, x = parse_eigenvec(eigenvec)

    if algorithm == "kmeans":
        n_clusters = int(n_clusters_raw)
        # Fixed random_state keeps the label assignment reproducible across runs.
        model = KMeans(n_clusters=n_clusters, init="random", n_init=100, random_state=42)
        labels = model.fit_predict(x)
        info = {
            "algorithm": "kmeans",
            "k": n_clusters,
            "inertia": float(model.inertia_),
        }
    elif algorithm == "dbscan":
        dbscan_eps = float(dbscan_eps_raw)
        dbscan_min_samples = int(dbscan_min_samples_raw)
        model = DBSCAN(eps=dbscan_eps, min_samples=dbscan_min_samples)
        labels = model.fit_predict(x)
        info = {
            "algorithm": "dbscan",
            "eps": dbscan_eps,
            "min_samples": dbscan_min_samples,
            # The label -1 is counted as noise, not as a cluster.
            "n_clusters_found": len(set(labels) - {-1}),
            "n_noise": int(np.sum(labels == -1)),
        }
    else:
        raise ValueError(f"Unknown algorithm '{algorithm}' (expected 'kmeans' or 'dbscan')")

    # Appended last so both algorithms report the data shape after their own keys.
    info |= {"n_samples": int(x.shape[0]), "n_features": int(x.shape[1])}

    pd.DataFrame({"sample_id": sample_ids, "cluster": labels}).to_csv(f"{prefix}.clusters.csv", index=False)
    with open(f"{prefix}.clustering_info.json", "w") as fh:
        json.dump(info, fh, indent=2)

    versions = {
        "${task.process}": {
            "python": platform.python_version(),
            "pandas": pd.__version__,
            "numpy": np.__version__,
            "scikit-learn": sklearn.__version__,
        }
    }
    with open("versions.yml", "w") as fh:
        yaml.dump(versions, fh, default_flow_style=False, sort_keys=False)


if __name__ == "__main__":
    main()
6 changes: 6 additions & 0 deletions modules/nf-core/custom/clustering/tests/data/test.eigenvec
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
#FID IID PC1 PC2 PC3
0 sample01 0.1234 0.5678 0.9012
0 sample02 -0.2345 0.6789 -0.0123
0 sample03 0.3456 -0.7890 0.1234
0 sample04 -0.4567 0.8901 -0.2345
0 sample05 0.5678 -0.9012 0.3456
85 changes: 85 additions & 0 deletions modules/nf-core/custom/clustering/tests/main.nf.test
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
// nf-test spec for CUSTOM_CLUSTERING: one KMeans run, one DBSCAN run and one
// stub run, all on the five-sample fixture in tests/data/test.eigenvec.
nextflow_process {
    name "Test Process CUSTOM_CLUSTERING"
    script "../main.nf"
    process "CUSTOM_CLUSTERING"

    tag "modules"
    tag "modules_nfcore"
    tag "custom"
    tag "custom/clustering"

    test("clustering - eigenvec") {
        when {
            process {
                """
                input[0] = [ [id:'test'], file("${projectDir}/modules/nf-core/custom/clustering/tests/data/test.eigenvec", checkIfExists: true) ]
                input[1] = 'kmeans'
                input[2] = 3
                input[3] = 0.5
                input[4] = 5
                """
            }
        }
        then {
            // Check success before touching the outputs: otherwise a failed
            // process would surface as an index/null error from
            // process.out.info below instead of as a process failure.
            assert process.success

            // KMeans inertia varies by a few ULPs across CPU instruction sets
            // (BLAS reduction order), so parse the JSON and round it for the
            // snapshot. Production output keeps full precision.
            def info_data = new groovy.json.JsonSlurper().parse(file(process.out.info[0][1]))
            info_data.inertia = (info_data.inertia as Double).round(4)

            assertAll(
                { assert process.success },
                { assert snapshot(
                    process.out.clusters,
                    info_data,
                    process.out.versions
                ).match() }
            )
        }
    }

    test("clustering - eigenvec - dbscan") {
        when {
            process {
                """
                input[0] = [ [id:'test'], file("${projectDir}/modules/nf-core/custom/clustering/tests/data/test.eigenvec", checkIfExists: true) ]
                input[1] = 'dbscan'
                input[2] = 3
                input[3] = 0.5
                input[4] = 2
                """
            }
        }
        then {
            // The info JSON is snapshotted as a file here; it holds no
            // inertia value that would need rounding.
            assertAll(
                { assert process.success },
                { assert snapshot(
                    process.out.clusters,
                    process.out.info,
                    process.out.versions
                ).match() }
            )
        }
    }

    test("clustering - eigenvec - stub") {
        options "-stub"
        when {
            process {
                """
                input[0] = [ [id:'test'], file("${projectDir}/modules/nf-core/custom/clustering/tests/data/test.eigenvec", checkIfExists: true) ]
                input[1] = 'kmeans'
                input[2] = 3
                input[3] = 0.5
                input[4] = 5
                """
            }
        }
        then {
            assertAll(
                { assert process.success },
                { assert snapshot(process.out).match() }
            )
        }
    }
}
Loading
Loading