diff --git a/modules/nf-core/custom/clustering/environment.yml b/modules/nf-core/custom/clustering/environment.yml new file mode 100644 index 000000000000..1fda7f0e4c10 --- /dev/null +++ b/modules/nf-core/custom/clustering/environment.yml @@ -0,0 +1,11 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - conda-forge::numpy=2.4.4 + - conda-forge::pandas=3.0.3 + - conda-forge::python=3.12.13 + - conda-forge::pyyaml=6.0.3 + - conda-forge::scikit-learn=1.8.0 diff --git a/modules/nf-core/custom/clustering/main.nf b/modules/nf-core/custom/clustering/main.nf new file mode 100644 index 000000000000..e0eae4844bfe --- /dev/null +++ b/modules/nf-core/custom/clustering/main.nf @@ -0,0 +1,42 @@ +process CUSTOM_CLUSTERING { + tag "$meta.id" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ? 
+ 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/a3/a37807bdaf3edad30a2b212962b6af381bc10381a80c40efb2bb07f6ee43032f/data' : + 'community.wave.seqera.io/library/numpy_pandas_python_pyyaml_scikit-learn:c500ceb82d3d7606' }" + + input: + tuple val(meta), path(eigenvec) + val algorithm + val n_clusters + val dbscan_eps + val dbscan_min_samples + + output: + tuple val(meta), path("*.clusters.csv") , emit: clusters + tuple val(meta), path("*.clustering_info.json"), emit: info + path "versions.yml" , emit: versions, topic: versions + + when: + task.ext.when == null || task.ext.when + + script: + template 'clustering.py' + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.clusters.csv + touch ${prefix}.clustering_info.json + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python3 --version | sed 's/Python //') + numpy: \$(python3 -c "from importlib.metadata import version; print(version('numpy'))") + pandas: \$(python3 -c "from importlib.metadata import version; print(version('pandas'))") + scikit-learn: \$(python3 -c "from importlib.metadata import version; print(version('scikit-learn'))") + END_VERSIONS + """ +} diff --git a/modules/nf-core/custom/clustering/meta.yml b/modules/nf-core/custom/clustering/meta.yml new file mode 100644 index 000000000000..69419621ea5f --- /dev/null +++ b/modules/nf-core/custom/clustering/meta.yml @@ -0,0 +1,77 @@ +name: "CUSTOM_CLUSTERING" +description: "Performs KMeans or DBSCAN clustering on principal components from PLINK2 + --pca" +keywords: + - clustering + - pca + - kmeans + - dbscan + - principal-components +tools: + - "scikit-learn": + description: "Machine learning library for clustering" + homepage: "https://scikit-learn.org/" + documentation: "https://scikit-learn.org/stable/modules/clustering.html" + licence: + - "BSD-3-Clause" + identifier: "" +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
`[ id:'sample1' ]` + - eigenvec: + type: file + description: PLINK2 .eigenvec file generated by --pca + pattern: "*.eigenvec" + ontologies: [] + - algorithm: + type: string + description: Clustering algorithm to use (kmeans or dbscan) + - n_clusters: + type: integer + description: Number of clusters for KMeans + - dbscan_eps: + type: float + description: Epsilon parameter for DBSCAN + - dbscan_min_samples: + type: integer + description: Minimum samples parameter for DBSCAN +output: + clusters: + - - meta: + type: map + description: Groovy Map containing sample information + - "*.clusters.csv": + type: file + description: CSV file with sample_id and assigned cluster + pattern: "*.clusters.csv" + ontologies: + - edam: http://edamontology.org/format_3752 + info: + - - meta: + type: map + description: Groovy Map containing sample information + - "*.clustering_info.json": + type: file + description: JSON file with clustering parameters and statistics + pattern: "*.clustering_info.json" + ontologies: + - edam: http://edamontology.org/format_3464 + versions: + - "versions.yml": + type: file + description: File containing software versions + pattern: "versions.yml" + ontologies: + - edam: http://edamontology.org/format_3750 +topics: + versions: + - versions.yml: + type: string + description: The name of the process +authors: + - "@dbaku42" +maintainers: + - "@dbaku42" diff --git a/modules/nf-core/custom/clustering/templates/clustering.py b/modules/nf-core/custom/clustering/templates/clustering.py new file mode 100644 index 000000000000..a8514048aaac --- /dev/null +++ b/modules/nf-core/custom/clustering/templates/clustering.py @@ -0,0 +1,83 @@ +#!/usr/bin/env python3 + +import json +import platform + +import numpy as np +import pandas as pd +import sklearn +import yaml +from sklearn.cluster import DBSCAN, KMeans + + +def parse_eigenvec(path): + """Parse a PLINK2 .eigenvec file into (sample_ids: pd.Series, pcs: np.ndarray). 
+ + Accepts the FID/IID and IID-only header layouts PLINK2 emits, plus the + leading '#' on the header line. Sample IDs are read from the IID column. + """ + df = pd.read_csv(path, sep=r"\\s+", engine="python") + df.columns = [c.lstrip("#") for c in df.columns] + cols_upper = [c.upper() for c in df.columns] + if cols_upper[:2] == ["FID", "IID"]: + id_cols = df.columns[:2] + elif cols_upper[:1] == ["IID"]: + id_cols = df.columns[:1] + else: + raise ValueError(f"eigenvec file missing IID header: {list(df.columns)}") + sample_ids = df["IID"].astype(str) + pcs = df.drop(columns=id_cols).to_numpy(dtype=float) + return sample_ids, pcs + + +def main(): + eigenvec = "$eigenvec" + algorithm = "$algorithm" + n_clusters = int("$n_clusters") + dbscan_eps = float("$dbscan_eps") + dbscan_min_samples = int("$dbscan_min_samples") + prefix = "${task.ext.prefix ?: meta.id}" + + sample_ids, x = parse_eigenvec(eigenvec) + + if algorithm == "kmeans": + model = KMeans(n_clusters=n_clusters, init="random", n_init=100, random_state=42) + labels = model.fit_predict(x) + info = { + "algorithm": "kmeans", + "k": n_clusters, + "inertia": float(model.inertia_), + } + elif algorithm == "dbscan": + model = DBSCAN(eps=dbscan_eps, min_samples=dbscan_min_samples) + labels = model.fit_predict(x) + info = { + "algorithm": "dbscan", + "eps": dbscan_eps, + "min_samples": dbscan_min_samples, + "n_clusters_found": len(set(labels) - {-1}), + "n_noise": int(np.sum(labels == -1)), + } + else: + raise ValueError(f"Unknown algorithm '{algorithm}' (expected 'kmeans' or 'dbscan')") + + info |= {"n_samples": int(x.shape[0]), "n_features": int(x.shape[1])} + + pd.DataFrame({"sample_id": sample_ids, "cluster": labels}).to_csv(f"{prefix}.clusters.csv", index=False) + with open(f"{prefix}.clustering_info.json", "w") as fh: + json.dump(info, fh, indent=2) + + versions = { + "${task.process}": { + "python": platform.python_version(), + "pandas": pd.__version__, + "numpy": np.__version__, + "scikit-learn": 
sklearn.__version__, + } + } + with open("versions.yml", "w") as fh: + yaml.dump(versions, fh, default_flow_style=False, sort_keys=False) + + +if __name__ == "__main__": + main() diff --git a/modules/nf-core/custom/clustering/tests/data/test.eigenvec b/modules/nf-core/custom/clustering/tests/data/test.eigenvec new file mode 100644 index 000000000000..d0281ae180ce --- /dev/null +++ b/modules/nf-core/custom/clustering/tests/data/test.eigenvec @@ -0,0 +1,6 @@ +#FID IID PC1 PC2 PC3 +0 sample01 0.1234 0.5678 0.9012 +0 sample02 -0.2345 0.6789 -0.0123 +0 sample03 0.3456 -0.7890 0.1234 +0 sample04 -0.4567 0.8901 -0.2345 +0 sample05 0.5678 -0.9012 0.3456 diff --git a/modules/nf-core/custom/clustering/tests/main.nf.test b/modules/nf-core/custom/clustering/tests/main.nf.test new file mode 100644 index 000000000000..4fa0f01c2441 --- /dev/null +++ b/modules/nf-core/custom/clustering/tests/main.nf.test @@ -0,0 +1,85 @@ +nextflow_process { + name "Test Process CUSTOM_CLUSTERING" + script "../main.nf" + process "CUSTOM_CLUSTERING" + + tag "modules" + tag "modules_nfcore" + tag "custom" + tag "custom/clustering" + + test("clustering - eigenvec") { + when { + process { + """ + input[0] = [ [id:'test'], file("${projectDir}/modules/nf-core/custom/clustering/tests/data/test.eigenvec", checkIfExists: true) ] + input[1] = 'kmeans' + input[2] = 3 + input[3] = 0.5 + input[4] = 5 + """ + } + } + then { + // KMeans inertia varies by a few ULPs across CPU instruction sets + // (BLAS reduction order), so parse the JSON and round it for the + // snapshot. Production output keeps full precision. 
+ def info_data = new groovy.json.JsonSlurper().parse(file(process.out.info[0][1])) + info_data.inertia = (info_data.inertia as Double).round(4) + + assertAll( + { assert process.success }, + { assert snapshot( + process.out.clusters, + info_data, + process.out.versions + ).match() } + ) + } + } + + test("clustering - eigenvec - dbscan") { + when { + process { + """ + input[0] = [ [id:'test'], file("${projectDir}/modules/nf-core/custom/clustering/tests/data/test.eigenvec", checkIfExists: true) ] + input[1] = 'dbscan' + input[2] = 3 + input[3] = 0.5 + input[4] = 2 + """ + } + } + then { + assertAll( + { assert process.success }, + { assert snapshot( + process.out.clusters, + process.out.info, + process.out.versions + ).match() } + ) + } + } + + test("clustering - eigenvec - stub") { + options "-stub" + when { + process { + """ + input[0] = [ [id:'test'], file("${projectDir}/modules/nf-core/custom/clustering/tests/data/test.eigenvec", checkIfExists: true) ] + input[1] = 'kmeans' + input[2] = 3 + input[3] = 0.5 + input[4] = 5 + """ + } + } + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } +} diff --git a/modules/nf-core/custom/clustering/tests/main.nf.test.snap b/modules/nf-core/custom/clustering/tests/main.nf.test.snap new file mode 100644 index 000000000000..1c0c8b71f1da --- /dev/null +++ b/modules/nf-core/custom/clustering/tests/main.nf.test.snap @@ -0,0 +1,106 @@ +{ + "clustering - eigenvec - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.clusters.csv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + [ + { + "id": "test" + }, + "test.clustering_info.json:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "2": [ + "versions.yml:md5,b5973ab0aaba92ca53f2d4e2334b77f1" + ], + "clusters": [ + [ + { + "id": "test" + }, + "test.clusters.csv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "info": [ + [ + { + "id": "test" + }, + 
"test.clustering_info.json:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,b5973ab0aaba92ca53f2d4e2334b77f1" + ] + } + ], + "timestamp": "2026-05-14T12:03:40.330387209", + "meta": { + "nf-test": "0.9.5", + "nextflow": "26.04.1" + } + }, + "clustering - eigenvec": { + "content": [ + [ + [ + { + "id": "test" + }, + "test.clusters.csv:md5,a0ce7a662fecdb42e15e2b2aa0906cf4" + ] + ], + { + "algorithm": "kmeans", + "k": 3, + "inertia": 0.1273, + "n_samples": 5, + "n_features": 3 + }, + [ + "versions.yml:md5,c554e11300cca236275d37f7795f5e74" + ] + ], + "timestamp": "2026-05-14T12:03:29.111664218", + "meta": { + "nf-test": "0.9.5", + "nextflow": "26.04.1" + } + }, + "clustering - eigenvec - dbscan": { + "content": [ + [ + [ + { + "id": "test" + }, + "test.clusters.csv:md5,acdbd461fa3b9e4f69f30c9c72058172" + ] + ], + [ + [ + { + "id": "test" + }, + "test.clustering_info.json:md5,ae780a051f7a23704e62bfdbe1f07301" + ] + ], + [ + "versions.yml:md5,c554e11300cca236275d37f7795f5e74" + ] + ], + "timestamp": "2026-05-14T12:03:35.388350597", + "meta": { + "nf-test": "0.9.5", + "nextflow": "26.04.1" + } + } +} \ No newline at end of file diff --git a/modules/nf-core/custom/clustermetrics/environment.yml b/modules/nf-core/custom/clustermetrics/environment.yml new file mode 100644 index 000000000000..8d7b4bbbb690 --- /dev/null +++ b/modules/nf-core/custom/clustermetrics/environment.yml @@ -0,0 +1,12 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - conda-forge::matplotlib=3.10.9 + - conda-forge::numpy=2.4.4 + - conda-forge::pandas=3.0.3 + - conda-forge::python=3.12.13 + - conda-forge::pyyaml=6.0.3 + - conda-forge::scikit-learn=1.8.0 diff --git a/modules/nf-core/custom/clustermetrics/main.nf b/modules/nf-core/custom/clustermetrics/main.nf new file mode 100644 index 000000000000..2eb096b08f40 --- /dev/null +++ 
b/modules/nf-core/custom/clustermetrics/main.nf @@ -0,0 +1,46 @@ +process CUSTOM_CLUSTERMETRICS { + tag "$meta.id" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ? + 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/25/25129a5258522a434c386b800d3e2e3e6dc72d8a1171b7b10f21df3488526795/data' : + 'community.wave.seqera.io/library/matplotlib_numpy_pandas_python_pruned:169e228afc7d3686' }" + + input: + tuple val(meta), path(features), path(clusters) + + output: + tuple val(meta), path("*.metrics.tsv") , emit: metrics + tuple val(meta), path("*.k_sweep.csv") , emit: k_sweep + tuple val(meta), path("*.selected.json"), emit: selected + tuple val(meta), path("*.png") , emit: plots, optional: true + path "versions.yml" , emit: versions, topic: versions + + when: + task.ext.when == null || task.ext.when + + script: + template 'cluster_metrics.py' + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.metrics.tsv + touch ${prefix}.k_sweep.csv + touch ${prefix}.selected.json + touch ${prefix}.elbow.png + touch ${prefix}.silhouette.png + touch ${prefix}.davies_bouldin.png + touch ${prefix}.calinski_harabasz.png + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python3 --version | sed 's/Python //') + matplotlib: \$(python3 -c "from importlib.metadata import version; print(version('matplotlib'))") + numpy: \$(python3 -c "from importlib.metadata import version; print(version('numpy'))") + pandas: \$(python3 -c "from importlib.metadata import version; print(version('pandas'))") + scikit-learn: \$(python3 -c "from importlib.metadata import version; print(version('scikit-learn'))") + END_VERSIONS + """ +} diff --git a/modules/nf-core/custom/clustermetrics/meta.yml b/modules/nf-core/custom/clustermetrics/meta.yml new file mode 100644 index 000000000000..e364f55b6f75 --- /dev/null 
+++ b/modules/nf-core/custom/clustermetrics/meta.yml @@ -0,0 +1,96 @@ +name: "CUSTOM_CLUSTERMETRICS" +description: "Computes clustering quality metrics (silhouette, Calinski-Harabasz, + Davies-Bouldin) and performs k-sweep analysis" +keywords: + - clustering + - metrics + - silhouette + - calinski-harabasz + - davies-bouldin + - evaluation +tools: + - "scikit-learn": + description: "Machine learning library for clustering metrics" + homepage: "https://scikit-learn.org/" + documentation: "https://scikit-learn.org/stable/modules/clustering.html" + licence: + - "BSD-3-Clause" + identifier: "" +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1' ]` + - features: + type: file + description: | + Tab-separated feature matrix with a `sample_id` column and one + column per numeric feature (e.g. PCA scores). + pattern: "*.tsv" + ontologies: + - edam: http://edamontology.org/format_3475 + - clusters: + type: file + description: | + Comma-separated cluster assignments with `sample_id` and integer + `cluster` columns. Label -1 is treated as DBSCAN noise. 
+ pattern: "*.csv" + ontologies: + - edam: http://edamontology.org/format_3752 +output: + metrics: + - - meta: + type: map + description: Groovy Map containing sample information + - "*.metrics.tsv": + type: file + description: TSV with selected cluster quality metrics + pattern: "*.metrics.tsv" + ontologies: + - edam: http://edamontology.org/format_3475 + k_sweep: + - - meta: + type: map + description: Groovy Map containing sample information + - "*.k_sweep.csv": + type: file + description: CSV with metrics for different values of k + pattern: "*.k_sweep.csv" + ontologies: + - edam: http://edamontology.org/format_3752 + selected: + - - meta: + type: map + description: Groovy Map containing sample information + - "*.selected.json": + type: file + description: JSON with the selected/best metrics + pattern: "*.selected.json" + ontologies: + - edam: http://edamontology.org/format_3464 + plots: + - - meta: + type: map + description: Groovy Map containing sample information + - "*.png": + type: file + description: Optional PNG plots (elbow, silhouette, etc.) 
+ pattern: "*.png" + ontologies: [] + versions: + - "versions.yml": + type: file + description: File containing software versions + pattern: "versions.yml" + ontologies: + - edam: http://edamontology.org/format_3750 +topics: + versions: + - versions.yml: + type: string + description: The name of the process +authors: + - "@dbaku42" +maintainers: + - "@dbaku42" diff --git a/modules/nf-core/custom/clustermetrics/templates/cluster_metrics.py b/modules/nf-core/custom/clustermetrics/templates/cluster_metrics.py new file mode 100644 index 000000000000..ce00f5796ab0 --- /dev/null +++ b/modules/nf-core/custom/clustermetrics/templates/cluster_metrics.py @@ -0,0 +1,139 @@ +#!/usr/bin/env python3 + +import argparse +import json +import platform +import shlex + +import matplotlib + +matplotlib.use("Agg") +import matplotlib.pyplot as plt +import pandas as pd +import sklearn +import yaml +from sklearn.cluster import KMeans +from sklearn.metrics import ( + calinski_harabasz_score, + davies_bouldin_score, + silhouette_score, +) + + +def load_features(path): + """Read a TSV of `sample_id` + numeric feature columns, indexed by sample_id.""" + df = pd.read_csv(path, sep="\\t") + if "sample_id" not in df.columns: + raise ValueError(f"features file must have a 'sample_id' column. Found: {list(df.columns)}") + df["sample_id"] = df["sample_id"].astype(str) + return df.set_index("sample_id").apply(pd.to_numeric, errors="coerce").fillna(0.0) + + +def load_clusters(path): + """Read a CSV of `sample_id` + `cluster`, returning a Series of int labels.""" + df = pd.read_csv(path) + if "sample_id" not in df.columns or "cluster" not in df.columns: + raise ValueError(f"clusters file must have 'sample_id' and 'cluster' columns. Found: {list(df.columns)}") + df["sample_id"] = df["sample_id"].astype(str) + return df.set_index("sample_id")["cluster"].astype(int) + + +def cluster_quality(x, labels): + """Silhouette / Calinski-Harabasz / Davies-Bouldin for given (x, labels). 
+ + Treats label -1 as DBSCAN noise and excludes those points. Returns None + for each score when fewer than 2 clusters remain or every retained point is its own singleton cluster (n_clusters == n_points). + """ + mask = labels != -1 + x, labels = x[mask], labels[mask] + n = len(set(labels)) + valid = 2 <= n < len(x) + return { + "silhouette": float(silhouette_score(x, labels)) if valid else None, + "calinski_harabasz": float(calinski_harabasz_score(x, labels)) if valid else None, + "davies_bouldin": float(davies_bouldin_score(x, labels)) if valid else None, + } + + + def plot_curve(sweep_df, metric, title, ylabel, out_png): + plt.figure(figsize=(7, 4.5)) + vals = sweep_df[metric].dropna() + ks = sweep_df.loc[vals.index, "k"] + plt.plot(ks, vals, marker="o") + plt.xticks(sweep_df["k"].tolist()) + plt.title(title) + plt.xlabel("k") + plt.ylabel(ylabel) + plt.tight_layout() + plt.savefig(out_png, dpi=200) + plt.close() + + + def main(): + features = "$features" + clusters_path = "$clusters" + prefix = "${task.ext.prefix ?: meta.id}" + + # Optional configuration via task.ext.args (nf-core convention). + raw_args = "$task.ext.args" + parser = argparse.ArgumentParser() + parser.add_argument("--k-min", type=int, default=2) + parser.add_argument("--k-max", type=int, default=12) + opts = parser.parse_args(shlex.split(raw_args) if raw_args and raw_args != "null" else []) + + joined = load_features(features).join(load_clusters(clusters_path), how="inner") + if len(joined) < 2: + raise ValueError(f"Need at least 2 samples with matching sample_id in both inputs. Got {len(joined)}.") + + labels = joined["cluster"].values + x = joined.drop(columns=["cluster"]).to_numpy(dtype=float) + + # Quality metrics on the supplied labels. + selected = {"n_clusters": len(set(labels) - {-1}), **cluster_quality(x, labels)} + pd.DataFrame([selected]).to_csv(f"{prefix}.metrics.tsv", sep="\\t", index=False) + with open(f"{prefix}.selected.json", "w") as fh: + json.dump(selected, fh, indent=2) + + # KMeans k-sweep for downstream comparison.
+ rows = [] + for k in range(opts.k_min, min(opts.k_max, len(x)) + 1): + model = KMeans(n_clusters=k, n_init=10, random_state=42).fit(x) + rows.append({"k": k, "inertia": float(model.inertia_), **cluster_quality(x, model.labels_)}) + + sweep_df = pd.DataFrame(rows) + sweep_df.to_csv(f"{prefix}.k_sweep.csv", index=False, float_format="%.10g") + + if not sweep_df.empty: + plot_curve(sweep_df, "inertia", "Elbow method (KMeans inertia)", "inertia", f"{prefix}.elbow.png") + plot_curve( + sweep_df, "silhouette", "Silhouette score (higher is better)", "silhouette", f"{prefix}.silhouette.png" + ) + plot_curve( + sweep_df, + "davies_bouldin", + "Davies-Bouldin index (lower is better)", + "davies_bouldin", + f"{prefix}.davies_bouldin.png", + ) + plot_curve( + sweep_df, + "calinski_harabasz", + "Calinski-Harabasz index (higher is better)", + "calinski_harabasz", + f"{prefix}.calinski_harabasz.png", + ) + + versions = { + "${task.process}": { + "python": platform.python_version(), + "pandas": pd.__version__, + "scikit-learn": sklearn.__version__, + "matplotlib": matplotlib.__version__, + } + } + with open("versions.yml", "w") as fh: + yaml.dump(versions, fh, default_flow_style=False, sort_keys=False) + + +if __name__ == "__main__": + main() diff --git a/modules/nf-core/custom/clustermetrics/tests/data/test_clusters.csv b/modules/nf-core/custom/clustermetrics/tests/data/test_clusters.csv new file mode 100644 index 000000000000..1258849b8fbe --- /dev/null +++ b/modules/nf-core/custom/clustermetrics/tests/data/test_clusters.csv @@ -0,0 +1,6 @@ +sample_id,cluster +sample01,0 +sample02,2 +sample03,1 +sample04,2 +sample05,1 diff --git a/modules/nf-core/custom/clustermetrics/tests/data/test_features.tsv b/modules/nf-core/custom/clustermetrics/tests/data/test_features.tsv new file mode 100644 index 000000000000..033d23b82df8 --- /dev/null +++ b/modules/nf-core/custom/clustermetrics/tests/data/test_features.tsv @@ -0,0 +1,6 @@ +sample_id PC1 PC2 PC3 +sample01 0.1234 0.5678 0.9012 
+sample02 -0.2345 0.6789 -0.0123 +sample03 0.3456 -0.7890 0.1234 +sample04 -0.4567 0.8901 -0.2345 +sample05 0.5678 -0.9012 0.3456 diff --git a/modules/nf-core/custom/clustermetrics/tests/main.nf.test b/modules/nf-core/custom/clustermetrics/tests/main.nf.test new file mode 100644 index 000000000000..f12915450448 --- /dev/null +++ b/modules/nf-core/custom/clustermetrics/tests/main.nf.test @@ -0,0 +1,70 @@ +nextflow_process { + + name "Test Process CUSTOM_CLUSTERMETRICS" + script "../main.nf" + process "CUSTOM_CLUSTERMETRICS" + + tag "modules" + tag "modules_nfcore" + tag "custom" + tag "custom/clustermetrics" + + test("clustermetrics - features and clusters") { + + when { + process { + """ + input[0] = [ + [ id:'test' ], + file("${projectDir}/modules/nf-core/custom/clustermetrics/tests/data/test_features.tsv", checkIfExists: true), + file("${projectDir}/modules/nf-core/custom/clustermetrics/tests/data/test_clusters.csv", checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + process.out.metrics, + process.out.k_sweep, + process.out.selected, + process.out.versions, + path(process.out.versions[0]).yaml + ).match() } + ) + } + } + + test("clustermetrics - features and clusters - stub") { + + options "-stub" + + when { + process { + """ + input[0] = [ + [ id:'test' ], + file("${projectDir}/modules/nf-core/custom/clustermetrics/tests/data/test_features.tsv", checkIfExists: true), + file("${projectDir}/modules/nf-core/custom/clustermetrics/tests/data/test_clusters.csv", checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + process.out.metrics, + process.out.k_sweep, + process.out.selected, + process.out.plots, + process.out.versions, + path(process.out.versions[0]).yaml + ).match() } + ) + } + } +} diff --git a/modules/nf-core/custom/clustermetrics/tests/main.nf.test.snap b/modules/nf-core/custom/clustermetrics/tests/main.nf.test.snap new file mode 
100644 index 000000000000..7123d5750cda --- /dev/null +++ b/modules/nf-core/custom/clustermetrics/tests/main.nf.test.snap @@ -0,0 +1,104 @@ +{ + "clustermetrics - features and clusters": { + "content": [ + [ + [ + { + "id": "test" + }, + "test.metrics.tsv:md5,da21b5c4d100d3e2dd7b4dcdcf807e64" + ] + ], + [ + [ + { + "id": "test" + }, + "test.k_sweep.csv:md5,7cae1edd10fb924fc81b976aca423ec8" + ] + ], + [ + [ + { + "id": "test" + }, + "test.selected.json:md5,9b3862e30875dece4a91bdcd210a6ca1" + ] + ], + [ + "versions.yml:md5,602aa5dfe6c0b807d758c4f5cf3fc5e4" + ], + { + "CUSTOM_CLUSTERMETRICS": { + "python": "3.12.13", + "pandas": "3.0.3", + "scikit-learn": "1.8.0", + "matplotlib": "3.10.9" + } + } + ], + "timestamp": "2026-05-14T12:03:47.881833797", + "meta": { + "nf-test": "0.9.5", + "nextflow": "26.04.1" + } + }, + "clustermetrics - features and clusters - stub": { + "content": [ + [ + [ + { + "id": "test" + }, + "test.metrics.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + [ + [ + { + "id": "test" + }, + "test.k_sweep.csv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + [ + [ + { + "id": "test" + }, + "test.selected.json:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + [ + [ + { + "id": "test" + }, + [ + "test.calinski_harabasz.png:md5,d41d8cd98f00b204e9800998ecf8427e", + "test.davies_bouldin.png:md5,d41d8cd98f00b204e9800998ecf8427e", + "test.elbow.png:md5,d41d8cd98f00b204e9800998ecf8427e", + "test.silhouette.png:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + [ + "versions.yml:md5,90240f0a22302d1727455b6be6e96abb" + ], + { + "CUSTOM_CLUSTERMETRICS": { + "python": "3.12.13", + "matplotlib": "3.10.9", + "numpy": "2.4.4", + "pandas": "3.0.3", + "scikit-learn": "1.8.0" + } + } + ], + "timestamp": "2026-05-14T12:03:52.822149059", + "meta": { + "nf-test": "0.9.5", + "nextflow": "26.04.1" + } + } +} \ No newline at end of file diff --git a/modules/nf-core/custom/clustervisualization/environment.yml b/modules/nf-core/custom/clustervisualization/environment.yml 
new file mode 100644 index 000000000000..befadae8f312 --- /dev/null +++ b/modules/nf-core/custom/clustervisualization/environment.yml @@ -0,0 +1,14 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - conda-forge::matplotlib=3.10.9 + - conda-forge::numpy=2.4.4 + - conda-forge::pandas=3.0.3 + - conda-forge::python=3.12.13 + - conda-forge::pyyaml=6.0.3 + - conda-forge::scikit-learn=1.8.0 + - conda-forge::seaborn=0.13.2 + - conda-forge::umap-learn=0.5.12 diff --git a/modules/nf-core/custom/clustervisualization/main.nf b/modules/nf-core/custom/clustervisualization/main.nf new file mode 100644 index 000000000000..393c5345c636 --- /dev/null +++ b/modules/nf-core/custom/clustervisualization/main.nf @@ -0,0 +1,45 @@ +process CUSTOM_CLUSTERVISUALIZATION { + tag "$meta.id" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ? 
+ 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/64/64297e13d9d4f05ce543656e943a023735e7cb252d7534ccc9134d8b40423083/data' : + 'community.wave.seqera.io/library/matplotlib_numpy_pandas_python_pruned:826e4ab1361ff931' }" + + input: + tuple val(meta), path(features), path(clusters) + + output: + tuple val(meta), path("*.umap.tsv"), emit: umap_tsv + tuple val(meta), path("*.tsne.tsv"), emit: tsne_tsv + tuple val(meta), path("*.umap.png"), emit: umap_png, optional: true + tuple val(meta), path("*.tsne.png"), emit: tsne_png, optional: true + path "versions.yml" , emit: versions, topic: versions + + when: + task.ext.when == null || task.ext.when + + script: + template 'cluster_viz.py' + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.umap.tsv + touch ${prefix}.tsne.tsv + touch ${prefix}.umap.png + touch ${prefix}.tsne.png + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python3 --version | sed 's/Python //') + matplotlib: \$(python3 -c "from importlib.metadata import version; print(version('matplotlib'))") + numpy: \$(python3 -c "from importlib.metadata import version; print(version('numpy'))") + pandas: \$(python3 -c "from importlib.metadata import version; print(version('pandas'))") + scikit-learn: \$(python3 -c "from importlib.metadata import version; print(version('scikit-learn'))") + seaborn: \$(python3 -c "from importlib.metadata import version; print(version('seaborn'))") + umap-learn: \$(python3 -c "from importlib.metadata import version; print(version('umap-learn'))") + END_VERSIONS + """ +} diff --git a/modules/nf-core/custom/clustervisualization/meta.yml b/modules/nf-core/custom/clustervisualization/meta.yml new file mode 100644 index 000000000000..62bed07814e5 --- /dev/null +++ b/modules/nf-core/custom/clustervisualization/meta.yml @@ -0,0 +1,101 @@ +name: "CUSTOM_CLUSTERVISUALIZATION" +description: "Generates UMAP and t-SNE visualizations colored by cluster" +keywords: + - 
clustering + - visualization + - pca + - umap + - tsne + - dimension-reduction +tools: + - scikit-learn: + description: "Machine learning library for dimension reduction (PCA, t-SNE)" + homepage: "https://scikit-learn.org/" + documentation: "https://scikit-learn.org/stable/modules/clustering.html" + licence: + - "BSD-3-Clause" + identifier: "" + - umap-learn: + description: "Uniform Manifold Approximation and Projection for dimension reduction" + homepage: "https://umap-learn.readthedocs.io/" + documentation: "https://umap-learn.readthedocs.io/en/latest/" + licence: + - "BSD-3-Clause" + identifier: "" +input: + - - meta: + type: map + description: "Groovy Map containing sample information" + - features: + type: file + description: | + Tab-separated feature matrix with a `sample_id` column and one + column per numeric feature (e.g. PCA scores). + pattern: "*.tsv" + ontologies: + - edam: http://edamontology.org/format_3475 + - clusters: + type: file + description: | + Comma-separated cluster assignments with `sample_id` and integer + `cluster` columns. Label -1 is treated as DBSCAN noise. 
+ pattern: "*.csv" + ontologies: + - edam: http://edamontology.org/format_3752 +output: + umap_tsv: + - - meta: + type: map + description: "Groovy Map containing sample information" + - "*.umap.tsv": + type: file + description: "UMAP coordinates per sample" + pattern: "*.umap.tsv" + ontologies: + - edam: "http://edamontology.org/operation_2432" + - edam: http://edamontology.org/format_3475 + tsne_tsv: + - - meta: + type: map + description: "Groovy Map containing sample information" + - "*.tsne.tsv": + type: file + description: "t-SNE coordinates per sample" + pattern: "*.tsne.tsv" + ontologies: + - edam: "http://edamontology.org/operation_2432" + - edam: http://edamontology.org/format_3475 + umap_png: + - - meta: + type: map + description: "Groovy Map containing sample information" + - "*.umap.png": + type: file + description: "UMAP visualization coloured by cluster" + pattern: "*.umap.png" + ontologies: [] + tsne_png: + - - meta: + type: map + description: "Groovy Map containing sample information" + - "*.tsne.png": + type: file + description: "t-SNE visualization coloured by cluster" + pattern: "*.tsne.png" + ontologies: [] + versions: + - versions.yml: + type: file + description: "Software versions used in the module" + pattern: "versions.yml" + ontologies: + - edam: http://edamontology.org/format_3750 +topics: + versions: + - versions.yml: + type: string + description: The name of the process +authors: + - "@dbaku42" +maintainers: + - "@dbaku42" diff --git a/modules/nf-core/custom/clustervisualization/templates/cluster_viz.py b/modules/nf-core/custom/clustervisualization/templates/cluster_viz.py new file mode 100644 index 000000000000..55c1a7366112 --- /dev/null +++ b/modules/nf-core/custom/clustervisualization/templates/cluster_viz.py @@ -0,0 +1,125 @@ +#!/usr/bin/env python3 + +import argparse +import os +import shlex + +# numba (UMAP) and matplotlib write caches; redirect to /tmp so the script works +# inside read-only container filesystems. 
+os.environ.setdefault("NUMBA_CACHE_DIR", "/tmp") +os.environ.setdefault("MPLCONFIGDIR", "/tmp") + +import platform + +import matplotlib + +matplotlib.use("Agg") +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +import seaborn as sns +import sklearn +import umap +import yaml +from sklearn.manifold import TSNE + + +def load_features(path): + """Read a TSV of `sample_id` + numeric feature columns, indexed by sample_id.""" + df = pd.read_csv(path, sep="\\t") + if "sample_id" not in df.columns: + raise ValueError(f"features file must have a 'sample_id' column. Found: {list(df.columns)}") + df["sample_id"] = df["sample_id"].astype(str) + return df.set_index("sample_id").apply(pd.to_numeric, errors="coerce").fillna(0.0) + + +def load_clusters(path): + """Read a CSV of `sample_id` + `cluster`, returning a Series of int labels.""" + df = pd.read_csv(path) + if "sample_id" not in df.columns or "cluster" not in df.columns: + raise ValueError(f"clusters file must have 'sample_id' and 'cluster' columns. Found: {list(df.columns)}") + df["sample_id"] = df["sample_id"].astype(str) + return df.set_index("sample_id")["cluster"].astype(int) + + +def embed(x, method, umap_neighbors, tsne_perplexity): + """Project x to 2D using UMAP or t-SNE. + + n_neighbors / perplexity are clamped against sample count so tiny test + inputs (where the user-specified defaults exceed n_samples) still run. 
+ """ + n = len(x) + if method == "umap": + reducer = umap.UMAP(n_components=2, n_neighbors=min(umap_neighbors, max(2, n - 1)), random_state=42) + elif method == "tsne": + reducer = TSNE(n_components=2, perplexity=min(tsne_perplexity, max(2, n - 1)), random_state=42) + else: + raise ValueError(f"Unknown method '{method}' (expected 'umap' or 'tsne')") + return reducer.fit_transform(x) + + +def plot_embedding(emb, labels, method, out_png): + plt.figure(figsize=(8, 6)) + palette = sns.color_palette("tab10", n_colors=max(1, len(np.unique(labels)))) + sns.scatterplot( + x=emb[:, 0], + y=emb[:, 1], + hue=labels.astype(str), + palette=palette, + alpha=0.8, + s=60, + edgecolor="k", + linewidth=0.3, + ) + plt.title(f"{method.upper()} projection colored by cluster") + plt.xlabel(f"{method.upper()} 1") + plt.ylabel(f"{method.upper()} 2") + plt.legend(title="Cluster", bbox_to_anchor=(1.05, 1), loc="upper left") + plt.tight_layout() + plt.savefig(out_png, dpi=200, bbox_inches="tight") + plt.close() + + +def main(): + features = "$features" + clusters_path = "$clusters" + prefix = "${task.ext.prefix ?: meta.id}" + + # Optional configuration via task.ext.args (nf-core convention). + raw_args = "$task.ext.args" + parser = argparse.ArgumentParser() + parser.add_argument("--umap-neighbors", type=int, default=15) + parser.add_argument("--tsne-perplexity", type=int, default=30) + opts = parser.parse_args(shlex.split(raw_args) if raw_args and raw_args != "null" else []) + + joined = load_features(features).join(load_clusters(clusters_path), how="inner") + if len(joined) < 2: + raise ValueError(f"Need at least 2 samples with matching sample_id in both inputs. 
Got {len(joined)}.") + + labels = joined["cluster"].values + x = joined.drop(columns=["cluster"]).to_numpy(dtype=float) + sample_ids = joined.index.to_numpy() + + for method in ("umap", "tsne"): + emb = embed(x, method, opts.umap_neighbors, opts.tsne_perplexity) + pd.DataFrame({"sample_id": sample_ids, "Dim1": emb[:, 0], "Dim2": emb[:, 1], "cluster": labels}).to_csv( + f"{prefix}.{method}.tsv", sep="\\t", index=False + ) + plot_embedding(emb, labels, method, f"{prefix}.{method}.png") + + versions = { + "${task.process}": { + "python": platform.python_version(), + "pandas": pd.__version__, + "matplotlib": matplotlib.__version__, + "seaborn": sns.__version__, + "umap-learn": umap.__version__, + "scikit-learn": sklearn.__version__, + } + } + with open("versions.yml", "w") as fh: + yaml.dump(versions, fh, default_flow_style=False, sort_keys=False) + + +if __name__ == "__main__": + main() diff --git a/modules/nf-core/custom/clustervisualization/tests/data/test_clusters.csv b/modules/nf-core/custom/clustervisualization/tests/data/test_clusters.csv new file mode 100644 index 000000000000..1258849b8fbe --- /dev/null +++ b/modules/nf-core/custom/clustervisualization/tests/data/test_clusters.csv @@ -0,0 +1,6 @@ +sample_id,cluster +sample01,0 +sample02,2 +sample03,1 +sample04,2 +sample05,1 diff --git a/modules/nf-core/custom/clustervisualization/tests/data/test_features.tsv b/modules/nf-core/custom/clustervisualization/tests/data/test_features.tsv new file mode 100644 index 000000000000..033d23b82df8 --- /dev/null +++ b/modules/nf-core/custom/clustervisualization/tests/data/test_features.tsv @@ -0,0 +1,6 @@ +sample_id PC1 PC2 PC3 +sample01 0.1234 0.5678 0.9012 +sample02 -0.2345 0.6789 -0.0123 +sample03 0.3456 -0.7890 0.1234 +sample04 -0.4567 0.8901 -0.2345 +sample05 0.5678 -0.9012 0.3456 diff --git a/modules/nf-core/custom/clustervisualization/tests/main.nf.test b/modules/nf-core/custom/clustervisualization/tests/main.nf.test new file mode 100644 index 
000000000000..b539835b90b9 --- /dev/null +++ b/modules/nf-core/custom/clustervisualization/tests/main.nf.test @@ -0,0 +1,69 @@ +nextflow_process { + + name "Test Process CUSTOM_CLUSTERVISUALIZATION" + script "../main.nf" + process "CUSTOM_CLUSTERVISUALIZATION" + + tag "modules" + tag "modules_nfcore" + tag "custom" + tag "custom/clustervisualization" + + test("clustervisualization - features and clusters") { + + when { + process { + """ + input[0] = [ + [ id:'test' ], + file("${projectDir}/modules/nf-core/custom/clustervisualization/tests/data/test_features.tsv", checkIfExists: true), + file("${projectDir}/modules/nf-core/custom/clustervisualization/tests/data/test_clusters.csv", checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + process.out.umap_tsv, + process.out.tsne_tsv, + process.out.versions, + path(process.out.versions[0]).yaml + ).match() } + ) + } + } + + test("clustervisualization - features and clusters - stub") { + + options "-stub" + + when { + process { + """ + input[0] = [ + [ id:'test' ], + file("${projectDir}/modules/nf-core/custom/clustervisualization/tests/data/test_features.tsv", checkIfExists: true), + file("${projectDir}/modules/nf-core/custom/clustervisualization/tests/data/test_clusters.csv", checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + process.out.umap_tsv, + process.out.tsne_tsv, + process.out.umap_png, + process.out.tsne_png, + process.out.versions, + path(process.out.versions[0]).yaml + ).match() } + ) + } + } +} diff --git a/modules/nf-core/custom/clustervisualization/tests/main.nf.test.snap b/modules/nf-core/custom/clustervisualization/tests/main.nf.test.snap new file mode 100644 index 000000000000..f5e516e7b634 --- /dev/null +++ b/modules/nf-core/custom/clustervisualization/tests/main.nf.test.snap @@ -0,0 +1,95 @@ +{ + "clustervisualization - features and clusters - stub": { + "content": [ + [ + [ + { 
+ "id": "test" + }, + "test.umap.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + [ + [ + { + "id": "test" + }, + "test.tsne.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + [ + [ + { + "id": "test" + }, + "test.umap.png:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + [ + [ + { + "id": "test" + }, + "test.tsne.png:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + [ + "versions.yml:md5,76a5042b303be74c24dee681187d6a1f" + ], + { + "CUSTOM_CLUSTERVISUALIZATION": { + "python": "3.12.13", + "matplotlib": "3.10.9", + "numpy": "2.4.4", + "pandas": "3.0.3", + "scikit-learn": "1.8.0", + "seaborn": "0.13.2", + "umap-learn": "0.5.12" + } + } + ], + "timestamp": "2026-05-14T10:35:02.296231051", + "meta": { + "nf-test": "0.9.5", + "nextflow": "26.04.1" + } + }, + "clustervisualization - features and clusters": { + "content": [ + [ + [ + { + "id": "test" + }, + "test.umap.tsv:md5,bb67bbb071fba95e552527c9638bbc3d" + ] + ], + [ + [ + { + "id": "test" + }, + "test.tsne.tsv:md5,6096d8298c8a9762b77fa76a21fc3b91" + ] + ], + [ + "versions.yml:md5,a00bfbb9b1b4145177ec0e8a7406caf9" + ], + { + "CUSTOM_CLUSTERVISUALIZATION": { + "python": "3.12.13", + "pandas": "3.0.3", + "matplotlib": "3.10.9", + "seaborn": "0.13.2", + "umap-learn": "0.5.12", + "scikit-learn": "1.8.0" + } + } + ], + "timestamp": "2026-05-14T10:34:45.463800675", + "meta": { + "nf-test": "0.9.5", + "nextflow": "26.04.1" + } + } +} \ No newline at end of file