nf-omicstd/main_dsl1.nf at master · becavin-lab/nf-omicstd · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
#!/usr/bin/env nextflow

params.project = "EcoliRNASeq"

params.nbcpu = "8"

params.bio_cond = ["WT_1","WT_2","MazF_1","MazF_2"]
params.ftp_ebi = ["005/ERR2686025/ERR2686025" , "006/ERR2686026/ERR2686026", "007/ERR2686027/ERR2686027", "008/ERR2686028/ERR2686028"]
params.ebi_fq = "https://ftp.sra.ebi.ac.uk/vol1/fastq/ERR268/"

params.fna_ftp = "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/005/845/GCF_000005845.2_ASM584v2/GCF_000005845.2_ASM584v2_genomic.fna.gz"
params.gtf_ftp = "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/005/845/GCF_000005845.2_ASM584v2/GCF_000005845.2_ASM584v2_genomic.gtf.gz"
params.genome_name = "Ecoli_K12"


process init_folder {

    output:
    stdout

    script:
    """
    if [ ! -d ${params.rawdata} ]; then
        mkdir ${params.rawdata}
    fi
    if [ ! -d ${params.genome} ]; then
        mkdir ${params.genome}
    fi
    if [ ! -d ${params.fastq} ]; then
        mkdir ${params.fastq}
    fi
    if [ ! -d ${params.mapping} ]; then
        mkdir ${params.mapping}
    fi
    if [ ! -d ${params.count} ]; then
        mkdir ${params.count}
    fi
    """
}


/*
 * Download genome files
 * We have now to download the genome file of Escherichis coli K-12. We download first the sequence (fasta file) and then the annotation (gff file).
 */
process dlGenome {
    debug true

    input:
    stdin

    output:
    val "${params.genome}/${params.genome_name}" into dl_genome
    val "${params.genome}/${params.genome_name}.gtf" into genome_annot

    script:
    """
    fna_file=${params.genome}/${params.genome_name}.fna
    gtf_file=${params.genome}/${params.genome_name}.gtf
    if [ ! -f \$fna_file ]; then
        echo "Downloaded fasta file" ${params.fna_ftp}
        wget -c -O \${fna_file}.gz ${params.fna_ftp}
        gunzip -d \${fna_file}.gz
    fi
    if [ ! -f \$gtf_file ]; then
        echo "Downloaded fasta file" ${params.gtf_ftp}
        wget -c -O \${gtf_file}.gz ${params.gtf_ftp}
        gunzip -d \${gtf_file}.gz
    fi
    """
}


process index_genome {
    input:
    val genome_path from dl_genome

    output:
    val "${genome_path}" into genome_index
    """
    data=${genome_path}.1.bt2
    if [ ! -f \$data ]; then
        bowtie2-build --threads $params.nbcpu ${genome_path}.fna ${genome_path}
    fi
    """
}


/*
 * Download fastq from EBI
 * Test for eachf ile if the dataset was not already downloaded
 */
 process dlFastQ {
    debug true

    input:
    stdin from folderCreated
    val bio_cond from bio_conds
    val ftp_ebi from ftp_ebis

    output:
    val "${params.rawdata}/${bio_cond}_R1.fastq.gz" into data_R1
    val "${params.rawdata}/${bio_cond}_R2.fastq.gz" into data_R2

    script:
    """
    data=${params.rawdata}/${bio_cond}_R1.fastq.gz
    if [ ! -f \$data ]; then
        wget -O \$data ${params.ebi_fq}${ftp_ebi}_1.fastq.gz
    fi
    data=${params.rawdata}/${bio_cond}_R2.fastq.gz
    if [ ! -f \$data ]; then
        wget -O \$data ${params.ebi_fq}${ftp_ebi}_2.fastq.gz
    fi
    """
}


/*
 * Run cutadapt for all pair of reads
 */
process cutadapt {
    input:
    val bio_cond from Channel.fromList(params.bio_cond)
    val fastq_R1 from data_R1
    val fastq_R2 from data_R2

    output:
    val "${bio_cond}_R1" into data_cut_R1
    val "${bio_cond}_R2" into data_cut_R2
    val "${bio_cond}_R1" into rawdata_cut_R1
    val "${bio_cond}_R2" into rawdata_cut_R2

    script:
    """
    data=${params.fastq}/${bio_cond}_R1.fastq
    if [ ! -f \$data ]; then
        cutadapt -j $params.nbcpu -b $params.read1Adapt -B $params.read2Adapt -o ${params.fastq}/${bio_cond}_R1.fastq -p ${params.fastq}/${bio_cond}_R2.fastq ${fastq_R1} ${fastq_R2}
    fi
    """
}


/*
 * Run mapping of reads with bowtie2
 */
process mapping {
    input:
    val genome from genome_index
    val data_R1 from rawdata_cut_R1
    val data_R1 from rawdata_cut_R2
    val biocond from Channel.fromList(params.bio_cond)

    output:
    val biocond into sam_files

    """
    data=${params.mapping}/${biocond}.sam
    if [ ! -f \$data ]; then
        bowtie2 -p $params.nbcpu -x $genome -1 ${params.fastq}/${data_R1}.fastq -2 ${params.fastq}/${data_R1}.fastq -S ${params.mapping}/${biocond}.sam > ${params.mapping}/${biocond}.log
    fi
    """
}

/*
 * Convert sam files to bam files
 */
process sam_to_bam {

    debug true
    input:
    val sam_file from sam_files

    output:
    val "${sam_file}" into bam_files

    """
    data=${params.mapping}/${sam_file}.bam
    if [ ! -f \$data ]; then
        samtools view --threads $params.nbcpu -b -q 1 ${params.mapping}/${sam_file}.sam > ${params.mapping}/${sam_file}_raw.bam
        samtools sort --threads $params.nbcpu -o ${params.mapping}/${sam_file}.bam ${params.mapping}/${sam_file}_raw.bam
        samtools index -@ $params.nbcpu ${params.mapping}/${sam_file}.bam
        rm ${params.mapping}/${sam_file}_raw.bam
    fi
    """
}

/*
 * Count number of reads for every gene
 */
process gene_count {
    input:
    val bam_file from bam_files
    val genome from genome_annot

    output:
    stdout into mappFinish

    script:
    """
    data=${params.count}/${bam_file}.txt
    if [ ! -f \$data ]; then
        featureCounts -T $params.nbcpu -t "gene" -a ${genome} -o ${params.count}/${bam_file}.txt ${params.mapping}/${bam_file}.bam
    fi
    """
}