McIntyre-Lab · jlboat · Jun 27, 2021
diff --git a/bash/run_ase_align_and_count_testData.bash b/bash/run_ase_align_and_count_testData.bash
@@ -0,0 +1,220 @@
+#!/usr/bin/env bash
+
+## Set / Create Directories and Variables
+#Everything is relative to current directory
+SCRIPTS=hpc/ase_scripts
+REF=example_out/reference/updated_genomes_vcfConsensus
+ORIG=example_in/reads
+
+BEDFILE=example_in/reformatted_BASE_testdata_bedfile.bed
+
+## Create output directory for alignments to updated genome
+OUTALN=example_out/aln_upd_genome
+    if [ ! -e $OUTALN ]; then mkdir -p $OUTALN; fi
+PSAM=example_out/aln_upd_genome_bwa_parse
+    if [ ! -e $PSAM ]; then mkdir -p $PSAM; fi
+
+## Create output directory for ase counts from sam compare
+SAMC=example_out/ase_counts_updated
+    if [ ! -e $SAMC ]; then mkdir -p $SAMC; fi
+
+## Create output directory for upd aln summaries and checks
+CHKALN=example_out/check_aln
+    if [ ! -e $CHKALN ]; then mkdir -p $CHKALN; fi
+
+## Create output directory for Sam Compare check
+CHKSC=example_out/check_samcomp
+    if [ ! -e $CHKSC ]; then mkdir -p $CHKSC; fi
+
+# G1,G2,sampleID,fqName,fqExtension,readLength,techRep
+DESIGN_FILE=example_in/df_BASE_galaxy_W55_noEmptyFq.csv
+DESIGN=$(sed -n "${SLURM_ARRAY_TASK_ID}p" $DESIGN_FILE)
+IFS=',' read -ra ARRAY <<< "$DESIGN"
+
+G1=${ARRAY[0]}
+G2=${ARRAY[1]}
+READLEN=${ARRAY[5]}
+FQ=${ARRAY[3]}
+EXT=${ARRAY[4]}
+
+## Create temp dir
+ROZ=roz_${FQ}
+    mkdir -p ${ROZ}
+
+## set READ and calculate ave readlength
+READ=${ORIG}/${FQ}${EXT}
+echo " read is $READ 
+"
+
+AVE_RL=$(awk '{if(NR%4==2 && length) {count++; bases += length} } END {print bases/count}' ${READ} | awk '{ printf "%.0f\\n", $1 }')
+
+######  Create modified BED file to use in SAM compare - this bed has features 1st with start and end positions in genome
+## use awk to reorder columms in bed file
+SBED=example_out/snp_feature_first.bed
+
+echo "Reformatting $BEDFILE"
+awk -v OFS='\t' '{print $4,$2,$3,$1}' $BEDFILE > $SBED
+
+
+###### (1) Align Reads to Updated Genomes - first to G2 ref then to G1 reference - and Parse sam file
+FQLINEFN=$(wc -l $READ)
+FQLINE=$(echo $FQLINEFN | cut -d" " -f1)
+NUMREAD=$(( FQLINE / 4 )) 
+FN=$(echo $FQLINEFN | cut -d" " -f2)
+
+## count number of starting reads - same for G1 and G2 refs
+echo $NUMREAD | awk -v fq=${FQ} -v gq=pre_aln_read_count '{print "filename" "," gq "\n" fq "," $0}' > $CHKALN/pre_aln_reads_${FQ}.csv
+
+
+
+for FOO in G1 G2
+do
+    if [[ $FOO == 'G2' ]]
+    then
+        BREF=$REF/${G2}_snp_upd_genome_BWA
+
+        echo "Aligning G2 to reference $BREF"
+		bwa mem -t 8 -M $BREF $READ > $OUTALN/${G2}_${FQ}_upd.sam
+
+        echo -e "Start BWASplitSam on: $OUTALN/${G2}_${FQ}_upd.sam"
+        $SCRIPTS/BWASplitSAM_07mai.py -s $OUTALN/${G2}_${FQ}_upd.sam --outdir $ROZ -fq1 $READ
+
+        ## cat together mapped and opposite
+        cat $ROZ/${G2}_${FQ}_upd_mapped.sam $ROZ/${G2}_${FQ}_upd_oposite.sam > $PSAM/${G2}_${FQ}_upd_uniq.sam
+
+    elif [[ $FOO == 'G1' ]]
+    then
+        BREF=$REF/${G1}_snp_upd_genome_BWA
+
+        echo "Aligning G1 to reference $BREF"
+        bwa mem -t 8 -M $BREF $READ > $OUTALN/${G1}_${FQ}_upd.sam
+
+        echo -e "Start BWASplitSam on: $OUTALN/${G1}_${FQ}_upd.sam"
+        $SCRIPTS/BWASplitSAM_07mai.py -s $OUTALN/${G1}_${FQ}_upd.sam --outdir $ROZ -fq1 $READ
+
+        ## cat together mapped and opposite
+        cat $ROZ/${G1}_${FQ}_upd_mapped.sam $ROZ/${G1}_${FQ}_upd_oposite.sam > $PSAM/${G1}_${FQ}_upd_uniq.sam
+    fi
+done
+
+### move alignment summary csv files
+mv ${ROZ}/${G1}_${FQ}_upd_summary.csv ${CHKALN}/${G1}_${FQ}_upd_summary.csv
+mv ${ROZ}/${G2}_${FQ}_upd_summary.csv ${CHKALN}/${G2}_${FQ}_upd_summary.csv
+
+### for every FQ file run, should have 2 sam files (NOTE default is SE)
+echo "
+    checking for 2 sam files"
+
+python $SCRIPTS/check_sam_present_04amm.py \
+    -fq $FQ \
+    -alnType SE \
+    -samPath $PSAM \
+    -G1 $G1 \
+    -G2 $G2 \
+    -o $CHKALN/check_2_sam_files_${FQ}.txt
+
+## run python script to count reads into aln and in each SAM file
+echo "
+    checking for missing reads in sam files"
+
+python $SCRIPTS/check_for_lost_reads_05amm.py \
+    -a1 $CHKALN/${G1}_${FQ}_upd_summary.csv \
+    -a2 $CHKALN/${G2}_${FQ}_upd_summary.csv \
+    -numread $CHKALN/pre_aln_reads_${FQ}.csv \
+    -fq $FQ \
+    -o $CHKALN/check_start_reads_vs_aln_reads_${FQ}.csv
+
+
+## Insert bedtools intersect script + any checks  (reads in aln sam output)
+###### (2) Bedtools Intersect:   Here we will call the shell script to reformat the sam file so that the have feature names instead of CHR names
+## In parsed SAM, 
+
+for SAMFILE in $PSAM/*_${FQ}_upd_uniq.sam
+do
+    MYSAMFILE2=$(basename $SAMFILE)
+
+    AWKTMP=$PSAM/${MYSAMFILE2/_uniq.sam/_uniq_AWK.txt}
+    NEWSAM=$PSAM/${MYSAMFILE2/_uniq.sam/_uniq_FEATURE.sam}    
+
+    #Create a bed file to write the  starting position of every read and an end postion (end = start + readlength)
+    awk -v readLen=${READLEN} -v OFS='\t' '{print $3,$4,$4+readLen}' $SAMFILE > $AWKTMP
+
+    BED4=${PSAM}/${MYSAMFILE2/_uniq.sam/_uniq_int_all.bed}
+    BED3=$PSAM/${MYSAMFILE2/_uniq.sam/_uniq_int.bed}
+    SUM=${PSAM}/${MYSAMFILE2/_uniq.sam/_drop_summary.txt}
+
+    #Run bedtools intersect with -loj between the reads and the features. 
+    #We will have one result for each region 
+    # pipe to awk to remove rows where a read does NOT overlap with a feature
+    bedtools intersect -a $AWKTMP -b $SBED -loj > ${BED4}
+
+    awk -v OFS='\t' '$4 !="."' ${BED4} > $BED3
+
+    ## create file with counts for before and after dropping
+    awk -v a=0 -v b=${G2}_${FQ} -v OFS=',' 'BEGIN{print "fqName", "number_overlapping_rows", "total_number_rows"delimited}; { if ($4 !=".") a++} END { print b, a, NR}' ${BED4} > ${SUM}
+
+    #With awk substitute column 3 of sam file with column 7 (Feature name) of bed file (using chrom and pos as keys).
+    ##omit reads with no feature assigned
+    awk -v OFS='\t' 'FNR==NR{a[$1,$2]=$7; next} {$3=a[$3,$4]}1' ${BED3} $SAMFILE | awk -F'\t'  '$3!=""' > $NEWSAM 
+
+    echo initial sam file $SAMFILE 
+    echo awk outfile $AWKTMP
+    echo bed intersect outfile $BED3
+    echo new sam file "$NEWSAM"
+
+done 
+
+
+
+## Grab sam files and bed files
+	## sam1 (samA) = G1 and sam2 (samB) = G2
+SAM1=$PSAM/${G1}_${FQ}_upd_uniq_FEATURE.sam
+SAM2=$PSAM/${G2}_${FQ}_upd_uniq_FEATURE.sam
+BED1=$PSAM/${G1}_${FQ}_upd_uniq_int.bed
+BED2=$PSAM/${G2}_${FQ}_upd_uniq_int.bed
+
+awk 'NR==FNR{c[$3]++;next};c[$7] == 0' $SAM1 $BED1 > $CHKSC/check_sam_bed_${G1}_${FQ}.txt
+awk 'NR==FNR{c[$3]++;next};c[$7] == 0' $SAM2 $BED2 > $CHKSC/check_sam_bed_${G2}_${FQ}.txt
+
+###### (3) Run Sam Compare
+
+READ1=${ORIG}/${FQ}${EXT}
+
+echo -e "READ1: '${READ1}"
+echo -e "SAM1: '${SAM1}'"
+echo -e "SAM2: '${SAM2}'"
+echo -e "BED: '${SBED}'"
+
+echo -e "starting sam compare for $FQ "
+## NOTE using average read length!!
+python $SCRIPTS/sam_compare_w_feature.py \
+    -n \
+    -l ${AVE_RL} \
+    -f $BEDFILE \
+    -q $READ1 \
+    -A $SAM1 \
+    -B $SAM2 \
+    -c $SAMC/ase_counts_${FQ}.csv \
+    -t $SAMC/ase_totals_${FQ}.txt \
+    --log $CHKSC/ase_log_${FQ}.log
+
+### mv drop summary to check_aln dir
+echo "
+    Moving drop read summary files to check_aln dir"
+mv ${PSAM}/*_${FQ}_upd_drop_summary.txt ${CHKALN}/
+
+echo -e "run sam compare check for $FQ "
+# Check to make sure counts in csv summary file is within range of minimum unique reads from respective sam files and 
+# the summation of the unique reads of both sam files
+python $SCRIPTS/check_samcomp_for_lost_reads_03amm.py \
+    -b1 $CHKALN/${G1}_${FQ}_upd_drop_summary.txt \
+    -b2 $CHKALN/${G2}_${FQ}_upd_drop_summary.txt \
+    -G1 $G1 \
+    -G2 $G2 \
+    -s $SAMC/ase_totals_${FQ}.txt \
+    -fq $FQ \
+    -o $CHKSC/check_samcomp_${FQ}_aln_2_upd.csv
+
+rm -r $ROZ
+
+
diff --git a/bash/run_ase_bayesian.bash b/bash/run_ase_bayesian.bash
@@ -0,0 +1,57 @@
+#!/usr/bin/env bash
+
+## 2 conditions - M vs V
+
+## Set / Create Directories and Variables
+SCRIPTS=hpc/stan2_scripts
+
+## Create temp output directory for initial datafile with modified headers before entering Bayesian
+ROZ=example_out/temp_directory
+    mkdir -p $ROZ
+
+## directory for model testing
+BAYES=example_out/bayesian_in
+
+BAYESOUT=example_out/bayesian_out
+    mkdir -p $BAYESOUT
+
+## Set design file with G1, G2, c1, c2, and input filename columns
+DESIGN_FILE=example_in/comparate_design_file.csv
+
+DESIGN=$(sed -n "${SLURM_ARRAY_TASK_ID}p" $DESIGN_FILE)
+IFS=',' read -ra ARRAY <<< "$DESIGN"
+
+COMP_1=${ARRAY[0]}
+COMP_2=${ARRAY[1]}
+COMPID=${ARRAY[2]}
+
+echo "comparate 1 is $COMP_1"
+
+## Set number of comparates to be analyzed
+	## M vs F for each line = 2
+COMPNUM=2
+
+## set number of iterations = 100,000
+ITER=100000
+
+## set burn in of 10,000
+WARMUP=10000
+
+######  Run python script calling environmental bayesian model (stan2, 2 conditions)
+python3 $SCRIPTS/NBmodel_stan2_slurm_02amm.py \
+    -comparate_1 $COMP_1 \
+    -comparate_2 $COMP_2 \
+    -c1_g1 M \
+    -c1_g2 M \
+    -c2_g1 V \
+    -c2_g2 V \
+    -compID $COMPID \
+    -datafile $BAYES \
+    -datafile2 $BAYESOUT/ \
+    -cond $COMPNUM \
+    -workdir . \
+    -routput $BAYESOUT \
+    -subpath $SCRIPTS/NBModel_stan2_flex_prior.R \
+    -iterations $ITER \
+    -warmup $WARMUP \
+    -o $BAYESOUT
diff --git a/bash/run_ase_create_design_file_4_prior_calc.bash b/bash/run_ase_create_design_file_4_prior_calc.bash
@@ -0,0 +1,9 @@
+#!/usr/bin/env bash
+
+### create design file for calculating priors from df_ase_samcomp_summed.dsv 
+
+## Set / Create Directories and Variables
+DF=example_out
+DESIGN=example_in/df_ase_samcomp_summed.csv
+
+awk 'BEGIN{FS=","; OFS=","} { print $1, $2, $4}' ${DESIGN} | sort -u > ${DF}/df_priors.csv
diff --git a/bash/run_ase_genotype_specific_references_testData.bash b/bash/run_ase_genotype_specific_references_testData.bash
@@ -0,0 +1,51 @@
+#!/usr/bin/env bash
+
+## set directories
+## All directories are relative to the main BayesASE folder
+VCFIN=example_in/vcf_by_genotype
+VCFOUT=example_out/vcf_by_genotype
+UREF=example_out/reference/updated_genomes_vcfConsensus
+mkdir -p $VCFOUT
+mkdir -p $UREF
+
+FASTA=example_in/reference/dmel_r551_chromosome_2R_X.fasta
+
+# line,old
+# W55,Winters_55
+# W1118,w1118_w118
+DESIGN_FILE=example_in/df_list_G2_VCF_split.csv
+DESIGN=$(sed -n "${SLURM_ARRAY_TASK_ID}p" $DESIGN_FILE)
+IFS=',' read -ra ARRAY <<< "$DESIGN"
+
+G2=${ARRAY[0]}
+
+#### (1) UPDATE SNPs
+### Index starting VCF files
+
+
+    ## remove and replace if gz file already exists (so we can overwrite)
+    if [[ -f ${VCFOUT}/${G2}_snp_testGenes_sorted.vcf.gz ]] ; then
+        rm ${VCFOUT}/${G2}_snp_testGenes_sorted.vcf.gz
+    fi
+
+    # sort split VCF files
+    bcftools sort -O v ${VCFIN}/${G2}_snp_testGenes.vcf -o ${VCFOUT}/${G2}_snp_testGenes_sorted.vcf
+
+    echo "indexing individual SNP vcf files: ${G2} "
+    bgzip ${VCFOUT}/${G2}_snp_testGenes_sorted.vcf
+    tabix ${VCFOUT}/${G2}_snp_testGenes_sorted.vcf.gz
+
+    echo "updating SNPs: ${G2} "
+    cat ${FASTA} | vcf-consensus ${VCFOUT}/${G2}_snp_testGenes_sorted.vcf.gz > ${UREF}/${G2}_snp_upd_genome.fasta
+
+
+    #### (2) Build indexes from the references
+    echo `date`": building BWA index for ${G2} UPD genome "
+    ##BWA-mem references  -p is the prefix for the output database, -a is the indexing algorithm ('bwtsw' is for ref>2G, 'is' for ref<2G).
+    bwa index -p ${UREF}/${G2}_snp_upd_genome_BWA -a bwtsw ${UREF}/${G2}_snp_upd_genome.fasta
+
+    echo `date`": building samtools index for ${G2} UPD genome"
+    samtools faidx ${UREF}/${G2}_snp_upd_genome.fasta
+
+
+
diff --git a/bash/run_ase_merge_priors_2_comparate_testData.bash b/bash/run_ase_merge_priors_2_comparate_testData.bash
@@ -0,0 +1,32 @@
+#!/usr/bin/env bash
+
+## merge priors to comparate
+## merge comparates together
+
+
+SCRIPTS=hpc/ase_scripts
+
+## user must provide following design file for merging comparates:
+DESIGN2=example_in/df_merge_comparates_4_bayesian.csv
+
+DESIGN=example_out/df_priors.csv
+PRIORS=example_out/priors_fromData
+FILT=example_out/ase_counts_summarized
+BAYESIN=example_out/bayesian_in
+    mkdir -p $BAYESIN
+
+    echo "running merge priors to comparate
+    "
+    python3 ${SCRIPTS}/merge_priors_to_comparate_03amm.py \
+        --output ${BAYESIN} \
+        --comp ${FILT} \
+        --prior ${PRIORS} \
+        --design ${DESIGN}
+
+
+    python3 ${SCRIPTS}/merge_comparates_and_gen_headers_for_bayesian_02amm.py \
+    --output ${BAYESIN} \
+    --comp ${BAYESIN} \
+    --design ${DESIGN2}
+
+
diff --git a/bash/run_ase_prior_calculation_testData.bash b/bash/run_ase_prior_calculation_testData.bash
@@ -0,0 +1,22 @@
+#!/usr/bin/env bash
+
+### (1) calculate priors from ASE count tables (generated from simulated reads, DNA reads, or RNA data reads)
+
+## Set / Create Directories and Variables
+SCRIPTS=hpc/ase_scripts
+
+## this example calculates from data
+FILT=example_out/ase_counts_summarized
+
+OUTPUT=example_out/priors_fromData
+mkdir -p $OUTPUT
+
+DESIGN=example_out/df_priors.csv
+
+##### (1) calculate priors from ASE counts - data
+
+python3 $SCRIPTS/calculate_priors_from_ase_count_tables_03amm.py \
+    --output $OUTPUT \
+    --design $DESIGN \
+    --input $FILT \
+    --debug