Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
220 changes: 220 additions & 0 deletions bash/run_ase_align_and_count_testData.bash
Original file line number Diff line number Diff line change
@@ -0,0 +1,220 @@
#!/usr/bin/env bash

## Set / Create Directories and Variables
#Everything is relative to current directory
SCRIPTS=hpc/ase_scripts
REF=example_out/reference/updated_genomes_vcfConsensus
ORIG=example_in/reads

BEDFILE=example_in/reformatted_BASE_testdata_bedfile.bed

## Create output directory for alignments to updated genome
OUTALN=example_out/aln_upd_genome
if [ ! -e $OUTALN ]; then mkdir -p $OUTALN; fi
PSAM=example_out/aln_upd_genome_bwa_parse
if [ ! -e $PSAM ]; then mkdir -p $PSAM; fi

## Create output directory for ase counts from sam compare
SAMC=example_out/ase_counts_updated
if [ ! -e $SAMC ]; then mkdir -p $SAMC; fi

## Create output directory for upd aln summaries and checks
CHKALN=example_out/check_aln
if [ ! -e $CHKALN ]; then mkdir -p $CHKALN; fi

## Create output directory for Sam Compare check
CHKSC=example_out/check_samcomp
if [ ! -e $CHKSC ]; then mkdir -p $CHKSC; fi

# G1,G2,sampleID,fqName,fqExtension,readLength,techRep
DESIGN_FILE=example_in/df_BASE_galaxy_W55_noEmptyFq.csv
DESIGN=$(sed -n "${SLURM_ARRAY_TASK_ID}p" $DESIGN_FILE)
IFS=',' read -ra ARRAY <<< "$DESIGN"

G1=${ARRAY[0]}
G2=${ARRAY[1]}
READLEN=${ARRAY[5]}
FQ=${ARRAY[3]}
EXT=${ARRAY[4]}

## Create temp dir
ROZ=roz_${FQ}
mkdir -p ${ROZ}

## set READ and calculate ave readlength
READ=${ORIG}/${FQ}${EXT}
echo " read is $READ
"

AVE_RL=$(awk '{if(NR%4==2 && length) {count++; bases += length} } END {print bases/count}' ${READ} | awk '{ printf "%.0f\\n", $1 }')

###### Create modified BED file to use in SAM compare - this bed has features 1st with start and end positions in genome
## use awk to reorder columms in bed file
SBED=example_out/snp_feature_first.bed

echo "Reformatting $BEDFILE"
awk -v OFS='\t' '{print $4,$2,$3,$1}' $BEDFILE > $SBED


###### (1) Align Reads to Updated Genomes - first to G2 ref then to G1 reference - and Parse sam file
FQLINEFN=$(wc -l $READ)
FQLINE=$(echo $FQLINEFN | cut -d" " -f1)
NUMREAD=$(( FQLINE / 4 ))
FN=$(echo $FQLINEFN | cut -d" " -f2)

## count number of starting reads - same for G1 and G2 refs
echo $NUMREAD | awk -v fq=${FQ} -v gq=pre_aln_read_count '{print "filename" "," gq "\n" fq "," $0}' > $CHKALN/pre_aln_reads_${FQ}.csv



for FOO in G1 G2
do
if [[ $FOO == 'G2' ]]
then
BREF=$REF/${G2}_snp_upd_genome_BWA

echo "Aligning G2 to reference $BREF"
bwa mem -t 8 -M $BREF $READ > $OUTALN/${G2}_${FQ}_upd.sam

echo -e "Start BWASplitSam on: $OUTALN/${G2}_${FQ}_upd.sam"
$SCRIPTS/BWASplitSAM_07mai.py -s $OUTALN/${G2}_${FQ}_upd.sam --outdir $ROZ -fq1 $READ

## cat together mapped and opposite
cat $ROZ/${G2}_${FQ}_upd_mapped.sam $ROZ/${G2}_${FQ}_upd_oposite.sam > $PSAM/${G2}_${FQ}_upd_uniq.sam

elif [[ $FOO == 'G1' ]]
then
BREF=$REF/${G1}_snp_upd_genome_BWA

echo "Aligning G1 to reference $BREF"
bwa mem -t 8 -M $BREF $READ > $OUTALN/${G1}_${FQ}_upd.sam

echo -e "Start BWASplitSam on: $OUTALN/${G1}_${FQ}_upd.sam"
$SCRIPTS/BWASplitSAM_07mai.py -s $OUTALN/${G1}_${FQ}_upd.sam --outdir $ROZ -fq1 $READ

## cat together mapped and opposite
cat $ROZ/${G1}_${FQ}_upd_mapped.sam $ROZ/${G1}_${FQ}_upd_oposite.sam > $PSAM/${G1}_${FQ}_upd_uniq.sam
fi
done

### move alignment summary csv files
mv ${ROZ}/${G1}_${FQ}_upd_summary.csv ${CHKALN}/${G1}_${FQ}_upd_summary.csv
mv ${ROZ}/${G2}_${FQ}_upd_summary.csv ${CHKALN}/${G2}_${FQ}_upd_summary.csv

### for every FQ file run, should have 2 sam files (NOTE default is SE)
echo "
checking for 2 sam files"

python $SCRIPTS/check_sam_present_04amm.py \
-fq $FQ \
-alnType SE \
-samPath $PSAM \
-G1 $G1 \
-G2 $G2 \
-o $CHKALN/check_2_sam_files_${FQ}.txt

## run python script to count reads into aln and in each SAM file
echo "
checking for missing reads in sam files"

python $SCRIPTS/check_for_lost_reads_05amm.py \
-a1 $CHKALN/${G1}_${FQ}_upd_summary.csv \
-a2 $CHKALN/${G2}_${FQ}_upd_summary.csv \
-numread $CHKALN/pre_aln_reads_${FQ}.csv \
-fq $FQ \
-o $CHKALN/check_start_reads_vs_aln_reads_${FQ}.csv


## Insert bedtools intersect script + any checks (reads in aln sam output)
###### (2) Bedtools Intersect: Here we will call the shell script to reformat the sam file so that the have feature names instead of CHR names
## In parsed SAM,

for SAMFILE in $PSAM/*_${FQ}_upd_uniq.sam
do
MYSAMFILE2=$(basename $SAMFILE)

AWKTMP=$PSAM/${MYSAMFILE2/_uniq.sam/_uniq_AWK.txt}
NEWSAM=$PSAM/${MYSAMFILE2/_uniq.sam/_uniq_FEATURE.sam}

#Create a bed file to write the starting position of every read and an end postion (end = start + readlength)
awk -v readLen=${READLEN} -v OFS='\t' '{print $3,$4,$4+readLen}' $SAMFILE > $AWKTMP

BED4=${PSAM}/${MYSAMFILE2/_uniq.sam/_uniq_int_all.bed}
BED3=$PSAM/${MYSAMFILE2/_uniq.sam/_uniq_int.bed}
SUM=${PSAM}/${MYSAMFILE2/_uniq.sam/_drop_summary.txt}

#Run bedtools intersect with -loj between the reads and the features.
#We will have one result for each region
# pipe to awk to remove rows where a read does NOT overlap with a feature
bedtools intersect -a $AWKTMP -b $SBED -loj > ${BED4}

awk -v OFS='\t' '$4 !="."' ${BED4} > $BED3

## create file with counts for before and after dropping
awk -v a=0 -v b=${G2}_${FQ} -v OFS=',' 'BEGIN{print "fqName", "number_overlapping_rows", "total_number_rows"delimited}; { if ($4 !=".") a++} END { print b, a, NR}' ${BED4} > ${SUM}

#With awk substitute column 3 of sam file with column 7 (Feature name) of bed file (using chrom and pos as keys).
##omit reads with no feature assigned
awk -v OFS='\t' 'FNR==NR{a[$1,$2]=$7; next} {$3=a[$3,$4]}1' ${BED3} $SAMFILE | awk -F'\t' '$3!=""' > $NEWSAM

echo initial sam file $SAMFILE
echo awk outfile $AWKTMP
echo bed intersect outfile $BED3
echo new sam file "$NEWSAM"

done



## Grab sam files and bed files
## sam1 (samA) = G1 and sam2 (samB) = G2
SAM1=$PSAM/${G1}_${FQ}_upd_uniq_FEATURE.sam
SAM2=$PSAM/${G2}_${FQ}_upd_uniq_FEATURE.sam
BED1=$PSAM/${G1}_${FQ}_upd_uniq_int.bed
BED2=$PSAM/${G2}_${FQ}_upd_uniq_int.bed

awk 'NR==FNR{c[$3]++;next};c[$7] == 0' $SAM1 $BED1 > $CHKSC/check_sam_bed_${G1}_${FQ}.txt
awk 'NR==FNR{c[$3]++;next};c[$7] == 0' $SAM2 $BED2 > $CHKSC/check_sam_bed_${G2}_${FQ}.txt

###### (3) Run Sam Compare

READ1=${ORIG}/${FQ}${EXT}

echo -e "READ1: '${READ1}"
echo -e "SAM1: '${SAM1}'"
echo -e "SAM2: '${SAM2}'"
echo -e "BED: '${SBED}'"

echo -e "starting sam compare for $FQ "
## NOTE using average read length!!
python $SCRIPTS/sam_compare_w_feature.py \
-n \
-l ${AVE_RL} \
-f $BEDFILE \
-q $READ1 \
-A $SAM1 \
-B $SAM2 \
-c $SAMC/ase_counts_${FQ}.csv \
-t $SAMC/ase_totals_${FQ}.txt \
--log $CHKSC/ase_log_${FQ}.log

### mv drop summary to check_aln dir
echo "
Moving drop read summary files to check_aln dir"
mv ${PSAM}/*_${FQ}_upd_drop_summary.txt ${CHKALN}/

echo -e "run sam compare check for $FQ "
# Check to make sure counts in csv summary file is within range of minimum unique reads from respective sam files and
# the summation of the unique reads of both sam files
python $SCRIPTS/check_samcomp_for_lost_reads_03amm.py \
-b1 $CHKALN/${G1}_${FQ}_upd_drop_summary.txt \
-b2 $CHKALN/${G2}_${FQ}_upd_drop_summary.txt \
-G1 $G1 \
-G2 $G2 \
-s $SAMC/ase_totals_${FQ}.txt \
-fq $FQ \
-o $CHKSC/check_samcomp_${FQ}_aln_2_upd.csv

rm -r $ROZ


57 changes: 57 additions & 0 deletions bash/run_ase_bayesian.bash
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
#!/usr/bin/env bash

## 2 conditions - M vs V

## Set / Create Directories and Variables
SCRIPTS=hpc/stan2_scripts

## Create temp output directory for initial datafile with modified headers before entering Bayesian
ROZ=example_out/temp_directory
mkdir -p $ROZ

## directory for model testing
BAYES=example_out/bayesian_in

BAYESOUT=example_out/bayesian_out
mkdir -p $BAYESOUT

## Set design file with G1, G2, c1, c2, and input filename columns
DESIGN_FILE=example_in/comparate_design_file.csv

DESIGN=$(sed -n "${SLURM_ARRAY_TASK_ID}p" $DESIGN_FILE)
IFS=',' read -ra ARRAY <<< "$DESIGN"

COMP_1=${ARRAY[0]}
COMP_2=${ARRAY[1]}
COMPID=${ARRAY[2]}

echo "comparate 1 is $COMP_1"

## Set number of comparates to be analyzed
## M vs F for each line = 2
COMPNUM=2

## set number of iterations = 100,000
ITER=100000

## set burn in of 10,000
WARMUP=10000

###### Run python script calling environmental bayesian model (stan2, 2 conditions)
python3 $SCRIPTS/NBmodel_stan2_slurm_02amm.py \
-comparate_1 $COMP_1 \
-comparate_2 $COMP_2 \
-c1_g1 M \
-c1_g2 M \
-c2_g1 V \
-c2_g2 V \
-compID $COMPID \
-datafile $BAYES \
-datafile2 $BAYESOUT/ \
-cond $COMPNUM \
-workdir . \
-routput $BAYESOUT \
-subpath $SCRIPTS/NBModel_stan2_flex_prior.R \
-iterations $ITER \
-warmup $WARMUP \
-o $BAYESOUT
9 changes: 9 additions & 0 deletions bash/run_ase_create_design_file_4_prior_calc.bash
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
#!/usr/bin/env bash

### create design file for calculating priors from df_ase_samcomp_summed.dsv

## Set / Create Directories and Variables
DF=example_out
DESIGN=example_in/df_ase_samcomp_summed.csv

awk 'BEGIN{FS=","; OFS=","} { print $1, $2, $4}' ${DESIGN} | sort -u > ${DF}/df_priors.csv
51 changes: 51 additions & 0 deletions bash/run_ase_genotype_specific_references_testData.bash
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
#!/usr/bin/env bash

## set directories
## All directories are relative to the main BayesASE folder
VCFIN=example_in/vcf_by_genotype
VCFOUT=example_out/vcf_by_genotype
UREF=example_out/reference/updated_genomes_vcfConsensus
mkdir -p $VCFOUT
mkdir -p $UREF

FASTA=example_in/reference/dmel_r551_chromosome_2R_X.fasta

# line,old
# W55,Winters_55
# W1118,w1118_w118
DESIGN_FILE=example_in/df_list_G2_VCF_split.csv
DESIGN=$(sed -n "${SLURM_ARRAY_TASK_ID}p" $DESIGN_FILE)
IFS=',' read -ra ARRAY <<< "$DESIGN"

G2=${ARRAY[0]}

#### (1) UPDATE SNPs
### Index starting VCF files


## remove and replace if gz file already exists (so we can overwrite)
if [[ -f ${VCFOUT}/${G2}_snp_testGenes_sorted.vcf.gz ]] ; then
rm ${VCFOUT}/${G2}_snp_testGenes_sorted.vcf.gz
fi

# sort split VCF files
bcftools sort -O v ${VCFIN}/${G2}_snp_testGenes.vcf -o ${VCFOUT}/${G2}_snp_testGenes_sorted.vcf

echo "indexing individual SNP vcf files: ${G2} "
bgzip ${VCFOUT}/${G2}_snp_testGenes_sorted.vcf
tabix ${VCFOUT}/${G2}_snp_testGenes_sorted.vcf.gz

echo "updating SNPs: ${G2} "
cat ${FASTA} | vcf-consensus ${VCFOUT}/${G2}_snp_testGenes_sorted.vcf.gz > ${UREF}/${G2}_snp_upd_genome.fasta


#### (2) Build indexes from the references
echo `date`": building BWA index for ${G2} UPD genome "
##BWA-mem references -p is the prefix for the output database, -a is the indexing algorithm ('bwtsw' is for ref>2G, 'is' for ref<2G).
bwa index -p ${UREF}/${G2}_snp_upd_genome_BWA -a bwtsw ${UREF}/${G2}_snp_upd_genome.fasta

echo `date`": building samtools index for ${G2} UPD genome"
samtools faidx ${UREF}/${G2}_snp_upd_genome.fasta



32 changes: 32 additions & 0 deletions bash/run_ase_merge_priors_2_comparate_testData.bash
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
#!/usr/bin/env bash

## merge priors to comparate
## merge comparates together


SCRIPTS=hpc/ase_scripts

## user must provide following design file for merging comparates:
DESIGN2=example_in/df_merge_comparates_4_bayesian.csv

DESIGN=example_out/df_priors.csv
PRIORS=example_out/priors_fromData
FILT=example_out/ase_counts_summarized
BAYESIN=example_out/bayesian_in
mkdir -p $BAYESIN

echo "running merge priors to comparate
"
python3 ${SCRIPTS}/merge_priors_to_comparate_03amm.py \
--output ${BAYESIN} \
--comp ${FILT} \
--prior ${PRIORS} \
--design ${DESIGN}


python3 ${SCRIPTS}/merge_comparates_and_gen_headers_for_bayesian_02amm.py \
--output ${BAYESIN} \
--comp ${BAYESIN} \
--design ${DESIGN2}


22 changes: 22 additions & 0 deletions bash/run_ase_prior_calculation_testData.bash
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
#!/usr/bin/env bash

### (1) calculate priors from ASE count tables (generated from simulated reads, DNA reads, or RNA data reads)

## Set / Create Directories and Variables
SCRIPTS=hpc/ase_scripts

## this example calculates from data
FILT=example_out/ase_counts_summarized

OUTPUT=example_out/priors_fromData
mkdir -p $OUTPUT

DESIGN=example_out/df_priors.csv

##### (1) calculate priors from ASE counts - data

python3 $SCRIPTS/calculate_priors_from_ase_count_tables_03amm.py \
--output $OUTPUT \
--design $DESIGN \
--input $FILT \
--debug
Loading