-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathPhylogenetic_Community_Structure_Analysis.r
749 lines (637 loc) · 33.6 KB
/
Phylogenetic_Community_Structure_Analysis.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
#Pipeline for phylogenetic community structure analysis
#Centroid and alignment code adapted from Matthew Orton https://github.com/m-orton/Evolutionary-Rates-Analysis-Pipeline/blob/master/EvolutionaryComparisonPipelineSmallTaxa.R
#Sequence trimming and outlier removal functions adapted from Jacqueline May https://github.com/jmay29/phylo/blob/master/refSeqTrim.R
#Script by Sam Majoros. Email: [email protected]
#Thank you to Cameron Nugent, Jacqueline May, and Matthew Orton for help with the code design.
#Thank you to Sarah Adamowicz for help designing the project and coauthoring the paper.
#Thank you to Alex Smith and Kamil Chatila-Amos for helpful project disscussions.
#Part 1: Inputting and Filtering Data ----
#This sections includes steps to load the needed packages, load the data into R, and to filter the data.
#Packages needed for this analysis
#If you do not already have these packages, uncomment the code and install.
#install.packages("readr")
library(readr)
#install.packages("plyr")
library(plyr)
#install.packages("dplyr")
library(dplyr)
#install.packages("foreach")
library(foreach)
#install.packages("tidyverse")
library(tidyverse)
#install.packages("stringr")
library(stringr)
#install.packages("stringi")
library(stringi)
#install.packages("ape")
library(ape)
#install.packages("BiocManager")
#BiocManager::install(c("Biostrings", "muscle"))
library(Biostrings)
library(muscle)
#install.packages("phangorn")
library(phangorn)
#install.packages("picante")
library(picante)
#install.packages("data.table")
library(data.table)
#install.packages("phytools")
library(phytools)
#Upload order data into R
#Uncomment the following code to download data directly from BOLD, specifying the required geographical locations. The data I used was retrieved on June 19th, 2019.
#dfOrder <- read_tsv("http://www.boldsystems.org/index.php/API_Public/combined?taxon=Coleoptera&geo=Alaska|Canada&format=tsv")
#Write file to hard disk
#write_tsv(dfOrder, "Coleoptera_download_June19")
#Read in saved order data
dfOrder <- read_tsv("Coleoptera_download_June19")
#Filtering the data
dfOrder <- dfOrder %>%
#Filter out those without bin_uri
filter(str_detect(bin_uri, ":")) %>%
#Filter out those without a sequence
filter(str_detect(nucleotides, "[ACTG]")) %>%
#Filter for COI-5P
filter(markercode == "COI-5P") %>%
#Filter out sequences with fewer than 500 base pairs
filter(nchar(gsub("-", "", nucleotides)) > 499) %>%
#Filter out records without a family name
filter(!is.na(family_name))
#Filter out high gap/N content. A threshold of 1% was chosen because species often differ by more than 2% divergence. By filtering out records with > 1% N and gap content, we are likely to get a high-quality data set, given typical patterns of variability in COI in animals.
startNGap <- sapply(regmatches(dfOrder$nucleotides, gregexpr("^[-N]", dfOrder$nucleotides)), length)
startNGap <- foreach(i=1:nrow(dfOrder)) %do%
if (startNGap[[i]]>0) {
split <- strsplit(dfOrder$nucleotides[i], "^[-N]+")
dfOrder$nucleotides[i] <- split[[1]][2]
}
endNGap <- sapply(regmatches(dfOrder$nucleotides, gregexpr("[-N]$", dfOrder$nucleotides)), length)
endNGap <- foreach(i=1:nrow(dfOrder)) %do%
if (endNGap[[i]]>0) {
split <- strsplit(dfOrder$nucleotides[i], "[-N]+$")
dfOrder$nucleotides[i] <- split[[1]][1]
}
internalNGap <- sapply(regmatches(dfOrder$nucleotides, gregexpr("[-N]", dfOrder$nucleotides)), length)
internalNGap <- foreach(i=1:nrow(dfOrder)) %do%
which((internalNGap[[i]]/nchar(dfOrder$nucleotides[i]) > 0.01))
nGapCheck <- sapply(internalNGap, function(x)length(x))
nGapCheck <- which(nGapCheck>0)
dfOrder <- dfOrder[-nGapCheck, ]
#Remove redundant "BOLD:" component from each BIN uri in the BIN column
dfOrder$bin_uri <- substr(dfOrder$bin_uri, 6, 13)
#Filter out sequences without coordinates
containLatLon <- grep ("[0-9]", dfOrder$lat)
dfOrder <- dfOrder[containLatLon, ]
#Create subset filter using coordinates
#Filter for Churchill, Manitoba
SubsetFilter_Churchill <- which(dfOrder$lat > 58.6 &
dfOrder$lon > -94.2 & dfOrder$lat < 58.7 &
dfOrder$lon < -93.8)
#Apply filter
dfOrder_Churchill <- dfOrder[SubsetFilter_Churchill, ]
#Find total number of BINs per family in the regional subset of the order
#First convert to datatable
dfOrder_Churchill <- as.data.table(dfOrder_Churchill)
#create datatable showing number of sequences per family
total_species_number <- dfOrder_Churchill[ , .(.N),by=.(family_name)]
#create datatable showing the number of BINs per family
number_of_unique_species <- dfOrder_Churchill[ , .(number_of_species=length(unique(bin_uri))), by=family_name]
#convert to dataframe
number_of_unique_species <- as.data.frame(number_of_unique_species)
#Filter down to families with 3 or more species
number_of_unique_species <- filter(number_of_unique_species, number_of_unique_species$number_of_species > 2)
#Create filter to filter down the order to families with three or more species in the subset
dfOrder_filter <- which(dfOrder$family_name %in% number_of_unique_species$family_name)
#Apply filter
dfOrder <- dfOrder[dfOrder_filter, ]
#Here I save the filtered data as a file. This data is provided on Github as the original download is too large.
#To read in the pre filtered data, uncomment the following code.
#dfOrder <- read_tsv("Coleoptera_download_post_filtering")
#Remove unneeded variables
rm(number_of_unique_species, total_species_number, SubsetFilter_Churchill, containLatLon, endNGap, internalNGap, nGapCheck, split, startNGap, dfOrder_filter, i)
#Part 2: Choosing a Centroid----
#In this section we find a centroid sequence for each BIN present in the Canada and Alaska dataset. This includes running a multiple sequence alignment and computing a distance matrix.
#Create smaller dataframe with needed info
dfBinList <- (dfOrder[, c("processid", "bin_uri", "nucleotides")])
#Create groupings by BIN, each with different bin_uri
binList <- lapply(unique(dfOrder$bin_uri), function(x) dfOrder[dfOrder$bin_uri==x, ])
#Find the number of processids in each bin
binSize <- sapply(binList, function(x)length(x$processid))
#Create new data frame with bin_uri and bin size
dfOrder_bins <- data.frame(binSize)
dfOrder_bins$bin_uri <- c(unique(dfOrder$bin_uri))
#Merge dfBinList and dfOrder_bins
dfBinList <- merge(dfBinList, dfOrder_bins, by.x="bin_uri", by.y="bin_uri")
#Reorder dfFamily_bins by bin_uri
dfOrder_bins <- dfOrder_bins[order(dfOrder_bins$bin_uri), ]
#Find BINs with more than one member
largeBin <- which(dfBinList$binSize > 1)
#Create dataframe with only bins with more than one member
if (length(largeBin) > 0) {
dfCentroid <- dfBinList[largeBin, ]
}
#Subset dfOrder_bins down to number of BINs in dfOrder
dfOrder_bins <- subset(dfOrder_bins, dfOrder_bins$bin_uri %in% dfCentroid$bin_uri)
#Find number of unique BINs in dfCentroid
binNumberCentroid <- unique(dfCentroid$bin_uri)
binNumberCentroid <- length(binNumberCentroid)
#Create dataframe with BINs with only one sequence
dfNonCentroid <- dfBinList[-largeBin, ]
#Create list from dfCentroid
largeBinList <- lapply(unique(dfCentroid$bin_uri), function(x) dfCentroid[dfCentroid$bin_uri == x, ])
#Extract process Id from each bin
largeBinProcessid <- sapply(largeBinList, function(x) (x$processid))
#Convert sequences to dnaStringSet
dnaStringSet1 <- sapply(largeBinList, function(x) DNAStringSet(x$nucleotides))
#Name dnaStringSet with processids
for(i in seq(from=1, to=binNumberCentroid, by=1)) {
names(dnaStringSet1[[i]]) <- largeBinProcessid[[i]]
}
#Run multiple sequence alignment for sequences in each BIN in dnaStringSet1
alignment1 <- foreach(i=1:binNumberCentroid) %do%
muscle::muscle(dnaStringSet1[[i]], maxiters=3, diags=TRUE, gapopen=-3000)
#Convert to DNAbin format
dnaBINCentroid <- foreach(i=1:binNumberCentroid) %do% as.DNAbin(alignment1[[i]])
#Calculate a pairwise distance matrix for each BIN
geneticDistanceCentroid <- foreach(i=1:binNumberCentroid) %do%
dist.dna(dnaBINCentroid[[i]], model="TN93", as.matrix = TRUE,
pairwise.deletion = TRUE)
#Determine centroid sequence; The sequence with the minimum average distance to all other sequences in the BIN.
centroidSeq <- foreach(i=1:binNumberCentroid) %do% which.min(rowSums(geneticDistanceCentroid[[i]]))
centroidSeq <- centroidSeq %>%
unlist() %>%
names()
#Subset dfCentroid by the processid on the list
dfCentroid <- subset(dfCentroid, processid %in% centroidSeq)
#Merge with dfNonCentroid
dfAllSeq <- rbind(dfCentroid, dfNonCentroid)
#Merge with the original data set
dfAllSeq <- merge(dfAllSeq, dfOrder, by.x="processid", by.y="processid")
#Reorganize and clean up
dfAllSeq <- (dfAllSeq[, c("bin_uri.x", "binSize", "processid", "family_taxID", "family_name", "species_taxID", "species_name", "nucleotides.x", "lat", "lon", "subfamily_name", "order_name")])
colnames(dfAllSeq)[1] <- "bin_uri"
colnames(dfAllSeq)[8] <- "nucleotides"
#Delete any possible duplicate entries
dfAllSeq <- (by(dfAllSeq, dfAllSeq["bin_uri"], head, n=1))
dfAllSeq <- Reduce(rbind, dfAllSeq)
#Add an index column
dfAllSeq$ind <- row.names(dfAllSeq)
#Here I saved dfAllSeq to the file. This Centroid step takes a long time to complete, so the data post centroid is provided on Github
#If you want to use the post centroid data, uncomment and run the following code.
#dfAllSeq <- read_tsv("Post_Centroid")
#Remove unneeded dataframes and variables
rm(alignment1, binList, dfBinList, dfCentroid, dfOrder_bins, dfNonCentroid, dnaBINCentroid, dnaStringSet1, geneticDistanceCentroid, largeBinList, largeBinProcessid, binNumberCentroid, binSize, centroidSeq, i, largeBin)
#Part 3: Alignment----
#This sections includes code to trim the sequences and run a mutliple sequence alignment. The first alignment is a preliminary alignment that allows the sequences to be trimmed. The second is the final alignment once the outgroups have been added. Outgroups for each family were chosen from a different suborder of Coleoptera.
#Create a function to trim the sequences
RefSeqTrim <- function(x) {
#Create data frame for reference sequence
#This reference sequence was taken from BOLD for Coleoptera. Process id: AEDNA549-12. Species:Colymbetes dolabratus.
dfRefSeq <- data.frame(taxa=c("Coleoptera"), nucleotides=c("TAACTTTATATTTTATTTTTGGTGCATGGGCTGGAATGGTAGGAACATCTTTAAGTATGTTGATTCGAGCCGAATTAGGAAATCCTGGTTCTCTGATTGGAGATGATCAAATTTATAATGTTATTGTAACAGCACATGCTTTTGTAATAATTTTTTTCATAGTAATACCTATTATAATTGGGGGATTTGGAAATTGATTAGTTCCATTAATATTGGGGGCCCCAGATATAGCTTTTCCCCGAATAAATAATATAAGTTTTTGACTTCTTCCGCCTTCTTTAACTCTTCTATTAATAAGAAGAATAGTTGAAAGTGGGGCCGGGACAGGATGAACAGTTTACCCCCCTCTATCTTCAGGAATTGCACACGGAGGAGCTTCAGTTGATCTAGCAATTTTTAGTCTTCATTTAGCTGGAATTTCATCTATTTTAGGGGCTGTAAATTTCATTACAACTATTATTAATATACGATCAGTGGGAATAACATTCGACCGAATGCCTCTATTTGTATGATCCGTAGGAATTACAGCTTTATTACTATTATTATCTTTACCTGTATTAGCGGGAGCTATTACTATATTATTAACTGATCGTAATCTAAACACCTCATTCTTCGACCCGGCAGGAGGGGGAGATCCAATTTTATATCAACATTTATT"))
colnames(dfRefSeq)[2] <- "nucleotides"
#Convert to datatable
dfRefSeq <- setDT(dfRefSeq)
dfRefSeq[, "nucleotides":=as.character(nucleotides)]
#Trim sequences to 620bp
dfRefSeq[, nucleotides:=substr(nucleotides, 20, nchar(nucleotides)-19)]
#Check sequence length
dfRefSeq[, seqLength:=nchar(nucleotides)]
#Ensure sequences are of character type
alignmentSeqs <- as.character(x$nucleotides)
#Name according to bin_uri
names(alignmentSeqs) <- x$bin_uri
alignmentref <- as.character(dfRefSeq$nucleotides[1])
#Name reference sequence
names(alignmentref) <- "Reference"
#Put sequences together
alignmentSeqsPlusRef <- append(alignmentref, alignmentSeqs)
#Convert to DNAStringSet
DNAStringSet2 <- DNAStringSet(alignmentSeqsPlusRef)
#Run alignment
alignment2 <- muscle::muscle(DNAStringSet2, diags=TRUE, gapopen=-3000)
#Check alignment
classFileNames <- foreach(i=1:nrow(dfRefSeq)) %do%
paste("alignmentUntrimmed", dfRefSeq$taxa[i], ".fas", sep="")
alignmentUntrimmed <- DNAStringSet(alignment2)
writeXStringSet(alignmentUntrimmed, file=classFileNames[[1]],
format = "fasta", width=1500)
#Find stop and start positions in reference
refSeqPos <- which(alignment2@unmasked@ranges@NAMES=="Reference")
refSeqPos <- alignment2@unmasked[refSeqPos]
refSeqPosStart <- regexpr("[ACTG]", refSeqPos)
refSeqPosStart <- as.numeric(refSeqPosStart)
refSeqPosEnd <- nchar(dfRefSeq$nucleotides[1]) + refSeqPosStart
refSeqPosEnd <- as.numeric(refSeqPosEnd)
#Trim sequence
alignment2Trimmed <- substr(alignment2, refSeqPosStart, refSeqPosEnd)
#Convert to DNAStringSet
DNAStringSet3 <- DNAStringSet(alignment2Trimmed)
#Check alignment
classFileNames <- foreach(i=1:nrow(dfRefSeq)) %do%
paste("alignmentTrimmed", dfRefSeq$taxa[i], ".fas", sep="")
writeXStringSet(DNAStringSet3, file=classFileNames[[1]],
format = "fasta", width=1500)
#Remove reference sequence
refSeqRm <- which(DNAStringSet3@ranges@NAMES=="Reference")
dnaStringSet3 <- subset(DNAStringSet3[-refSeqRm])
alignmentOrder <- DNAStringSet3@ranges@NAMES
#Reorder based on alignment
x <- x[match(alignmentOrder, x$bin_uri), ]
#Replace old sequences with new ones
trimmedSeqs <- as.character(DNAStringSet3)
x$nucleotides <- trimmedSeqs
#Return datafrmae with new sequences
return(x)
}
#Trim centroid sequences to reference sequence
dfAllSeq2 <- RefSeqTrim(dfAllSeq)
#Convert sequences to DNAbin format
DNABin <- DNAStringSet(dfAllSeq2$nucleotides)
names(DNABin) <- dfAllSeq2$bin_uri
DNABin <- as.DNAbin(DNABin)
#Construct a distance matrix
distanceMatrix <- dist.dna(DNABin, model="TN93", as.matrix = TRUE, pairwise.deletion = TRUE)
#Visualize the values in the distance matrix using a histogram
hist(distanceMatrix)
#Using upper threshold of IQR to detect outliers
lowerQuantile <- quantile(distanceMatrix)[2]
upperQuantile <- quantile(distanceMatrix)[4]
iqr <-upperQuantile - lowerQuantile
#Set threshold to 1.5. In order to only remove extreme outliers this can be change to 3.
upperThreshold <- (iqr*1.5) + upperQuantile
#Remove 0 values
distanceMatrix[distanceMatrix==0] <- NA
#Convert to data table
dfOutliers <- as.data.table(distanceMatrix, keep.rownames = T)
#Change the "rn" column to bin_uri
setnames(dfOutliers, "rn", "bin_uri")
#Identify divergent BINs
dfOutliers <- dfOutliers[, outlier := apply(.SD, 1, function(x)all(x>upperThreshold, na.rm=T))][outlier==TRUE]
#Create remove sequences function
RemoveSequences<-function(x, y){
if(length(y)==0){
print("There are no sequences to remove!")
}
else if(length(y)>0){
x <- x[!x$bin_uri%in%y]
}
return(x)
}
#Remove outliers
#Outliers should be blasted prior to removal
dfAllSeq <- RemoveSequences(dfAllSeq, dfOutliers$bin_uri)
#Create final alignment of sequences
#Create RefSeq data frame
#Sequence was taken from BOLD and manually put in
dfRefSeq <- data.frame(taxa=c("Coleoptera"), nucleotides=c("TAACTTTATATTTTATTTTTGGTGCATGGGCTGGAATGGTAGGAACATCTTTAAGTATGTTGATTCGAGCCGAATTAGGAAATCCTGGTTCTCTGATTGGAGATGATCAAATTTATAATGTTATTGTAACAGCACATGCTTTTGTAATAATTTTTTTCATAGTAATACCTATTATAATTGGGGGATTTGGAAATTGATTAGTTCCATTAATATTGGGGGCCCCAGATATAGCTTTTCCCCGAATAAATAATATAAGTTTTTGACTTCTTCCGCCTTCTTTAACTCTTCTATTAATAAGAAGAATAGTTGAAAGTGGGGCCGGGACAGGATGAACAGTTTACCCCCCTCTATCTTCAGGAATTGCACACGGAGGAGCTTCAGTTGATCTAGCAATTTTTAGTCTTCATTTAGCTGGAATTTCATCTATTTTAGGGGCTGTAAATTTCATTACAACTATTATTAATATACGATCAGTGGGAATAACATTCGACCGAATGCCTCTATTTGTATGATCCGTAGGAATTACAGCTTTATTACTATTATTATCTTTACCTGTATTAGCGGGAGCTATTACTATATTATTAACTGATCGTAATCTAAACACCTCATTCTTCGACCCGGCAGGAGGGGGAGATCCAATTTTATATCAACATTTATT"))
#name nucleotide column and set as character
colnames(dfRefSeq)[2] <- "nucleotides"
dfRefSeq$nucleotides <- as.character(dfRefSeq$nucleotides)
#Trim references to standard 620
dfRefSeq$nucleotides <- substr(dfRefSeq$nucleotides, 20, nchar(dfRefSeq$nucleotides)-19)
#Check sequence length
dfRefSeq$seqLength <- nchar(dfRefSeq$nucleotides)
#Subset centroid sequences by those found in reference sequence dataframe
dfAllSeq <- subset(dfAllSeq, dfAllSeq$order_name %in% dfRefSeq$taxa)
#Break down dataframe into families
taxalistcomplete <- lapply(unique(dfAllSeq$family_taxID), function(x) dfAllSeq[dfAllSeq$family_taxID==x, ])
#Create new dataframes that sort families by suborder
dfAdephaga <- rbind(taxalistcomplete[[1]], taxalistcomplete[[3]], taxalistcomplete[[10]], taxalistcomplete[[12]])
dfPolyphaga <- rbind(taxalistcomplete[[2]], taxalistcomplete[[4]], taxalistcomplete[[5]], taxalistcomplete[[6]], taxalistcomplete[[7]],taxalistcomplete[[8]],taxalistcomplete[[9]],taxalistcomplete[[11]],taxalistcomplete[[13]],taxalistcomplete[[14]],taxalistcomplete[[15]],taxalistcomplete[[16]])
#Randomly select outgroup sequences
dfAdephaga_Outgroup <- sample_n(dfAdephaga, 1)
dfPolyphaga_outgroup <- sample_n(dfPolyphaga, 1)
#Put outgroup dataframes into a list
Suborders <- list(dfPolyphaga_outgroup, dfAdephaga_Outgroup)
dataframeNums <- c(1, 2, 1, 2, 2, 2, 2, 2, 2, 1, 2, 1, 2, 2, 2, 2)
#Creating a vector of the outgroup names.
outgroupNames <- c("Outgroup_Car", "Outgroup_Cur", "Outgroup_Dyt", "Outgroup_Cocc", "Outgroup_Lei", "Outgroup_Chry", "Outgroup_Staph", "Outgroup_Bup", "Outgroup_Hydro", "Outgroup_Hal", "Outgroup_Can", "Outgroup_Gyr", "Outgroup_Ela", "Outgroup_Crypt", "Outgroup_Scirt", "Outgroup_Lat")
l_outgroups <- list()
#Use a for loop to sample the sequences from taxalistcomplete.
for (i in 1:length(dataframeNums)) {
#Take the ith dataframe number.
number <- dataframeNums[[i]]
#Sample from the corresponding dataframe in taxalistcomplete.
outgroupSeqs <- Suborders[[number]]
#Append outgroupSeqs to the list we created earlier.
l_outgroups[[i]] <- outgroupSeqs
}
names(l_outgroups) <- outgroupNames
taxalistcompleteWO <- list()
#Now let's add the outgroup sequences to the corresponding dataframes taxalistcomplete.
for (i in 1:length(taxalistcomplete)) {
#Take the ith outgroupseqs.
outgroupSeqs <- l_outgroups[[i]]
#Take the ith taxa from taxalistcomplete.
taxa <- taxalistcomplete[[i]]
#Combine them using rbind
dfCombined <- rbind(taxa, outgroupSeqs)
#Append to our list.
taxalistcompleteWO[[i]] <- dfCombined
}
#Trim sequences again now that outgroups have been added
taxalistcompleteWO <- lapply(taxalistcompleteWO, RefSeqTrim)
#Extract sequences and bin_uri
familyBin <- foreach(i=1:length(taxalistcompleteWO)) %do% taxalistcompleteWO[[i]]$bin_uri
familySequences <- foreach(i=1:length(taxalistcompleteWO)) %do% taxalistcompleteWO[[i]]$nucleotides
familySequenceNames <- familyBin
#Take reference sequences
alignmentref <- as.character(dfRefSeq$nucleotides)
dfRefSeq$reference <- "reference"
#Name reference as a reference
alignmentRefNames <- dfRefSeq$reference
#Merge reference with other sequences
alignmentSequencesPlusRef <- foreach(i=1:length(taxalistcompleteWO)) %do%
append(familySequences[[i]], alignmentref[[1]])
#Merge names together
alignmentNames <- foreach(i=1:length(taxalistcompleteWO)) %do%
append(familySequenceNames[[i]], alignmentRefNames[[1]])
#Convert sequences to DNAStringSet format
dnaStringSet3 <- foreach(i=1:length(alignmentSequencesPlusRef)) %do%
DNAStringSet(alignmentSequencesPlusRef[[i]])
#Name each sequence
for(i in 1:16){
names(dnaStringSet3[[i]]) = alignmentNames[[i]]
}
#Multiple sequence alignment
alignmentFinal <- foreach(i=1:length(dnaStringSet3)) %do%
muscle(dnaStringSet3[[i]], diags=TRUE, gapopen=-3000)
#Check Alignment
familyFileNames2 <- foreach(i=1:length(alignmentFinal)) %do%
paste("alignmentFinal", dfRefSeq$taxa[i], ".fas", sep="")
alignmentFinalFasta <- foreach(i=1:length(alignmentFinal)) %do%
DNAStringSet(alignmentFinal[[i]])
foreach(i=1:length(alignmentFinal)) %do%
writeXStringSet(alignmentFinalFasta[[i]], file=familyFileNames2[[i]], format="fasta", width=1500)
#Convert to dnaStringSet format
dnaStringSet4 <- foreach(i=1:length(alignmentFinal)) %do%
DNAStringSet(alignmentFinal[[i]])
#Remove unneeded info
rm(alignmentFinal, alignmentNames, alignmentSequencesPlusRef, dnaStringSet3, familyBin, alignmentref, alignmentRefNames, i, dfAllSeq2, dfOutliers, distanceMatrix, DNABin, familySequences, iqr, lowerQuantile, upperQuantile, upperThreshold, dfRefSeq, familySequenceNames)
#Part 4: Create Maximum Likelihood tree----
#In this section, the data is saved to a fasta file for each family, distance matrices and neighbour joining trees are found, the best model and paramters are found for each family and maximum likelihood trees are generated. A bootstrap analysis is perfromed for each family and maxCladeCred consensus trees were found.
#Create function to convert DNAStringSets to dataframes
dna_string_to_df = function(dna_string_set){
out_df = as.data.frame(dna_string_set[[1]])
for(i in 2:length(dna_string_set)){
new_df = as.data.frame(dna_string_set[[i]])
out_df = rbind(out_df, new_df)
}
return(out_df)
}
#convert stringsets to dataframes
FamilyDNA = dna_string_to_df(dnaStringSet4)
#Add the bin_uri
FamilyDNA$bin_uri <- row.names(FamilyDNA)
#Bind list of dataframes into one dataframe
dfTaxaWO <- bind_rows(taxalistcompleteWO, .id = "column_label")
# Add column_label info to dfAllSeq.
dfAllSeqWO <- merge(dfAllSeq, dfTaxaWO[, c(1:2)], by = "bin_uri", all.y = T)
# Merge with the information for dfAllSeqWO.
dfFamilyDNA <- merge(FamilyDNA, dfAllSeqWO, by = "bin_uri", all.x = TRUE)
#Rename the coloumn with your aligned sequences
colnames(dfFamilyDNA)[2] <- "FinalSequences"
#create function to get reference names
get_reference_names <- function(top_ref_num = 15){
ref_names <- c("reference")
prefix <- "reference"
for(i in 1:top_ref_num){
new_str <- paste(prefix, as.character(i), sep='')
ref_names <- c(ref_names, new_str)
}
return(ref_names)
}
#create list of reference names
reference_names = get_reference_names()
#remove reference from dataframe
dfFamilyDNA <- dfFamilyDNA[!dfFamilyDNA$bin_uri %in% reference_names , ]
#Groups by column label
familyList <- lapply(na.omit(unique(dfFamilyDNA$column_label)),
function(x) dfFamilyDNA[dfFamilyDNA$column_label == x, ])
#Create function to remove NAs
RemoveAllNAs <- function(dfOriginal) {
#Function for removing rows that consist of entirely NA values.
#Reset the row numbers.
row.names(dfOriginal) <- NULL
# First check if there are any rows with all values missing.
test <- apply(dfOriginal, 1, function(x) all(is.na(x)))
if (any(test) == TRUE) {
#Identify the missing rows.
missing <- which(test == TRUE)
#Remove them from dfOriginal.
dfOriginal <- dfOriginal[-missing, ]
}
return(dfOriginal)
}
#Cleaning up familyList.
familyList <- lapply(familyList, RemoveAllNAs)
familyList <- lapply(familyList, function(x) {
noBin <- is.na(x$bin_uri)
x <- x[!noBin, ]
})
#Create new dnaStringSet
dnaStringSet5 <- sapply(familyList, function(x) DNAStringSet(x$FinalSequences))
#Pull BIN names from list
binNames <- sapply(familyList, function(x)(x$bin_uri))
#Name the stringsets
for(i in seq(from = 1, to = length(dnaStringSet5), by = 1)) {
names(dnaStringSet5[[i]]) <- binNames[[i]]
}
#Save family as a fasta file
#For file names make sure to list each family name
familyFileNames <- list("Carabidae", "Curculionidae", "Dytiscidae", "Coccinellidae", "Leiodidae", "Chrysomelidae", "Staphylinidae", "Buprestidae", "Hydrophilidae", "Haliplidae", "Cantharidae", "Gyrinidae", "Elateridae", "Cryptophagidae", "Scirtidae", "Latridiidae")
#Add alignment and .fas to each family name
familyFileNames <- foreach(i=1:length(familyFileNames)) %do%
paste("Alignment", familyFileNames[[i]], ".fas", sep="")
#Send to your desired working directory
foreach(i=1:length(dnaStringSet5)) %do% writeXStringSet(dnaStringSet5[[i]], file=familyFileNames[[i]], format="fasta")
#create a list of alignment files
#Calling the alignments in alphabetical order allows for easier analysis during the NRI/NTI step
list_of_files <- c("AlignmentBuprestidae.fas", "AlignmentCantharidae.fas", "AlignmentCarabidae.fas",
"AlignmentChrysomelidae.fas", "AlignmentCoccinellidae.fas", "AlignmentCryptophagidae.fas",
"AlignmentCurculionidae.fas", "AlignmentDytiscidae.fas", "AlignmentElateridae.fas",
"AlignmentGyrinidae.fas","AlignmentHaliplidae.fas", "AlignmentHydrophilidae.fas",
"AlignmentLatridiidae.fas", "AlignmentLeiodidae.fas", "AlignmentScirtidae.fas",
"AlignmentStaphylinidae.fas")
#read the alignments into phyDat format
phylo_dat <- lapply(list_of_files, function(x){
read.phyDat(x, format="fasta", type="DNA")
})
#create distance matrices
dm <- lapply(phylo_dat, function(x){
dist.ml(x)
})
#creating NJ tree
tree <- lapply(dm, function(x){
NJ(x)
})
#run model tests
model_tests <- lapply(phylo_dat, function(x){
modelTest(x)
})
#create environments
env <- lapply(model_tests, function(x){
attr(x, "env")
})
#create function to find best model for each family
get_best_model = function(model_df){
best_model = model_df['Model'][model_df['BIC'] == min(model_df['BIC']) ]
return(best_model)
}
#create a vector containing the best models
list_of_models = unlist(lapply(model_tests, function(x){
get_best_model(x)
}))
#get parameters for each model
model_fit <- lapply(env, function(x){
eval(get(list_of_models, x),x)
})
#create vector containing inv values
inv_values <- lapply(1:length(model_fit), function(i){
model_fit[[i]]$inv
})
#compute likelihood
ml_out = lapply(1:length(tree), function(i){
pml(tree[[i]], phylo_dat[[i]], k=4, inv = inv_values[[i]])
})
#drop the suffix from each of the model names
new_list_of_models = unlist(lapply(list_of_models , function(x){unlist(strsplit(x, "\\+"))[[1]]}))
#compute likelihood and optimize parameters
ml_families = lapply(1:length(ml_out), function(i){
optim.pml(ml_out[[i]], optNni = TRUE, optGamma = TRUE, optInv = TRUE, model = new_list_of_models[[i]])
})
#Do a bootstrap analysis for each family
bs <- lapply(1:length(ml_families), function(i){
bootstrap.pml(ml_families[[i]], bs=1000, optNni=TRUE, multicore = FALSE)
})
#Step to create a maxCladeCred consensus tree
bsTree <- lapply(1:length(bs), function(i){
maxCladeCred(bs[[i]])
})
#Reorder outgroup dataframe list so it is in the same order as the trees
l_outgroups <- l_outgroups[order(names(l_outgroups))]
#Root the trees using the outgroups
Tree_Rooted <- lapply(1:length(bsTree), function(i){
root(bsTree[[3]], outgroup = l_outgroups[[3]]$bin_uri, resolve.root = TRUE)
})
#Remove the outgroup
Tree_Final <- lapply(1:length(Tree_Rooted), function(i){
drop.tip(Tree_Rooted[[i]], l_outgroups[[i]]$bin_uri)
})
#Remove unneeded variables
rm(env, tree, model_tests, model_fit, dm, binNames, familyFileNames, familyList, dfFamilyDNA, dnaStringSet4, dnaStringSet5, FamilyDNA, ml_families, ml_out)
#Part 5: NTI and NRI----
#In this sections, family matrices are created and net relatedness indces(NRI) and nearest taxon indces (NTI) are calculated. NRI and NTI calculate the pairwise distance between two species and use this to estimate the community relatedness. NRI averages the evolutionary distances between all pairs of tips in the community, while NTI takes only the distances between nearest neighbours. A presence/absence matrix is created for each family in order to see how thier distribution relates to thier phylogenetic relationships.
#Create a filter for BINs found in Churchill
ChurchillFilter <- which(dfAllSeq$bin_uri %in% dfOrder_Churchill$bin_uri)
#Create a filter for the BINs not found in Churchill
NotChurchillFilter <- which(!(dfAllSeq$bin_uri %in% dfOrder_Churchill$bin_uri))
#Apply the filters
dfFilter_Churchill <- dfAllSeq[ChurchillFilter, ]
dfFilter_NotChurchill <- dfAllSeq[NotChurchillFilter, ]
#Change to data table and set to 1 if present in Churchill and 0 if not in Churchill
dfFilter_Churchill <- as.data.table(dfFilter_Churchill)
dfFilter_Churchill <- dfFilter_Churchill[, churchill := 1]
dfFilter_NotChurchill <- as.data.table(dfFilter_NotChurchill)
dfFilter_NotChurchill <- dfFilter_NotChurchill[, churchill := 0]
#Bind the new data frames to taxalistcomplete
dfAllSeq <- rbind(dfFilter_Churchill, dfFilter_NotChurchill)
#Create a presence absence matrix for bin_uri in Churchill
#Create new data frame
dfAllSeq2 <- dfAllSeq [, c("bin_uri", "churchill", "family_name")]
#Split into family dataframes
dfAllSeq2 <- split(dfAllSeq2, list(dfAllSeq$family_name))
#Remove the family name column
dfAllSeq2 <- lapply(dfAllSeq2, function(x){
x[,-3]
})
#Create family matrices
#create family matrices
Family_matrices <- lapply(dfAllSeq2, function(x){
melt(x, id.var="churchill")
})
Family_matrices <- lapply(Family_matrices, as.data.frame)
Family_matrices <- lapply(Family_matrices, function(x){
with(x, table(churchill, value))
})
Family_matrices <- lapply(Family_matrices, unclass)
#Calculate net relatedness index (NRI) and nearest taxon index (NTI) using ML Tree
#Ensure ML tree is in correct format
phy.dist <- lapply(Tree_Final, cophenetic)
#Calculate NRI
NRI_Results = lapply(1:length(phy.dist), function(i){
ses.mpd(Family_matrices[[i]], phy.dist[[i]], null.model = "taxa.labels", abundance.weighted = FALSE, runs = 1000)
})
#Calculate NTI
NTI_Results = lapply(1:length(phy.dist), function(i){
ses.mntd(Family_matrices[[i]], phy.dist[[i]], null.model = "taxa.labels", abundance.weighted = FALSE, runs = 1000)
})
#Remove unneeded variables
rm(phy.dist, dfFilter_Churchill, dfFilter_NotChurchill, ChurchillFilter, NotChurchillFilter, dfAllSeq2, Family_matrices)
#Part 6: Trait Analysis: ANOVA----
#In this section, character matrices are loaded in and ANOVAs are performed. The ANOVA will tell us whether the phylogenetic community structure is related to the biological traits of the families. A P-value of > 0.5 indicates that families with that trait are significantly phylogenetically clustered or overdispersed.
#Read in character matrix
Coleoptera_Matrix_NRI <- read_csv(file="C:/Users/Sam/Documents/Coleoptera_Matrix_NRI.csv")
#Run ANOVAs for all traits
Coleoptera_ANOVA_NRI_Feeding_Adult <- aov(structure ~ adult_diet, data = Coleoptera_Matrix_NRI)
Coleoptera_ANOVA_NRI_Feeding_Larval <- aov(structure ~ larval_diet, data = Coleoptera_Matrix_NRI)
Coleoptera_ANOVA_NRI_Habitat <- aov(structure ~ habitat, data = Coleoptera_Matrix_NRI)
#Get ANOVA summary
summary(Coleoptera_ANOVA_NRI_Habitat)
summary(Coleoptera_ANOVA_NRI_Feeding_Adult)
summary(Coleoptera_ANOVA_NRI_Feeding_Larval)
#Repeat for NTI values
#Read in matrix
Coleoptera_Matrix_NTI <- read.csv(file="C:/Users/Sam/Documents/Coleoptera_Matrix_NTI.csv")
#Run ANOVAs for both traits
Coleoptera_ANOVA_NTI_Habitat <- aov(structure ~ habitat, data = Coleoptera_Matrix_NTI)
Coleoptera_ANOVA_NTI_Feeding_Adult <- aov(structure ~ adult_diet, data = Coleoptera_Matrix_NTI)
Coleoptera_ANOVA_NTI_Feeding_Larval <- aov(structure ~ larval_diet, data = Coleoptera_Matrix_NTI)
#Get ANOVA summary
summary(Coleoptera_ANOVA_NTI_Habitat)
summary(Coleoptera_ANOVA_NTI_Feeding_Adult)
summary(Coleoptera_ANOVA_NTI_Feeding_Larval)
#Part 7: Trait Analysis: PGLS----
#In this section, the phylogenetic tree created based on the literature is read in and phylogenetic generalized least squares analyses are performed.
#The PGLS will tell us whether the phylogenetic community structure is related to the biological traits of the families, while taking into account the phylogenetic relationships. A P-value of > 0.5 indicates that families with that trait are significantly phylogenetically clustered or overdispersed.
#Read in matrix
PGLSdata_NRI <- read.csv("Coleoptera_Matrix_NRI.csv")
#Read in tree
PGLStree <- read.nexus("Coleoptera Tree 2020")
#Set branch lengths to one
PGLStree$edge.length <- replicate((length(PGLStree$edge[, 1])), 1)
PGLStree <- force.ultrametric(PGLStree, method="extend")
#Set the row names to family names
PGLSdata_NRI <- PGLSdata_NRI %>%
column_to_rownames(var = 'family_name')
#Make sure tree and dataframe are in the same order
PGLSdata_NRI <- PGLSdata_NRI[match(PGLStree$tip.label, rownames(PGLSdata_NRI)), ]
#Run PGLS analysis
pglsModel_NRI_Habitat <- gls(structure ~ habitat, correlation = corBrownian(phy = PGLStree), data = PGLSdata_NRI, method = "ML")
pglsModel_NRI_Adult_Diet <- gls(structure ~ adult_diet, correlation = corBrownian(phy = PGLStree), data = PGLSdata_NRI, method = "ML")
pglsModel_NRI_Larval_Diet <- gls(structure ~ larval_diet, correlation = corBrownian(phy = PGLStree), data = PGLSdata_NRI, method = "ML")
#Get PGLS summary
summary(pglsModel_NRI_Habitat)
summary(pglsModel_NRI_Adult_Diet)
summary(pglsModel_NRI_Larval_Diet)
#Create boxplots for traits vs. clustering matrix
plot_Habitat_NRI <- boxplot(PGLSdata_NRI$structure ~ PGLSdata_NRI$habitat)
plot_Adult_Diet_NRI <- boxplot(PGLSdata_NRI$structure ~ PGLSdata_NRI$adult_diet)
plot_Larval_Diet_NRI <- boxplot(PGLSdata_NRI$structure ~ PGLSdata_NRI$larval_diet)
#Repeat for NTI
PGLSdata_NTI <- read.csv("Coleoptera_Matrix_NTI.csv")
#Set row names to family names
PGLSdata_NTI <- PGLSdata_NTI %>%
column_to_rownames(var = 'family_name')
#Make sure tree and dataframe are in the same order
PGLSdata_NTI <- PGLSdata_NTI[match(PGLStree$tip.label, rownames(PGLSdata_NTI)), ]
#Run PGLS analysis
pglsModel_NTI_Habitat <- gls(structure ~ habitat, correlation = corBrownian(phy = PGLStree), data = PGLSdata_NTI, method = "ML")
pglsModel_NTI_Adult_Diet <- gls(structure ~ adult_diet, correlation = corBrownian(phy = PGLStree), data = PGLSdata_NTI, method = "ML")
pglsModel_NTI_Larval_Diet <- gls(structure ~ larval_diet, correlation = corBrownian(phy = PGLStree), data = PGLSdata_NTI, method = "ML")
#Get PGLS summary
summary(pglsModel_NTI_Habitat)
summary(pglsModel_NTI_Adult_Diet)
summary(pglsModel_NTI_Larval_Diet)
#Create boxplots for traits vs. clustering matrix
plot_Habitat_NTI <- boxplot(PGLSdata_NTI$structure ~ PGLSdata_NTI$habitat)
plot_Adult_Diet_NTI <- boxplot(PGLSdata_NTI$structure ~ PGLSdata_NTI$adult_diet)
plot_Larval_Diet_NTI <- boxplot(PGLSdata_NTI$structure ~ PGLSdata_NTI$larval_diet)