Load the libraries.
library(ape)
library(phangorn)
Load in the sequence data and the neighbour joining tree.
myalignment <- read.dna("ray2000_aligned.fas",format="fasta",as.matrix=TRUE)
mytree <- nj(dist.dna(myalignment,"TN93"))
Now compare different models.
myalignment.phydat <- as.phyDat(myalignment) # Convert to a format modeltest understand
myalignment.modeltest <- modelTest(myalignment.phydat,
tree=mytree,
model = c("JC", "F81", "K80", "HKY", "SYM", "GTR"),
G = TRUE,
I = TRUE,
k = 4,
control = pml.control(epsilon = 1e-08, maxit = 3, trace = 1),
multicore = FALSE)
## Warning in pml(tree, data): negative edges length changed to 0!
## [1] "JC+I"
## [1] "JC+G"
## [1] "JC+G+I"
## [1] "F81+I"
## [1] "F81+G"
## [1] "F81+G+I"
## [1] "K80+I"
## [1] "K80+G"
## [1] "K80+G+I"
## [1] "HKY+I"
## [1] "HKY+G"
## [1] "HKY+G+I"
## [1] "SYM+I"
## [1] "SYM+G"
## [1] "SYM+G+I"
## [1] "GTR+I"
## [1] "GTR+G"
## [1] "GTR+G+I"
myalignment.modeltest
## Model df logLik AIC BIC
## 1 JC 139 -9093.606 18465.21 19023.80
## 2 JC+I 140 -8618.051 17516.10 18078.71
## 3 JC+G 140 -8368.362 17016.72 17579.33
## 4 JC+G+I 141 -8351.732 16985.46 17552.09
## 5 F81 142 -9005.856 18295.71 18866.35
## 6 F81+I 143 -8525.408 17336.82 17911.47
## 7 F81+G 143 -8251.858 16789.72 17364.38
## 8 F81+G+I 144 -8238.720 16765.44 17344.12
## 9 K80 140 -8482.996 17245.99 17808.60
## 10 K80+I 141 -7980.468 16242.94 16809.56
## 11 K80+G 141 -7685.580 15653.16 16219.78
## 12 K80+G+I 142 -7667.595 15619.19 16189.83
## 13 HKY 143 -8440.617 17167.23 17741.89
## 14 HKY+I 144 -7916.114 16120.23 16698.91
## 15 HKY+G 144 -7636.388 15560.78 16139.45
## 16 HKY+G+I 145 -7616.750 15523.50 16106.20
## 17 SYM 144 -8445.450 17178.90 17757.58
## 18 SYM+I 145 -7943.376 16176.75 16759.45
## 19 SYM+G 145 -7649.702 15589.40 16172.10
## 20 SYM+G+I 146 -7631.733 15555.47 16142.18
## 21 GTR 147 -8436.124 17166.25 17756.98
## 22 GTR+I 148 -7903.426 16102.85 16697.60
## 23 GTR+G 148 -7622.485 15540.97 16135.72
## 24 GTR+G+I 149 -7601.719 15501.44 16100.21
Using R, we can look for the model e.g. with the lowest AIC.
myalignment.modeltest$Model[myalignment.modeltest$AIC==min(myalignment.modeltest$AIC)]
## [1] "GTR+G+I"
Now obtain a maximum likelihood tree, starting from the neighbour joining tree, and using the model chosen from modelTest
.
myalignment.pml <- pml(mytree,myalignment.phydat,model="GTR+I+G",k=4)
## Warning in pml(mytree, myalignment.phydat, model = "GTR+I+G", k = 4):
## negative edges length changed to 0!
# optimise
myalignment.pml <- optim.pml(myalignment.pml,optNni=TRUE,optBf=TRUE,optQ=TRUE,optInv=TRUE,optGamma=TRUE,optEdge=TRUE)
## optimize edge weights: -8617.809 --> -8415.953
## optimize base frequencies: -8415.953 --> -8306.962
## optimize rate matrix: -8306.962 --> -7739.162
## optimize invariant sites: -7739.162 --> -7662.088
## optimize shape parameter: -7662.088 --> -7662.088
## optimize edge weights: -7662.088 --> -7637.825
## optimize topology: -7637.825 --> -7621.82
## optimize topology: -7621.82 --> -7617.04
## optimize topology: -7617.04 --> -7606.44
## 20
## optimize base frequencies: -7606.44 --> -7590.913
## optimize rate matrix: -7590.913 --> -7579.069
## optimize invariant sites: -7579.069 --> -7579.068
## optimize shape parameter: -7579.068 --> -7578.92
## optimize edge weights: -7578.92 --> -7577.282
## optimize topology: -7577.282 --> -7565.306
## optimize topology: -7565.306 --> -7557.973
## optimize topology: -7557.973 --> -7554.848
## 15
## optimize base frequencies: -7554.848 --> -7551.863
## optimize rate matrix: -7551.863 --> -7550.22
## optimize invariant sites: -7550.22 --> -7550.168
## optimize shape parameter: -7550.168 --> -7550.117
## optimize edge weights: -7550.117 --> -7549.979
## optimize topology: -7549.979 --> -7542.122
## optimize topology: -7542.122 --> -7541.302
## optimize topology: -7541.302 --> -7541.302
## 6
## optimize base frequencies: -7541.302 --> -7540.721
## optimize rate matrix: -7540.721 --> -7540.306
## optimize invariant sites: -7540.306 --> -7540.295
## optimize shape parameter: -7540.295 --> -7540.288
## optimize edge weights: -7540.288 --> -7540.272
## optimize topology: -7540.272 --> -7540.272
## 0
## optimize base frequencies: -7540.272 --> -7540.091
## optimize rate matrix: -7540.091 --> -7539.995
## optimize invariant sites: -7539.995 --> -7539.994
## optimize shape parameter: -7539.994 --> -7539.992
## optimize edge weights: -7539.992 --> -7539.991
## optimize base frequencies: -7539.991 --> -7539.936
## optimize rate matrix: -7539.936 --> -7539.905
## optimize invariant sites: -7539.905 --> -7539.904
## optimize shape parameter: -7539.904 --> -7539.904
## optimize edge weights: -7539.904 --> -7539.904
## optimize base frequencies: -7539.904 --> -7539.887
## optimize rate matrix: -7539.887 --> -7539.878
## optimize invariant sites: -7539.878 --> -7539.878
## optimize shape parameter: -7539.878 --> -7539.878
## optimize edge weights: -7539.878 --> -7539.877
## optimize base frequencies: -7539.877 --> -7539.872
## optimize rate matrix: -7539.872 --> -7539.869
## optimize invariant sites: -7539.869 --> -7539.869
## optimize shape parameter: -7539.869 --> -7539.869
## optimize edge weights: -7539.869 --> -7539.869
## optimize base frequencies: -7539.869 --> -7539.868
## optimize rate matrix: -7539.868 --> -7539.867
## optimize invariant sites: -7539.867 --> -7539.867
## optimize shape parameter: -7539.867 --> -7539.867
## optimize edge weights: -7539.867 --> -7539.867
## optimize base frequencies: -7539.867 --> -7539.867
## optimize rate matrix: -7539.867 --> -7539.866
## optimize invariant sites: -7539.866 --> -7539.866
## optimize shape parameter: -7539.866 --> -7539.866
## optimize edge weights: -7539.866 --> -7539.866
## optimize base frequencies: -7539.866 --> -7539.866
## optimize rate matrix: -7539.866 --> -7539.866
## optimize invariant sites: -7539.866 --> -7539.866
## optimize shape parameter: -7539.866 --> -7539.866
## optimize edge weights: -7539.866 --> -7539.866
## optimize base frequencies: -7539.866 --> -7539.866
## optimize rate matrix: -7539.866 --> -7539.866
## optimize invariant sites: -7539.866 --> -7539.866
## optimize shape parameter: -7539.866 --> -7539.866
## optimize edge weights: -7539.866 --> -7539.866
# display
myalignment.pml
##
## loglikelihood: -7539.866
##
## unconstrained loglikelihood: -1990.566
## Proportion of invariant sites: 0.270931
## Discrete gamma model
## Number of rate categories: 4
## Shape parameter: 0.905875
##
## Rate matrix:
## a c g t
## a 0.000000 1.3729288 8.0371277 1.684299
## c 1.372929 0.0000000 0.7245706 11.250734
## g 8.037128 0.7245706 0.0000000 1.000000
## t 1.684299 11.2507341 1.0000000 0.000000
##
## Base frequencies:
## 0.1764514 0.3412765 0.2521093 0.2301627
This shows the maximum likelihood parameter values. The ML tree is contained in the fit as well.
myalignment.pml$tree
##
## Phylogenetic tree with 71 tips and 69 internal nodes.
##
## Tip labels:
## AF271819, AF271820, AF271821, AF271823, AF271824, AF271822, ...
##
## Unrooted; includes branch lengths.
We can compare our original tree and our new tree numerically using treedist
.
treedist(myalignment.pml$tree,mytree)
## symmetric.difference branch.score.difference
## 56.000000 1.024206
## path.difference quadratic.path.difference
## 153.701008 42.384989
This isn't particularly helpful. A comparison of edge lengths is quite striking.
sum(myalignment.pml$tree$edge.length)
## [1] 7.13001
sum(mytree$edge.length)
## [1] 3.115848
Now we can perform a bootstrap, starting with our 'best' maximum likelihood tree, and performing nearest neighbour interchanges.
myalignment.pml.bs <- bootstrap.pml(myalignment.pml,bs=100,trees=TRUE,optNni=TRUE)
As with the neighbour-joining tree, we can overlay bootstrap supports on our maximum likelihood tree.
plotBS(myalignment.pml$tree,myalignment.pml.bs,type="phylogram",cex=0.5)