forked from tbegum/Testing_the_ortholog_conjecture
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathManuscript.Rmd
1513 lines (1099 loc) · 110 KB
/
Manuscript.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
---
title:
author: Tina & Marc
date: "`r Sys.Date()`"
output:
pdf_document: default
html_notebook:
fig_caption: yes
keep_tex: yes
highlight: espresso
number_sections: no
html_document:
df_print: paged
word_document:
reference_docx: Manuscript.docx
urlcolor: blue
fontsize: 10pt
geometry: margin=1in
editor_options:
chunk_output_type: console
header-includes:
- \usepackage{caption}
- \captionsetup[figure]{labelformat=empty}
- \usepackage{float}
- \usepackage{color}
linestretch: 1
bibliography: TM.bib
csl: plos.csl
---
<style>
body {
text-align: justify}
</style>
```{r global_options, include=FALSE}
knitr::opts_chunk$set(fig.pos = 'H')
```
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = FALSE,message=FALSE, warning=FALSE, dev="png",dpi = 400,cache = TRUE)
```
```{r Initial settings, echo=F, message=F, warning=F}
# The following libraries should be installed from github with the specified
# devtools command. One needs to install devtools first.
library(treeio)#devtools::install_github("GuangchuangYu/treeio")
library(ggtree)#devtools::install_github("GuangchuangYu/ggtree")
library(easyGgplot2) #install_github("kassambara/easyGgplot2")
# The following libraries can be installed from CRAN by using
#install.packages("packagename")
library(geiger)
library(ape)
library(phytools)
library(stringr)
library(ggrepel)
library(dplyr)
library(ggplot2)
library(gridExtra)
library(parallel)
library(digest)
library(magrittr)
library(gtools)
library(tidyverse)
library(caper)
library(foreach)
library(doParallel)
library(cowplot)
library(png)
library(grid)
## Setting the working directory
set.seed(23456)
## Renamed original function file provided by Dunn et al. (doi:10.1073/pnas.1707515115)
source("functions_Dunn.R")
## Additional functions written for this study
source( "function_TM_new.R" )
## Setting system computational parameter
cores = detectCores()
```
\vspace{0.5cm}
# **Phylogenetic comparative methods are problematic when applied to gene trees with speciation and duplication nodes: correcting for biases in testing the ortholog conjecture**
\vspace{0.5cm}
Tina Begum^1,2^, Marc Robinson-Rechavi^1,2^*
\vspace{0.5cm}
^1^Department of Ecology and Evolution, University of Lausanne, Lausanne, Switzerland<br />
^2^SIB Swiss Institute of Bioinformatics, Lausanne, Switzerland <br />
\vspace{0.5cm}
\*Corresponding author
E-mail: [email protected]
### Abstract
Most comparative studies of functional genomics have used pairwise comparisons. Yet it has been shown that this can provide biased results, since genes, like species, are phylogenetically related. Phylogenetic comparative methods should allow to correct for this, but they depend on strong assumptions, including unbiased tree estimates relative to the hypothesis being tested. Such methods have recently been used to test the "Ortholog Conjecture", the hypothesis that functional evolution is faster in paralogs than orthologs. Whereas pairwise comparisons of tissue specificity index ($\tau$) provided support for the ortholog conjecture, it was not supported using phylogenetic independent contrasts. We find that the gene trees used suffer from important biases, due to the inclusion of trees with no duplication nodes, to the relative age of speciations and duplications, to systematic differences in branch lengths, and to non-Brownian motion of tissue-specificity on many trees. We find some support for the ortholog conjecture, but especially that incorrect implementation of phylogenetic method in empirical gene trees with duplications can be problematic.
### Author Summary
There is a lot of interest in understanding the evolution of gene function. Comparing functional genomics results offers a way to do this. In most cases, it is done by comparing pairs of genes, and notably pairs of orthologs (homologs derived by speciation) and pairs of paralogs (homologs derived by duplication). A drawback of such pairwise comparisons is that they neglect the evolutionary history of the genes, and thus violate the statistical presumption of independence of observations. Phylogenetic comparative methods have been suggested to correct for this. Recent results indicate that whereas pairwise comparisons support stronger functional divergence of paralogs than of orthologs, phylogenetic analyses would not. Re-analyses of these data show that gene trees are biased in such a way as to cause biases in phylogenetic methods when there are gene duplication. This can led to erroneous results, and have serious consequences in biological data interpretation.
\pagebreak
### Introduction
The "Ortholog Conjecture", a cornerstone of phylogenomics, has become a topic of debate in recent years [@RN771;@RN774;@RN790;@RN793;@RN794;@RN762;@RN760;@RN770;@RN792]. Dealing with the roles of gene duplications in functional evolution, the ortholog conjecture is routinely used by both experimental and computational biologists in predicting, or understanding gene function. According to this model, orthologs (i.e. homologous genes which diverged by a speciation event) retain equivalent or very similar functions, whereas paralogs (i.e. homologous genes which diverged by a duplication event) share less similar functions [@RN771]. This is linked to the hypothesis that paralogs evolve more rapidly. This hypothesis was challenged by results suggesting that paralogs would be functionally more similar than orthologs [@RN774]. Such findings not only raised questions on the evolutionary role of gene duplication but also questioned the reliability of using orthologs to annotate unknown gene functions in different species [@RN844]. Several more recent studies [@RN770;@RN790;@RN793;@RN794;@RN762] found support for ortholog conjecture, mostly based on comparisons of gene expression data.
While all previous studies of the ortholog conjecture had used pairwise comparisons of orthologs and paralogs, a recent article suggested that this was flawed, and that phylogenetic comparative methods should be used [@RN760]. In the field of comparative biology, most of the studies still rely on the pairwise method including the testing of the ortholog conjecture. Ignorance of phylogenetic structure has been reported to underestimate the fundamental assumption of independent observations in statistics [@RN773;@RN807]. A solution is to use phylogeny-based methods to investigate such evolutionary patterns [@RN782;@RN807;@RN779;@RN780;@RN773;@RN850;@RN849]. There are three main such phylogenetic methods: Phylogenetic Independent Contrast (PIC) [@RN773], Phylogenetic Generalized Least-Square (PGLS) [@RN850], and Monte Carlo computer simulation [@RN849]. PIC is widely adopted for its relative simplicity, and its applicability to a wide range of statistical procedures [@RN781;@RN760]. The performance of PIC relies on three basic assumptions: a correct tree topology; accurate branch lengths; and trait evolution following a Brownian model (where trait variance accrues as a linear function of time) [@RN781;@RN782;@RN783;@RN807;@RN780;@RN779;@RN816]. If any of these assumptions is incorrect, this can lead to incorrect interpretation of results. This is probably why the application of such phylogenetic methods is still limited, and debated even after being introduced about three decades ago [@RN781;@RN782;@RN783;@RN807;@RN780]. Dunn et al. [@RN760] took an innovative approach in applying PIC to compare the divergence rates between two different events ("speciation" and "duplication") to test the ortholog conjecture. However, such an application might be problematic since the time of occurrence of gene duplication, one of the two types of events compared, is unknowable by external information (e.g. no fossil evidence). Therefore, further study is required to understand why Dunn et al. [@RN760] obtained results which are inconsistent with most studies by using the phylogenetic method. It is possible that all the conclusions drawn by previous studies on gene duplication are incorrect due to overlooking phylogenetic tree structure. If so, it should be well supported.
We re-examined the data of Dunn et al., after reproducing it using the resources and scripts provided by the authors [@RN760]. Our reanalysis highlights potential problems associated with phylogenetic independent contrasts when applied to the impact of gene duplication. Finally, with proper controls, the phylogenetic method supports the ortholog conjecture.
### Results
```{r load_Dunn_data, echo=F, cache=T, message=F, warning=F}
## Loaded the analysis results generated by using files and scripts ("manuscript_kernel.R") of Dunn et al.
## Files and scripts are used from the link https://github.com/caseywdunn/comparative_expression_2017
## We provided the following link https://doi.org/10.5281/zenodo.3354285 to make available the reproduced results of Dunn et al. to save time
load("manuscript_dunn.RData")
## Obtaining calibrated trees with no duplication event
gene_trees_with_duplication1<-mclapply(gene_trees_calibrated,tree_with_duplication, mc.cores = cores)
gene_trees_with_duplication<-gene_trees_with_duplication1[!is.na(gene_trees_with_duplication1)]
trees_with_no_duplication <- length(gene_trees_calibrated) - length(gene_trees_with_duplication)
```
__Issues with naive use of Phylogenetic Independent Contrasts (PICs)__
```{r reanalyses_Dunn_data,echo=F, message=F, warning=F}
## Dunn et al. replaced the actual Tau values with simulated Tau values (for null and OC simulation) on the calibrated trees
## and returned as treeio::treedata objects, and we used those data for our reanalyses
## Creating dataframes of null and OC (Ortholog Conjecture) simulation data using columns of events and PICs
null_data.dunn <- NULL
OC_data.dunn <- NULL
null_data.dunn <- na.omit(nodes_sim_null_contrast[c(7,21)])
null_data.dunn$label <- "Null simulation"
OC_data.dunn <- na.omit(nodes_sim_oc_contrast[c(7,21)])
OC_data.dunn$label <- "OC simulation"
## Retrieving speciation and duplication PICs of null and OC simulation data of Dunn et al.
speciation_null <- null_data.dunn$pic[which(null_data.dunn$Event=="Speciation")]
duplication_null <- null_data.dunn$pic[which(null_data.dunn$Event=="Duplication")]
speciation_oc <- OC_data.dunn$pic[which(OC_data.dunn$Event=="Speciation")]
duplication_oc <- OC_data.dunn$pic[which(OC_data.dunn$Event=="Duplication")]
## Performing Wilcoxon tests on null simulation data
wilcox_one_tailed1 <- wilcox.test(duplication_null,speciation_null,alternative = "greater")$p.value
wilcox_one_tailed2 <- wilcox.test(speciation_null,duplication_null,alternative = "greater")$p.value
wilcox_two_tailed_null <- two_tailed_wilcox(speciation_null,duplication_null)
## Performing two sided Wilcoxon test on OC simulation data
wilcox_two_tailed_oc <- two_tailed_wilcox(speciation_oc,duplication_oc)
```
To understand their results, we first reanalyzed the data of Dunn et al. [@RN760]. We were able to reproduce all the results published in their article by running the code, which they clearly supplied. Dunn et al. reported a non-significant result for the PIC under the null simulation, using a Wilcoxon one-tailed rank test to check if the contrasts of duplication events are higher than the contrasts of speciation events (*P* = $`r format(wilcox_one_tailed1, digits= 3, scientific = FALSE)`$). Surprisingly, the PIC rejects the null hypothesis on the null simulations with a Wilcoxon two-tailed rank test (Fig 1A), with significant support for higher contrasts after speciation than duplication. This was robust to repeating the simulations with different random seed number (S1 Fig). This indicates that neither of the methods, PIC or pairwise, worked properly for these calibrated trees, since both rejects the null when simulations are performed under null.
PIC is standardized by the expected variance of daughter branch lengths for each node of event [@RN807;@RN782;@RN779;@RN780]. Therefore, accurate branch length estimation in absolute time (e.g. in Million Years -- My) is needed [@RN781;@RN782;@RN807;@RN779;@RN780]. Since the accuracy and performance of the PIC method largely depend on proper branch length calibration [@RN779;@RN768;@RN782] we investigated possible biases created during calibration of gene trees, especially concerning duplication times for which we have no external reference (e.g. no fossils) (S2 Fig).
```{r phylogenetic_analyses_trees_with_strong_signals, eval=T, echo=F, message=F, warning=F}
## Identifying trees with strong phylogenetic signals using the criteria (Blomberg's K > 0.551) as specificed by Dunn et al. in their paper
## We make a vector to collect calibrated tree index with Blomberg's K > 0.551
tree_index <- as.numeric(row.names(tree_summary)[(!(is.na(tree_summary$K)) & (tree_summary$K > 0.551))])
trees_new <- gene_trees_calibrated[c(tree_index)] ## List of trees with strong phylogenetic signals
subset.trees <- length(trees_new)
## Counting trees with low phylogenetic signals
count_trees <- (length(gene_trees_calibrated)-length(trees_new))/length(gene_trees_calibrated)
count_trees <- paste(round(count_trees*100, 2), "%", sep="")
## Collecting null simulation trees associated with strong phylogenetic signals
trees_new_sim_null <- gene_trees_sim_null[c(tree_index)]
## Making a dataframe by combining data slots of all these trees
new_sim_null_contrast <- bind_rows(lapply(trees_new_sim_null,summary_function))
## Collecting OC simulation trees associated with strong phylogenetic signals to make a dataframe with combined data slots for all trees
trees_new_sim_oc <- gene_trees_sim_oc[c(tree_index)]
new_sim_oc_contrast <- bind_rows(lapply(trees_new_sim_oc,summary_function))
## Extracting the columns of event and PIC data from null and OC simulation result of trees with strong phylogenetic signals
null_data.new <- na.omit(new_sim_null_contrast[c(6,20)])
null_data.new$label <- "Null simulation"
OC_data.new <- na.omit(new_sim_oc_contrast[c(6,20)])
OC_data.new$label <- "OC simulation"
## Obtaining speciation and duplication PICs from null and OC simulation of trees with strong phylogenetic signals
speciation_null_new <- null_data.new$pic[which(null_data.new$Event=="Speciation")]
duplication_null_new <- null_data.new$pic[which(null_data.new$Event=="Duplication")]
speciation_oc_new <- OC_data.new$pic[which(OC_data.new$Event=="Speciation")]
duplication_oc_new <- OC_data.new$pic[which(OC_data.new$Event=="Duplication")]
## Performing Wilcoxon tests on null and OC simulation data of trees with strong phylogenetic signals
wilcox_1_tailed1_new <- one_tailed_wilcox(duplication_null_new,speciation_null_new)
wilcox_1_tailed2_new <- one_tailed_wilcox(speciation_null_new,duplication_null_new)
wilcox_2_tailed_null_new <- two_tailed_wilcox(speciation_null_new,duplication_null_new)
wilcox_2_tailed_oc_new <- two_tailed_wilcox(speciation_oc_new,duplication_oc_new)
```
```{r phylogenetic_signal,eval=T, echo=F, message=F, warning=F}
## Identifying time calibrated trees with negative branch lengths
trees_negative <- negative_edgelength(gene_trees_calibrated)
## List of calibrated trees without negative branch lengths
gene_trees_pic1 <- gene_trees_pic[-trees_negative]
## Computing phylogenetic signal for list of trees without negative branch lengths
## Trees for which phylogenetic signal estimation failed, we assigned a random value 1000 to distinguish it from actual estimated values
signal_tree <- bind_rows(lapply(gene_trees_pic1,Phylogenetic_signal))
signal_tree$tree_number <- as.numeric(rownames(signal_tree))
## Dataframe with Blomberg's K
signal_tree_new <- signal_tree[which(signal_tree$K_effecient<1000),]
phylosig_blomberg <- signal_tree_new[which(signal_tree_new$Pvalue_Blomberg<0.05),]
phylosig_blomberg <- phylosig_blomberg[c(1:3,6)]
## Dataframe with Pagel's lamba
signal_tree_new2 <- signal_tree[which(signal_tree$lambda_estimate<1000),]
phylosig_pagel <- signal_tree_new2[which(signal_tree_new2$Pvalue_pagel<0.05),]
phylosig_pagel <- phylosig_pagel[c(1,4:6)]
```
The purpose of applying a phylogenetic method is to generate independent statistical data points for traits, which are otherwise not independent enough because of their phylogenetic relatedness. Such statistical non-independence among species trait values can be measured by "phylogenetic signal" [@RN819; @RN768; @RN800; @RN820;@RN826]. Blomberg's K and Pagel's $\lambda$ are widely used methods to capture such phylogenetic signal [@RN819;@RN768]. Dunn et al. [@RN760] used Blomberg's K in their study. In case of Blomberg's K the value ranges from 0 to $\infty$, where a value of 0 indicates no phylogenetic signal, a value of 1 indicates trait evolution according to a Brownian model (BM), values between 0 and 1 indicate that distant relatives are more similar than expected under BM, and values larger than 1 indicate that close relatives are more similar than expected under BM [@RN819; @RN768; @RN800; @RN820;@RN826]. With a cutoff of K > 0.551, Dunn et al. [@RN760] obtained only `r subset.trees` out of `r length(gene_trees_calibrated)` calibrated trees with strong phylogenetic signal. Using a cut-off of *P* < 0.05 together with K > 0.551 leads to 1135 trees, which produce similar results (S3 Fig) to that set of `r subset.trees` trees. We continued analyses with the `r subset.trees` trees of Dunn et al. [@RN760] for consistency. In any case, it is notable that trait ($\tau$) values are independent of phylogeny for the majority of the gene trees. For these `r (length(gene_trees_calibrated)-subset.trees)` trees, implementation of PIC to produce statistically independent data points might not be necessary. Moreover, it can be misleading if the contrasts are not checked for adequate standardization as per BM after applying PIC (discussed later). The phylogenetic method still rejects the null hypothesis under null simulations for those `r subset.trees` trees (Fig 1B), showing that the problem is not simply due to low phylogenetic signal.
```{r Fig1, dpi=400,fig.width=8, fig.height=3.5, fig.cap="**Fig 1**: Reanalyses of phylogenetic simulation data of Dunn et al. [@RN760]. *P* values are from Wilcoxon two-tailed tests. In null simulations, there should be no difference in contrasts between events. In OC (Ortholog Conjecture) simulations, contrasts are expected to be higher for duplication than for speciation. (A) An excess of contrasts for speciation than duplication reject the null hypothesis under null simulation scenario for all empirical time calibrated gene trees. (B) Results are similar with a subset of trees with strong phylogenetic signal for $\\tau$."}
## To make the plot, we made a single dataframe of both simulation data of all calibrated gene trees
simulation.df <- rbind(null_data.dunn, OC_data.dunn)
all_trees <- paste0("A. Plots on ",length(gene_trees_calibrated), " trees", sep="")
## Dataframe is built based on 2082 trees with strong phylogenetic signals
simulation.new <- rbind(null_data.new, OC_data.new)
subset_trees <- paste0("B. Plots on ",subset.trees, " trees", sep="")
## Generating plot on simulation data for all 8520 calibrated trees
dodge <- position_dodge(width = 0.51)
plotA <- ggplot(simulation.df,aes(x=label, y=pic, fill=Event)) +
geom_boxplot(width=0.5,outlier.colour=NA, position=dodge, notch=T) +
xlab(NULL) +
ylab(expression(bold("Phylogenetic Independent Contrasts (PICs)"))) +
coord_cartesian(ylim=c(0, 0.05)) +
theme_classic()+
theme(legend.title=element_blank(),legend.position=c(0.9,0.9)) +
theme(legend.position="none")+
theme(axis.text.y = element_text(size=10)) +
theme(axis.text.x = element_text(size=10,face="bold")) +
ggtitle(paste0(all_trees))+
theme(plot.title = element_text(face = "bold"))+
annotate("text", x=1.04, y=0.035, label=wilcox_two_tailed_null, fontface=4) +
annotate("text", x=1.98, y=0.035, label=wilcox_two_tailed_oc, fontface=4)
## Generating plot on simulation data for all 2082 calibrated trees
plotB <- ggplot(simulation.new,aes(x=label, y=pic, fill=Event)) +
geom_boxplot(width=0.5,outlier.colour=NA, position=dodge, notch=T) +
xlab(NULL) +
ylab(NULL) +
coord_cartesian(ylim=c(0, 0.05)) +
theme_classic()+
theme(legend.title=element_blank(),legend.position=c(0.9,0.9),legend.text=element_text(size=8)) +
theme(axis.text.y= element_text(size=10)) +
theme(axis.text.x = element_text(size=10,face="bold"))+
ggtitle(paste0(subset_trees))+
theme(plot.title = element_text(face = "bold"))+
annotate("text", x = 1.04, y = 0.035, label= wilcox_2_tailed_null_new, fontface=4) +
annotate("text", x = 1.98, y = 0.035, label= wilcox_2_tailed_oc_new, fontface=4)
grid.arrange(plotA,plotB,ncol = 2)
```
```{r Detailed_analyses_on_empirical_data, echo=F, message=F, warning=F}
## Obtaining calibration time points for speciation and duplication events for empirical data
speciation.empirical <- unique(round(as.integer(nodes_contrast$node_age[which(nodes_contrast$Event == "Speciation")]), 0))
duplication.empirical <- unique(round(as.integer(nodes_contrast$node_age[which(nodes_contrast$Event == "Duplication")]), 0))
## Expected variance of events
## We used a cutoff of pic < 0.5 to get rid of branch length underestimation bias as used by Dunn et al.
Exp_var.speciation <- nodes_contrast$var_exp[nodes_contrast$Event=="Speciation" & nodes_contrast$pic < 0.5]
Exp_var.duplication <- nodes_contrast$var_exp[nodes_contrast$Event=="Duplication" & nodes_contrast$pic < 0.5]
Exp_var.duplication.precede.speciation <- nodes_contrast$var_exp[nodes_contrast$Event=="Duplication" & nodes_contrast$node_age > 296 & nodes_contrast$pic < 0.5] # Expected variance of duplication events preceding oldest speciation event of the trees
## Performing test on expected variance
Exp_var.pvalue <- wilcox.test(Exp_var.duplication.precede.speciation,Exp_var.speciation,alternative="two.sided")$p.value
if(Exp_var.pvalue == 0) Exp_var.pvalue1 <- 2.2e-16
if(Exp_var.pvalue != 0) Exp_var.pvalue1 <- Exp_var.pvalue
## Exptracting data of trees with strong phylogenetic signals for expected variance analyses
trees_new_pic <- gene_trees_pic[c(tree_index)]
trees_new_contrast <-bind_rows(lapply(trees_new_pic,summary_function))
trees_new_contrast <-trees_new_contrast[which(!is.na(trees_new_contrast$var_exp)),]
Exp_var.speciation_subset <- na.omit(trees_new_contrast$var_exp[trees_new_contrast$Event=="Speciation" & trees_new_contrast$pic < 0.5])## all speciation events
Exp_var.duplication_subset <- na.omit(trees_new_contrast$var_exp[trees_new_contrast$Event=="Duplication" & trees_new_contrast$pic < 0.5]) ## all duplication events
## Making a dataframe of expected variance for trees with strong phylogenetic signals
check_df <- data.frame(exp_var=NA,Event=NA)
check_df <- rbind(check_df, data.frame(exp_var=Exp_var.speciation_subset,Event="Speciation"))
check_df <- rbind(check_df, data.frame(exp_var=Exp_var.duplication_subset,Event="Duplication"))
check_df <- check_df[-1,]
check_df$Event <- factor(check_df$Event, levels=c("Speciation", "Duplication"))
## Making a dataframe of expected variance distribution of all 8520 calibrated trees
check_df1 <- data.frame(exp_var=NA,Event=NA)
check_df1 <- rbind(check_df1, data.frame(exp_var=Exp_var.speciation,Event="Speciation"))
check_df1 <- rbind(check_df1, data.frame(exp_var=Exp_var.duplication,Event="Duplication"))
check_df1 <- check_df1[-1,]
check_df1$Event <- factor(check_df1$Event, levels=c("Speciation", "Duplication"))
```
Dunn et al. [@RN760] used `r nrow(calibration_times_focal)` speciation time points to calibrate gene trees. The oldest speciation calibration was at `r max(calibration_times_focal$age)` My. All other calibrated nodes were duplication nodes leading to obtain ~`r length(duplication.empirical)` unique duplication time points for the `r length(gene_trees_calibrated)` calibrated trees [@RN760]. Out of these, `r length(duplication.empirical[duplication.empirical > max(speciation.empirical)])` time points preceded the oldest speciation node age. Surprisingly, the calibrated node age of the oldest duplication event was found to be `r as.integer(max(duplication.empirical))` My, that is, 2600 times older than the Earth. This is indicative of the difficulty of estimating the age of ancient duplications by phylogenetic methods. Those `r length(duplication.empirical[duplication.empirical > max(speciation.empirical)])` high duplication node ages eventually led to much larger expected variances for gene duplication events (median expected variance for duplication events preceding the oldest speciation events: `r round(median(Exp_var.duplication.precede.speciation),0)`, median expected variance for speciation events: `r round(median(Exp_var.speciation),0)`). This explains why the mean paralog distance was ~`r round((paralog_distance_mean/ortholog_distance_mean),1)` times higher than that of the mean ortholog distance (S4A Fig). This data distribution also makes it appear as if no speciation happened before `r max(speciation.empirical)` My, a problem shared with the pairwise analysis of the same data [@RN762]. There are also branch length issues for duplication events younger than the oldest speciation events, including very short branches and negative branch lengths. All this led to obtain abnormally high duplication contrasts for the empirical data. To limit such problems, Dunn et al. [@RN760] removed node contrasts higher than 0.5 on the empirical data (this does not impact Fig 1, as the simulation data never has such high contrasts). Following this practice, there are still higher expected variances following gene duplication events (S4A and S4B Figs). In the null simulations only the $\tau$ values are simulated, while the branch lengths (hence the variances) are taken from the empirical data, and thus share its biases. This explains why contrasts are lower for duplications than for speciations under null simulations as well as with empirical data.
__Randomization tests to assess the performance of phylogenetic method__
```{r randomization_tau_trees, eval=T, echo=F, message=F, warning=F}
## Since our dataset contains many speciation trees with no duplication, this may bias the result.
## So we identified calibrated trees with atleast one duplication events
trees_of_interest <- mclapply(gene_trees_pic1,tree_with_duplication, mc.cores = cores)
trees_of_interest <- trees_of_interest[!is.na(trees_of_interest)]
## All calibrated trees with permuted actual Tau of the tips
Tau_randomized_tree <- mclapply(gene_trees_pic,shuffling_tau,mc.cores = cores)
## Permuted Tau for calibrated trees with atleast one duplication event
Tau_randomized_filtered_tree <- mclapply(trees_of_interest,shuffling_tau,mc.cores = cores)
## Calculated PIC based on randomized Tau for both list of trees
Tree_pic_randomized <- mclapply(Tau_randomized_tree,contrast_random,mc.cores = cores)
Filtered_tree_pic_randomized_tau <- mclapply(Tau_randomized_filtered_tree,contrast_random,mc.cores = cores)
## Summarizing data of all trees in single dataframe, and removing rows for which contrasts are not available (e.g. for tips)
summary_tree_pic_randomized <- bind_rows(lapply(Tree_pic_randomized,summary_function))
summary_filtered_tree_pic_randomized_tau <- bind_rows(lapply(Filtered_tree_pic_randomized_tau,summary_function))
Random_tau_contrast <- summary_tree_pic_randomized[which(!is.na(summary_tree_pic_randomized$pic_abs_random)),]
Random_tau_contrast_filtered_tree <- summary_filtered_tree_pic_randomized_tau[which(!is.na(summary_filtered_tree_pic_randomized_tau$pic_abs_random)),]
Random_tau_contrast <- Random_tau_contrast[which(Random_tau_contrast$pic_abs_random < 0.5),]
Random_tau_contrast_filtered_tree <- Random_tau_contrast_filtered_tree[which(Random_tau_contrast_filtered_tree$pic_abs_random < 0.5),]
## Removing rows with NA event, and collecting all nodes contrast
nodes_contrast_random <- Random_tau_contrast[!is.na(Random_tau_contrast$D),]
nodes_contrast_random_tau_filtered_tree <- Random_tau_contrast_filtered_tree[!is.na(Random_tau_contrast_filtered_tree$D),]
nodes_contrast_random$Event <- factor(nodes_contrast_random$Event, levels=c("Speciation", "Duplication"))
nodes_contrast_random_tau_filtered_tree$Event <- factor(nodes_contrast_random_tau_filtered_tree$Event, levels=c("Speciation", "Duplication"))
## Collecting PICs and expected variances of events after randomizing actual Tau
speciation_pic.r <- nodes_contrast_random$pic_abs_random[which(nodes_contrast_random$Event=="Speciation")]
duplication_pic.r <- nodes_contrast_random$pic_abs_random[which(nodes_contrast_random$Event=="Duplication")]
speciation_pic.var <- nodes_contrast_random$Variance[which(nodes_contrast_random$Event=="Speciation")]
duplication_pic.var <- nodes_contrast_random$Variance[which(nodes_contrast_random$Event=="Duplication")]
## Performing test
wilcox_1_tailed.r <- one_tailed_wilcox(duplication_pic.r,speciation_pic.r)
wilcox_2_tailed.r <- two_tailed_wilcox(duplication_pic.r,speciation_pic.r)
variance_difference<- wilcox.test(duplication_pic.var,speciation_pic.var, alternative = "greater")$p.value
if(variance_difference == 0) variance_diff <- 2.2e-16
if(variance_difference!= 0) variance_diff <- variance_difference
## Median PICs of randomized Tau of the two events of all calibrated trees
median_nodes_contrast_random <- NULL
median_nodes_contrast_random <- aggregate(pic_abs_random~Event, nodes_contrast_random, median)
median_nodes_contrast_random$pic.round <- paste0("Median = ",round(median_nodes_contrast_random$pic_abs_random,4),sep="")
## Median PICs of randomized Tau of the two events of trees with atleast one duplication
median_nodes_contrast_random_tau_ftree <- NULL
median_nodes_contrast_random_tau_ftree <- aggregate(pic_abs_random~Event, nodes_contrast_random_tau_filtered_tree, median)
median_nodes_contrast_random_tau_ftree$pic.round <- paste0("Median = ",round(median_nodes_contrast_random_tau_ftree$pic_abs_random,4),sep="")
```
```{r randomization_events_trees, eval=T, echo=F, message=F, warning=F}
## All calibrated trees with permuted actual internal node events
Event_randomized_pic_tree <- mclapply(gene_trees_pic, shuffling_event, mc.cores = cores)
## Permuting internal node events for trees with atleast one duplication event
Event_randomized_filtered_tree <- mclapply(trees_of_interest,shuffling_event,mc.cores = cores)
##Summarizing data of all list of trees in a dataframe
summary_event_randomized_pic_tree <- bind_rows(lapply(Event_randomized_pic_tree,summary_function))
summary_filtered_tree_pic_randomized_event <- bind_rows(lapply(Event_randomized_filtered_tree,summary_function))
Random_event_contrast <- summary_event_randomized_pic_tree[which(!is.na(summary_event_randomized_pic_tree$pic)),]
Random_event_contrast_filtered_tree <- summary_filtered_tree_pic_randomized_event[which(!is.na(summary_filtered_tree_pic_randomized_event$pic)),]
Random_event_contrast <- Random_event_contrast[which(Random_event_contrast$pic < 0.5),]
Random_event_contrast_filtered_tree <- Random_event_contrast_filtered_tree[which(Random_event_contrast_filtered_tree$pic < 0.5),]
## Removing NA Events and collecting nodes contrasts of newly assigned events
nodes_contrast_random_events <- Random_event_contrast[!is.na(Random_event_contrast$event_new),]
nodes_contrast_random_event_filtered_tree <- Random_event_contrast_filtered_tree[!is.na(Random_event_contrast_filtered_tree$event_new),]
nodes_contrast_random_events$event_new <- factor(nodes_contrast_random_events$event_new, levels=c("Speciation", "Duplication"))
nodes_contrast_random_event_filtered_tree$event_new <- factor(nodes_contrast_random_event_filtered_tree$event_new, levels=c("Speciation", "Duplication"))
## Collecting PICs and expected variances of events after randomization
speciation_pic_revent <- abs(nodes_contrast_random_events$pic[which(nodes_contrast_random_events$event_new=="Speciation")])
duplication_pic_revent <- abs(nodes_contrast_random_events$pic[which(nodes_contrast_random_events$event_new=="Duplication")])
speciation_pic_rvar <- nodes_contrast_random_events$var_exp[which(nodes_contrast_random_events$event_new=="Speciation")]
duplication_pic_rvar <- nodes_contrast_random_events$var_exp[which(nodes_contrast_random_events$event_new=="Duplication")]
## Performing test
wilcox_1_tailed_revent <- one_tailed_wilcox(duplication_pic_revent,speciation_pic_revent)
wilcox_2_tailed_revent <- two_tailed_wilcox(duplication_pic_revent,speciation_pic_revent)
## Median PICs of randomized events of all calibrated trees
median_random_event <- NULL
median_random_event <- aggregate(pic~event_new, nodes_contrast_random_events, median)
median_random_event$pic.round <- paste0("Median = ",round(median_random_event$pic,4),sep="")
## Median PICs of randomized events of trees with atleast one duplication
median_nodes_contrast_random_event_ftree <- NULL
median_nodes_contrast_random_event_ftree <- aggregate(pic~event_new, nodes_contrast_random_event_filtered_tree, median)
median_nodes_contrast_random_event_ftree$pic.round <- paste0("Median = ",round(median_nodes_contrast_random_event_ftree$pic,4),sep="")
```
```{r additional_analyses, eval=T, echo=F, message=F, warning=F}
## Collecting tree data (numner of tips, internal node events proportions) for all calibrated trees in a single dataframe
tree_data_all <- bind_rows(lapply(gene_trees_pic,tree_data_collection))
tree_data_plot <- na.omit(tree_data_all) ## Removing data for trees having no duplication events
## Dataframe for scatter plot
data_scatterplot <- data.frame(ntip=NA,Expected_variance=NA,Event=NA)
data_scatterplot <- rbind(data_scatterplot, data.frame(ntip=tree_data_plot$tip_num, Expected_variance=tree_data_plot$spe_var,Event="Speciation"))
data_scatterplot <- rbind(data_scatterplot, data.frame(ntip=tree_data_plot$tip_num,Expected_variance=tree_data_plot$dup_var,Event="Duplication"))
data_scatterplot <- data_scatterplot[-1,]
data_scatterplot$Event <- factor(data_scatterplot$Event, levels=c("Speciation", "Duplication"))
## Plotting number of tips vs. expected variance of events
event_variance <- ggplot(data_scatterplot,aes(x=ntip))+
geom_line(aes(y=Expected_variance, color=Event),alpha=0.5)+
xlab(expression(bold("Number of tips"))) +
ylab(expression(bold("Expected variance"))) +
xlim(0,180) +
ylim(0,2000) +
theme_classic() +
theme(legend.title=element_blank(),legend.position=c(0.7,0.8)) +
theme(axis.text=element_text(size=10,face="bold")) +
ggtitle(expression(bold("B. Variance distribution for events")))+
theme(plot.title = element_text(face="bold"))
## Creating dataframe for plotting events proportion distribution for all 8520 calibrated trees
plot_data <- data.frame(proportion_event=NA,Event=NA)
plot_data <- rbind(plot_data, data.frame(proportion_event=tree_data_all$spe_prop,Event="Speciation"))
plot_data <- rbind(plot_data, data.frame(proportion_event=tree_data_all$dup_prop,Event="Duplication"))
plot_data <- plot_data[-1,]
plot_data$Event <- factor(plot_data$Event, levels=c("Speciation", "Duplication"))
## Histogram to show distribution of proportion of events for all calibrated trees
event_histogram <- ggplot2_histogram_mod(data=plot_data, xName='proportion_event',
groupName='Event',legendPosition="right",
alpha=1, addDensity=TRUE,
addMedianLine=TRUE, medianLineColor=c("#F8766D","#00BFC4"), medianLineSize=1) +
theme_classic() +
ggtitle(expression(bold("D. Event proportion plot")))+
theme(plot.title = element_text(face="bold"))+
xlab("Proportion of events") +
ylab("Density") +
theme(legend.title=element_blank(),legend.position=c(0.9,0.8),legend.text=element_text(size=8)) +
theme(axis.text=element_text(size=10,face="bold")) +
theme(plot.title = element_text (face="bold",
colour="black", lineheight=1.0),
axis.title.x=element_text( face="bold",
colour="black", hjust=0.5),
axis.title.y=element_text(face="bold",
colour="black", vjust=0.5, angle=90))
```
We used randomization tests to assess whether the results of different analyses of the empirical dataset are reliable and unbiased. In a first randomization test, we randomized the trait values (i.e. randomly permuting the $\tau$ values) across the tips of each tree without altering the node events of the trees. We then computed the contrasts for the speciation and duplication events of the trees when there is no relation between trait values and phylogenetic relationships. When we compared the nodes contrasts of speciation and duplication events of these randomized trees (Fig 2A), we found the same pattern as reported for the empirical gene trees by Dunn et al. [@RN760]. This shows that the PIC applied to these data is not measuring phenotypic evolution. It confirms that results are driven by their large differences in branch lengths (i.e. in expected variances) (Fig 2B), as on simulated null data. Any effect of trait divergence rates of speciation and duplication events is masked by this branch length difference. This violates the basic assumption of applicability of the PIC method to Brownian trait evolution. To remove the problem of difference in expected variances of the two events, we performed a second randomization test: we kept the original $\tau$ value for tips but randomly shuffled the events (duplication or speciation) of internal nodes of the empirical gene trees to maintain the original proportions of speciation and duplication events. The resulting trend (Fig 2C) still resembled the empirical gene trees data of Dunn et al. [@RN760]. This appears due to the fact that the majority of the tree events are speciation events (Fig 2D) with speciation node ages $\leq$ 296 My. Most of the trees with many duplication events on the other hand have ancient duplication events (duplication node age > 296 My) for which the evolutionary rates of duplication are often masked by the effect of longer branch lengths. Hence, opposite to our expectation, the calibrated trees with no or few duplications have higher overall nodes contrast (apparent fast evolution) than trees with many duplications (apparent slow evolution). This might be due to greater difficulty in detecting paralogs for fast evolving genes. Therefore, reshuffling of the events may not change the observed pattern of higher speciation contrasts than duplication contrasts.
Out of `r length(gene_trees_calibrated)` calibrated trees, `r trees_with_no_duplication` were species-only trees with no duplication events. For these `r trees_with_no_duplication` trees, random shuffling of events had no impact. To avoid this bias, we removed those `r trees_with_no_duplication` speciation trees as well as trees with negative branch lengths, and randomized the trait or the internal node events 100 times on the remaining `r length(trees_of_interest)` trees. However, we still always obtained significantly higher contrasts of speciation than of duplication (S5A and S5B Figs). The randomization tests pattern is the same when we used `r subset.trees` trees with strong phylogenetic signals (S5C and S5D Figs). All these analyses indicate that the results reported by Dunn et al. [@RN760] are biased by the available gene trees, and that this bias is not easy to correct.
```{r Fig2, dpi=400,fig.width=8, fig.height=8, fig.cap="**Fig 2**: Analyses on calibrated empirical gene trees of Dunn et al. [@RN760]. *P* values are from Wilcoxon two-tailed tests. (A) Randomly shuffling the $\\tau$ values of the tips for 8520 gene trees does not alter the trend of empirical result [@RN760] showing an opposite trend to the ortholog conjecture. (B) The figure indicates the expected variance is much higher for duplication than speciation events irrespective of the number of tips considered for the study. (C) Using the original $\\tau$ data, if we permute the events (Speciation/Duplication/NA) of the nodes, the trend of result still resembles the empirical result obtained by the recent study [@RN760]. (D) The proportions of speciation events are much higher than duplication events for all time-calibrated trees. The dotted line represents the median proportion of both events. This plot shows a high frequency of trees have no/few duplication events."}
## Plotting the randomization data result for all calibrated trees
## 1st plot
jitterplot_randomized_tau <-
ggplot(nodes_contrast_random,aes(x=Event, y=pic_abs_random, fill=Event)) +
geom_jitter(aes(colour = Event),position=position_jitter(0.02), alpha=0.02) +
geom_text(data = median_nodes_contrast_random, aes(label = pic.round),fontface = 2,size = 4,
hjust=0.5,vjust =-2.5)+
geom_crossbar(data=median_nodes_contrast_random, aes(ymin = pic_abs_random, ymax = pic_abs_random),
size=0.2,col= "black", width = .2)+
xlab( NULL ) +
ylab(expression(bold("Phylogenetic Independent Contrasts (PICs)"))) +
ylim(0, 0.4) +
theme_classic()+
theme(legend.title=element_blank(),legend.position=c(1.9,1.9)) +
theme(axis.text=element_text(size=10,face="bold")) +
theme(legend.text=element_text(size=10,face="bold")) +
annotate("text", x = 1.5, y = 0.3, label= wilcox_2_tailed.r, fontface = 4)+
ggtitle(expression(bold(paste("A. Plot using randomized trait, ", tau))))+
theme(plot.title = element_text(face="bold"))
## 2nd plot
nodes_contrast_random_events$event_new <-factor(nodes_contrast_random_events$event_new, levels=c("Speciation", "Duplication"))
jitterplot_randomized_event <-
ggplot(nodes_contrast_random_events,aes(event_new, y=pic, fill=event_new)) +
geom_jitter(aes(colour = event_new),position=position_jitter(0.02), alpha=0.02) +
geom_text(data = median_random_event, aes(label = pic.round),fontface = 2,size = 4,
hjust=0.5,vjust = -2.5)+
geom_crossbar(data=median_random_event, aes(ymin = pic, ymax = pic),
size=0.2, col= "black",width = .2)+
xlab( NULL ) +
ylab( NULL) +
ylim(0, 0.4) +
theme_classic()+
theme(legend.title=element_blank(),legend.position=c(1.9,1.9)) +
theme(axis.text=element_text(size=10,face="bold")) +
theme(legend.text=element_text(size=10,face="bold")) +
annotate("text", x = 1.5, y = 0.3, label= wilcox_2_tailed_revent, fontface = 4)+
ggtitle(expression(bold(paste("C. Plot using randomized events"))))+
theme(plot.title = element_text(face="bold"))
##Plotting the result
grid.arrange(jitterplot_randomized_tau,jitterplot_randomized_event, event_variance,event_histogram, nrow=2,ncol=2)
```
__The ortholog conjecture when $\tau$ evolution follows a Brownian model__
```{r Analysis_with_Caper_package_without_data_transformation, eval=T, echo=F, message=F, warning=F}
## In this section, we plan to perform diagnostic tests to check for Brownian model of Tau evolution
## This diagnostic test function takes a lot of time, so we previously ran "Premanuscript_run_TM.R" and saved the results
## Stored data are available in the following link https://doi.org/10.5281/zenodo.3354285
## We loaded those stored results for further analyses
load("Data_TMRR.rda")
## Summering results of different lists of BM trees (for details check "Premanuscript_run_TM.R" and S4 Fig)
summary_brownian_tree <- bind_rows(lapply(standarized_pic_tree_with_dup_final,summary_function))
summary_ancient_speciation_tree <- bind_rows(lapply(standarized_pic_tree_ancient_speciation,summary_function))
Contrast_brownian <- summary_brownian_tree[which(!is.na(summary_brownian_tree$pic_Tau)),]
Contrast_ancient_speciation_tree <- summary_ancient_speciation_tree[which(!is.na(summary_ancient_speciation_tree$pic_Tau)),]
## We have checked for maximum PIC for all sets of Brownian trees, and the PIC was always less than 0.5
```
```{r Fig3, dpi=400,fig.width=8, fig.height=4, fig.cap="**Fig 3**: The ortholog conjecture test on $\\tau$ for trees passing diagnostic plot tests with proper controls. *P* values are from Wilcoxon two-tailed tests. 'PICs' refers 'Phylogenetic Independent Contrasts'. Contrasts of speciation and duplication events reject the null hypothesis with different node age limits for the trees following Brownian model of $\\tau$ evolution, and provide support for the ortholog conjecture."}
## Analysis considering duplication age <= oldest speciation age
speciation_pic_nodeage <- Contrast_brownian$node_age[which(Contrast_brownian$Event=="Speciation")]
Contrast_brownian_plot <- Contrast_brownian[which(Contrast_brownian$node_age <= max(speciation_pic_nodeage)),]
# Considering the relevant columns for plotting and removing NA events
Contrast_brownian_plot <- na.omit(Contrast_brownian_plot[c(6,18:19,22:24)])
#Contrast_brownian_plot$pic_Tau<-abs(Contrast_brownian_plot$pic_Tau) ## Taking the absolute value
Contrast_brownian_plot$Events <- factor(Contrast_brownian_plot$Event, levels=c("Speciation", "Duplication"))
# Extracting speciation and duplication PICs
speciation_contrast_tau <- abs(Contrast_brownian_plot$pic_Tau[which(Contrast_brownian_plot$Events=="Speciation")])
duplication_contrast_tau <- abs(Contrast_brownian_plot$pic_Tau[which(Contrast_brownian_plot$Events=="Duplication")])
## Performing test
wilcox_test_tau <- two_tailed_wilcox(duplication_contrast_tau,speciation_contrast_tau)
title <- paste0("Plots on contrasts standardized ",length(standarized_pic_tree_with_dup_final), " gene trees", sep="")
## 2nd analysis considering duplication age is 25% higher than oldest speciation age
nodeage_limit <- max(speciation_pic_nodeage)+ 0.25*max(speciation_pic_nodeage)
Contrast_brownian_plot2 <- Contrast_brownian[which(Contrast_brownian$node_age <= nodeage_limit),]
Contrast_brownian_plot2 <- na.omit(Contrast_brownian_plot2[c(6,18:19,22:24)])
Contrast_brownian_plot2$Events <- factor(Contrast_brownian_plot2$Event, levels=c("Speciation", "Duplication"))
# Separating speciation and duplication contrast data
speciation_contrast_tau2 <- abs(Contrast_brownian_plot2$pic_Tau[which(Contrast_brownian_plot2$Events=="Speciation")])
duplication_contrast_tau2 <- abs(Contrast_brownian_plot2$pic_Tau[which(Contrast_brownian_plot2$Events=="Duplication")])
## Performing test
wilcox_test_tau2 <- two_tailed_wilcox(duplication_contrast_tau2,speciation_contrast_tau2)
## Making a dataframe for plotting
plot_df <- data.frame(pic=NA,var=NA, group=NA, Event=NA) ## declaring data frame
plot_df <- rbind(plot_df, data.frame(pic=abs(Contrast_brownian_plot$pic_Tau), var= Contrast_brownian_plot$variance, group="Age <= 296 My", Event=Contrast_brownian_plot$Events))
plot_df <- rbind(plot_df, data.frame(pic=abs(Contrast_brownian_plot2$pic_Tau), var= Contrast_brownian_plot2$variance, group="Age <= 370 My", Event=Contrast_brownian_plot2$Events))
plot_df <- plot_df[-1,]
plot_df$Event <- factor(plot_df$Event, levels=c("Speciation", "Duplication"))
## Boxplot
dodge <- position_dodge(width = 0.51)
ggplot(plot_df, aes(group, pic, fill = Event)) +
geom_boxplot(width=0.5,outlier.colour=NA, position=dodge, notch=T) +
#geom_jitter(aes(colour = Event),position=dodge, alpha=0.02) +
xlab(NULL) +
ylab(expression(bold(paste("PICs of ",tau)))) +
coord_cartesian(ylim=c(0, 0.06)) +
theme_classic()+
theme(legend.title=element_blank(),legend.position=c(0.9,0.9)) +
#theme(legend.position="none")+
theme(axis.text.y = element_text(size=10)) +
theme(axis.text.x = element_text(size=10,face="bold")) +
ggtitle(paste0(title))+
theme(plot.title = element_text(face = "bold"))+
annotate("text", x=1.04, y=0.05, label=wilcox_test_tau, fontface=4) +
annotate("text", x=1.98, y=0.05, label=wilcox_test_tau2, fontface=4)
```
Following a Brownian model (BM), the small and large values of standardized contrasts should be equally as likely to occur on any node of the tree [@RN783]. Diagnostic plot tests (details in the Methods) for each tree can indicate whether trait evolution follows BM for that tree [@RN781;@RN782;@RN779;@RN780;@RN783;@RN816]. Since BM is intrinsic to the implementation of PIC method, we identified trees that passed diagnostic plot tests at 95% confidence level for $\tau$. Hence, we used a subset of `r length(standarized_pic_tree_with_dup_final)` gene trees by removing trees with negative branch lengths, pure speciation trees, and trees that failed diagnostic tests for BM (S6 Fig). Moreover, to meet the correct branch length assumption of PIC applicability on trees with gene duplications, we used limitations of node age on gene duplication. It is necessary to put such a limitation to avoid biases due to inaccurate calibration of very old duplication nodes. Analyses on `r length(standarized_pic_tree_with_dup_final)` these trees with a node age limit of `r max(speciation_pic_nodeage)` My (i.e. node age $\leq$ maximum speciation age), provided support for the ortholog conjecture (Fig 3), in contrast to previous results. We repeated the analysis including slightly older duplication events (duplication age $\leq$ `r round((max(speciation_pic_nodeage)+ 0.25*(max(speciation_pic_nodeage))),0)` My, i.e. maximum 25% older than the oldest speciation). We still were able to detect higher contrasts for duplication than for speciation (Fig 3). Finally, randomization tests on these trees indicated no bias in the inference (S7A-S7D Figs), supporting the relevance of the inferences on empirical data.
Among these `r length(standarized_pic_tree_with_dup_final)` trees (S6 Fig), very old duplication node contrasts were also involved to identify contrast distribution of $\tau$ as per BM, which might bias the analysis. Hence, we repeated the identification of BM trees by standard diagnostic plot tests considering only `r length(trees_of_interest)` trees with at least one duplication event, after removing very old duplication nodes (duplication age > `r round((max(speciation_pic_nodeage)+ 0.25*(max(speciation_pic_nodeage))),0)` My) (S6 Fig). We still were able to find support for the ortholog conjecture on these identified `r length(standarized_pic_tree_with_dup_final_new)` BM trees (S8 Fig).
To avoid calibration bias of older duplication events, we also considered a more restricted set of `r length(standarized_pic_tree_ancient_speciation)` passing more strict conditions (S6 Fig), i.e. BM trees in which there is at least one duplication, and a speciation event older than all duplications. Calibration bias for duplication events is expected to be very limited for such trees. Support for the ortholog conjecture still holds with these `r length(standarized_pic_tree_ancient_speciation)` trees (Fig 4A). Randomization tests on these `r length(standarized_pic_tree_ancient_speciation)` trees also support the biological relevance of these results (Figs 4B and 4C,S9 Fig). These observations confirm that the ortholog conjecture is supported for adequately standardized BM trees.
```{r reanalyses_Dunn_data_BM_OU,echo=F, message=F, warning=F}
## Testing the ortholog cojecture with nodes contrasts of 3151 Brownian trees provided by Dunn et al.
speciation_BM_pic <- nodes_contrast_filtered_bm$pic[which(nodes_contrast_filtered_bm$Event=="Speciation")]
duplication_BM_pic <- nodes_contrast_filtered_bm$pic[which(nodes_contrast_filtered_bm$Event=="Duplication")]
## Performing test
wilcox_one_tailed_BM <- wilcox.test(duplication_BM_pic,speciation_BM_pic,alternative = "greater")$p.value
wilcox_two_tailed_BM <- wilcox.test(duplication_BM_pic,speciation_BM_pic)$p.value
## Identifying the index of BM trees of Dunn et al. from their all time calibrated trees
## and extracting them based on their criteria for further use
tree_summary$index <- as.numeric(rownames(tree_summary))
genes_pass_BM <- tree_summary$index[tree_summary$dAIC_bm < 2]
BM_trees_model_fit <- gene_trees_calibrated[c(genes_pass_BM)]
```
Instead of using standard diagnostic plot tests [@RN781;@RN782;@RN779;@RN780;@RN783;@RN816], Dunn et al. [@RN760] considered model-fitting criteria to identify BM trees from their list of `r length(gene_trees_calibrated)` calibrated trees. They contrasted the BM model (neutral drift model with no selection) to the Ornstein-Uhlenbeck (OU) model (i.e. an extended BM model with stabilizing selection) [@RN843;@RN841] for this purpose. After computing the difference in Akaike Information Criterion ($\Delta$AIC) [@RN380] scores of the models for each tree, they identified 3151 BM trees, out of which only 8 trees purely favored the BM model ($\Delta$AIC~BM~ < 2 and $\Delta$AIC~OU~ $\ge$ 2). The other 3143 trees favored both the BM and OU models ($\Delta$AIC~BM~ < 2 and $\Delta$AIC~OU~ < 2). A Wilcoxon two-tailed test on those 3151 trees provided similar results to the `r length(gene_trees_calibrated)` calibrated trees, i.e. higher $\tau$ evolution for orthologs than for paralogs (PIC~speciation~ = `r round(median(speciation_BM_pic),4)`, PIC~duplication~ = `r round(median(duplication_BM_pic), 4)`, *P* = $`r format(wilcox_two_tailed_BM, digits= 3, scientific = TRUE)`$). This result implies that selection of BM or non-BM trees does not explain these results. A similar trend was observed for the recent duplicates (node age $\le$ `r max(speciation_pic_nodeage)` My) (S9A Fig). The pattern of systematic increase of duplication PICs with node age (S10A Fig) plausibly implies that the evolution of $\tau$ does not in fact follow Brownian motion for these trees. Randomization tests on those trees (S10B and S10C Figs) confirmed that contrasts were not appropriately standardized as per BM, and thus the inference drawn on empirical data was not supported. When we performed diagnostic plot tests on those 3151 trees, we found support for the ortholog conjecture for `r length(BM_tree_diagnostic_test_final)` trees passing the diagnostic tests (S11 Fig). All these analyses suggest that Dunn et al. did not identify accurately BM trees, and that such misidentification leads to violating the assumptions of phylogenetic comparative methods, and to erroneous results.
```{r Analyses_on_standardized_806_trees, eval=T, echo=F, message=F, warning=F}
## Considering the relevant columns and removing the NA data
Contrast_brownian_806_trees <- na.omit(Contrast_ancient_speciation_tree[c(6,18:19,22:24)])
Contrast_brownian_806_trees$Event <- factor(Contrast_brownian_806_trees$Event, levels=c("Speciation", "Duplication"))
## Taking the absolute values of PIC of Tau
Contrast_brownian_806_trees$pic_Tau <- abs(Contrast_brownian_806_trees$pic_Tau)
## Extracting speciation and duplication PICs for these 806 trees passing diagnostic plots and having ancient speciation event
speciation_pic_tau_806 <- speciation_contrast(Contrast_brownian_806_trees,'pic_Tau')
duplication_pic_tau_806 <- duplication_contrast(Contrast_brownian_806_trees,'pic_Tau')
## Median value collection for Tau of the standarized trees
median_tau_806 <- median_jitter((Contrast_brownian_806_trees),'pic_Tau','Event')
colnames(median_tau_806)[1] <- 'Event'
colnames(median_tau_806)[2] <- 'pic_Tau'
## Performing test
p_tau_806 <- paste0("Two-tailed test, ", two_tailed_wilcox(duplication_pic_tau_806,speciation_pic_tau_806))
```
```{r Additional_analyses_on_806_BM_trees, eval=T, echo=F, message=F, warning=F}
## Collecting tree data for all calibrated trees in a single dataframe
tree_data_806<-bind_rows(lapply(standarized_pic_tree_ancient_speciation,tree_data_collection))
median_number_of_tips<-median(tree_data_806$tip_num)
median_number_of_events<-median(tree_data_806$internal_events)
median_proporion_of_speciation<-median(tree_data_806$spe_prop)
median_proporion_of_duplication<-median(tree_data_806$dup_prop)
median_proporion_of_NA_event<-median(tree_data_806$NA_prop)
```
```{r Fig4, dpi=400,fig.width=8, fig.height=8, fig.cap= paste("**Fig 4**: Results on real and randomized data for ", length(standarized_pic_tree_ancient_speciation), " BM trees with ancient speciation events. *P* values are from two-tailed Wilcoxon tests. (A) The jitter plot on empirical gene tree data supports the ortholog conjecture. Main plots in (B) and (C) show the distributions of two-tailed test *P* values estimated for the whole set of ", length(standarized_pic_tree_ancient_speciation), " BM trees by permuting $\\tau$ (B), and by permuting the internal node events (C) ('Speciation', 'Duplication', and 'NA') for 1000 independent runs respectively. Inset plots in (B) and (C) show *P* value distributions plots of one-tailed test to check if there was an excess of duplication contrasts over speciation contrasts by randomizing $\\tau$, and events respectively. The proportions of speciation (median = ", median_proporion_of_speciation, "), and NA events (median = ", median_proporion_of_NA_event, ") are much higher than that of duplication events (median = ", median_proporion_of_duplication, ") in these trees (S9 Fig). Hence, higher duplication contrasts are more likely to be replaced by speciation events by permutations of events. Due to this reason, we found significant shifts towards left in the main plot of C, and towards *P* value 1 in the inset plot of C to depict larger contrasts for speciation events. Comparisons of plot A to plot B, and plot A to plot C show that randomization results in both the cases actually differ from the empirical result.")}
## Jitter plot with empirical data
plot4AM <- jitter_plot_tau(Contrast_brownian_806_trees,median_tau_806,p_tau_806)+
ylab(expression(bold(paste("PIC of ",tau))))+
ggtitle(expression(bold(paste("A. Empirical result"))))
## Dataframe of P values from randomization tests on 806 trees
pval_dataframe_806 <- data.frame(pval_tau_2sided=NA,pval_tau_1sided=NA,pval_event_2sided=NA,pval_event_1sided=NA,tree_set=NA)
pval_dataframe_806 <- rbind(pval_dataframe_806, data.frame(pval_tau_2sided=pval,pval_tau_1sided=pval_1sided,pval_event_2sided=pval_event,pval_event_1sided=pval_event_1sided,tree_set="806_tree"))
pval_dataframe_806 <- pval_dataframe_806[-1,]
## Plots on P value distribution using randomized Tau
plot4BM <- histo_plot(pval_dataframe_806,'pval_tau_2sided','tree_set') +
xlab(expression(bold(paste( bolditalic(P)," value (two-tailed test)"))))
plot4BM <- plot4BM +
ylim(0,150) +
ggtitle(expression(bold(paste("B. Randomized ", tau))))+
theme(plot.title = element_text(face="bold"))+
theme(axis.text=element_text(size=10,face="bold"))
plot4CM <- histo_plot(pval_dataframe_806,'pval_tau_1sided','tree_set') +
xlab(expression(bold(paste( bolditalic(P)," value (one-tailed test)"))))
plot4CM <- plot4CM +
ylim(0,150) +
#ylab(" ")+ggtitle(expression(bold(paste("C. Owo sided tests on randomized ", tau))))+
theme(plot.title = element_text(face="bold"))+
theme(axis.text=element_text(size=10,face="bold"))
## Plots on P value distribution using randomized internal node events
plot4DM <- histo_plot(pval_dataframe_806,'pval_event_2sided','tree_set') +
xlab(expression(bold(paste( bolditalic(P)," value (two-tailed test)"))))
plot4DM <- plot4DM +
ylim(0,150) +
ggtitle(expression(bold(paste("C. Randomized events"))))+
theme(plot.title = element_text(face="bold"))+
theme(axis.text=element_text(size=10,face="bold"))
plot4EM <- histo_plot(pval_dataframe_806,'pval_event_1sided','tree_set') +
xlab(expression(bold(paste( bolditalic(P)," value (one-tailed test)"))))
plot4EM <- plot4EM +
ylim(0,150) +
#ylab(" ")+
#ggtitle(expression(bold(paste("E. One sided tests on randomized events"))))+
theme(plot.title = element_text(face="bold"))+
theme(axis.text=element_text(size=10,face="bold"))
## Preparing final plots and arranging them
plot4B <- ggdraw() +
draw_plot(plot4BM)+
draw_plot(plot4CM, x = .5, y = .5, width = .5, height = .5)
plot4C <- ggdraw() +
draw_plot(plot4DM)+
draw_plot(plot4EM, x = .5, y = .5, width = .5, height = .5)
# grid.arrange(plot4AM,grid.arrange(plot4BM,plot4CM,plot4DM,plot4EM,nrow = 2, ncol=2))
gridExtra::grid.arrange(plot4AM,plot4B,plot4C,nrow = 3)
```
\pagebreak
### Discussion
We agree with Dunn et al. [@RN760] that evolutionary comparisons should be done considering a phylogenetic framework when possible. However, this does not imply that phylogenetic independent contrasts can be applied easily to phylogenomics data and questions. To get a clear picture, we limited our study to the same phylogenetic gene trees used by Dunn et al. [@RN760]. Our reanalysis identified problems generated during the time calibration of duplication nodes of pruned trees for which no external time references are available, and with non respect of the hypothesis of Brownian motion. The strongest bias was for duplication nodes preceding the oldest speciation nodes. This, in turn, introduced several biases in their analyses, and influenced their results. When we identified and controlled for such biases, PIC results changed to support the ortholog conjecture, consistent with our previous pairwise approach [@RN762] on the same $\tau$ data.
To our knowledge, no one before Dunn et al. [@RN760] applied a phylogenetic approach in comparative biology to study the effect of gene duplication on functional evolution, despite an early call to do so [@RN837]. In line with previous studies [@RN782;@RN851], we explored whether the application of a phylogenetic method might inflate errors (e.g. rejection of the null hypothesis in null condition, Figs 1A and 1B) if applied without thorough testing for the fundamental assumptions of the method. Assumptions of proper branch length information and of Brownian model of trait evolution are related, so that modifications of branch lengths can change the evolutionary model [@RN851]. We find that branch length calibration is mostly inaccurate for old duplication events (duplication events prior to the oldest speciation event in the data) (S2 Fig). Use of such node contrasts causes a strong rise in expected variance for duplication events compared to the speciation events (S4 Fig). This may bring about lack of statistical power to detect the signal of ortholog conjecture, when in reality the signal is present, and even bias towards a pseudo-signal. As a remedial measure, we limited our analysis to the trees for which Brownian motion of trait evolution has been identified with the aid of standard diagnostic plot tests [@RN781;@RN782;@RN779;@RN780;@RN783;@RN816]. To remove issues with branch length inaccuracies for older duplication events, we used limitation of node ages for duplication events. Using all such measures to control for biases, we found support for the ortholog conjecture (Figs 3 and 4A). The reliability of our inference was validated by two different randomization tests, which confirmed the fact that our result was not due to biases in the data or analysis (S7A-S7D Figs, Figs 4B and 4C).
Being aware of the fact that branch lengths are fundamental to phylogenetic comparative methods, Dunn et al. [@RN760] added random noise in the speciation calibration time point, and extended terminal branch lengths to test sensitivity of their observations. While extending terminal branch can reduce error rate due to the presence of negative branch lengths in `r length(trees_negative)` trees, none of the modifications appear to avoid biases when there are so vast differences in branch lengths [@RN853;@RN854;@RN855;@RN856;@RN852;@RN858;@RN859]. These can also lead to violating the Brownian model of trait evolution [@RN851]. Diagnostic plot tests appear necessary to assess adequate standardization of contrasts as per BM in such cases [@RN781;@RN782;@RN779;@RN780;@RN783;@RN816].
The importance of proper BM tree selection should not be underestimated as it can impact the contrast analysis. There exist no fixed protocols to select BM trees. We used standard diagnostic plots recommended in earlier studies for PIC [@RN781;@RN782;@RN779;@RN780;@RN783;@RN816;@RN813;@RN825;@Rcore] to rule out significant departure from BM for each tree individually at 95% confidence level. In contrast, Dunn et al. [@RN760] used both BM and OU as null models for each tree, leading them to identify "BM trees" for which the two models were equally good fits. These do not appear to be proper BM trees according to several observations, since duplication contrasts increased with node age (S10A Fig). Moreover, the inferences based on such trees with empirical data were not different from that based on randomized trees (S10B and S10C Figs). When we applied diagnostic tests on those 3151 "BM trees", `r (length(BM_trees_model_fit)-length(BM_tree_diagnostic_test_final))` trees failed the diagnostic test, and rest of the trees provided support for the ortholog conjecture (S11 Fig).
Empirical support for the ortholog conjecture has been mixed, with most studies supporting, and a few failing to do so [@RN770;@RN790;@RN793;@RN794;@RN762]), our results provided support for the ortholog conjecture using large-scale genome wide tissue specificity data in a phylogenetic framework after controlling for bias. Due to lack of detailed functional information, many studies are still limited to gene expression data as a proxy of function. Recently, using functional replaceability assay, experimental studies [@RN857;@RN861] have shown that orthologous genes can be swapped between essential yeast genes and human, although this is rarely the case for all the members of expanded human gene families [@RN861], validating one prediction of the ortholog conjecture.
These analyses are mainly based on gene trees dominated by small scale duplication events. In these trees, the age of the same duplication clade is never fixed. Use of time calibrated whole genome duplication trees could be beneficial in this regard. Since the approximate time period of whole genome duplications are known, we can test the hypothesis more conveniently by avoiding use of any node age limits on whole genome duplication events for those trees.
\pagebreak
### Methods
__Resource details for reanalyses__
Our reanalyses are based on 8520 annotated (with the events: speciation, duplication and NA), pruned, and time calibrated ENSEMBL Compara v.75 [@RN806] gene trees, having at least 4 tips with non null trait data. They were obtained from Dunn *et al* [@RN760]. Like them [@RN760], we used a subset of precomputed $\tau$ data (as trait) of 8 vertebrate species from the study of Kryuchkova-Mostacci and Robinson-Rechavi [@RN762]. Kryuchkova-Mostacci and Robinson-Rechavi [@RN762] computed $\tau$ and mean gene expression levels by following the method of Yanai et al. [@RN215]. The subset of data was based on the RNA-seq data provided by Brawand et al. [@RN838] for 6 tissues. We used the same random seed number as in [@RN760] to reproduce the simulation results for reanalysis. All reproduced data of Dunn et al. were stored in the "manuscript_dunn.RData" file (https://doi.org/10.5281/zenodo.3354285). We used the results stored in the 'data' or 'phylo' slot of the trees for further analyses. To differentiate our own function from theirs [@RN760], we remodified the original function script of Dunn et al. from "functions.R" to "functions_Dunn.R". Some of the analyses were time consuming, so we made a separate script "Premanuscript_run_TM.R" to run before knitting the markdown file. We stored the outputs in ???Data_TMRR.rda??? file (https://doi.org/10.5281/zenodo.3354285) and loaded it during our analyses. All the details of different functions were provided inside the script. We supply all the previously stored data (to reduce computation time during reproduction of result) and function files including our own ("function_TM_new.R") with this manuscript. All scripts are available on GitHub: https://github.com/tbegum/Testing_the_ortholog_conjecture.
We used phylosig function() of "phytools" package [@RN804] to identify trees with phylogenetic signal (*P* < 0.05) using Blomberg's K [@RN819;@RN804;@RN800].
Analyses and plotting were performed in R version 3.5.1 [@Rcore] using treeio [@RN823], geiger [@RN796], stringr [@RN900], digest[@RN909], dplyr [@RN908], tidyverse [@RN904], ggrepel[@RN910], gtools [@RN902], ggplot2 [@RN903], cowplot [@RN905], easyGgplot2 [@RN901], gridExtra [@RN906], and png [@RN907] libraries.
__Randomization test of $\tau$ values__
For each tree, we used $\tau$ data (column name "Tau" in each tree 'data' object) across the tips to carry out our randomization test. To randomize we permuted the actual $\tau$ data without altering internal node events. The pic() function of the "ape" package [@RN803] was used to compute PIC of nodes for each tree using permuted $\tau$ of tips. For each run, we compared the contrasts of speciation and duplication events of the whole set of randomized trees to estimate difference in event contrasts based on Wilcoxon signed rank test. For 100 or 1000 runs, we repeated the above process 100 or 1000 times to obtain a distribution plot of 100 or 1000 independent *P* values.
__Randomization test of node events__
Some of the speciation nodes had daughters with same clade names in the gene trees we used for our study. Dunn et al. changed such node events to "NA" to avoid problems during time calibration of the trees. Such annotated node event information ("Speciation", "Duplication", "NA") for each tree was available as "Event" in the tree 'data' slot. To randomize, we permuted the internal node events (added as column name "event_new" in the 'data' slot) by maintaining the actual proportion of events for each tree. Then, we used the PIC of actual $\tau$ at tips to estimate contrasts difference between newly assigned speciation and duplication node events by Wilcoxon rank tests. For 100 or 1000 independent runs, we repeated the same procedure to obtain 100 or 1000 independent *P* values.
__Checking for Brownian model of trait evolution__
There exists no established protocol to identify the BM (Brownian model) trees. Dunn et al. used model-fitting with $\Delta$AIC~BM~ = AIC~BM~ - min(AIC~BM~,AIC~OU~) and $\Delta$AIC~OU~ = AIC~OU~ - min(AIC~BM~,AIC~OU~). They applied $\Delta$AIC~BM~ < 2 as the relative support for the BM model. We used several diagnostic tests to test BM behavior, as recommended in several studies [@RN851;@RN779;@RN781;@RN782;@RN780;@RN783]. Among diagnostic tests, the most usual method for contrasts standardization is to check a correlation between the absolute values of standardized contrasts and their expected standard deviations [@RN816;@RN781;@RN782]. Under Brownian motion, there should be no correlation. This test and the correlation between the absolute values of standardized contrasts and the logarithm of their node age are model diagnostic plot tests in the caper ("Comparative Analyses of Phylogenetics and Evolution in R") package [@RN781;@RN813;@RN825; @Rcore]. We used both of them in our study by using the "crunch" algorithm of the "caper" package in R, which implements the methods originally provided in CAIC [@RN781; @RN813;@RN825; @Rcore]. Correlation of node heights with absolute values of contrasts has also been reported to be a reliable indicator of deviation from the Brownian model [@RN783]. Hence, we computed node height for each node in a tree using the ape package [@RN803]. We also used the correlations of node height and node depth to the absolute value of nodes contrasts to rule out significant trend in any of the 4 tests. We used *P* < 0.05 to assess a significant correlation for the diagnostic plot tests. A significant trend (positive or negative) reliably indicates a deviation from the BM of trait evolution for that tree [@RN781;@RN782;@RN779;@RN780;@RN783;@RN816], and we removed those trees from our analysis. Contrast calculation on negative branch lengths is not desirable, so we removed trees with negative branch lengths before applying the crunch() function. Trees passing all 4 diagnostic tests were considered as BM trees for our study.
### Acknowledgements
We sincerely acknowledge Martha Liliano Serranno Serranno for initial help in understanding the phylogenetic independent contrast method. We thank Nicolas Salamin, Julien Wollbrett, Jialin Liu, Sebastien Moretti, Sara Fonseca Costa, Kamil Jaron and all the members of the Robinson-Rechavi group for their help and useful discussions. We also acknowledge Cassey Dunn for his initial help in reproducing their results. Parts of the computations were performed at the Vital-IT (http://www.vital-it.ch) Center for high-performance computing of the SIB Swiss Institute of Bioinformatics.
\pagebreak
## Supporting Information
```{r Analysis_all_trees_with_different_random_seed_number,eval=T, echo=F, message=F, warning=F}
treedata <- function(tree)
{
if(class(tree) == "treedata")
{
tree@data <- tidytree::as_tibble(tree@data)
}
## Reading tree data slot
return(tree)
}
## Applying the function on calibrated trees
gene_trees_calibrated_new <- mclapply(gene_trees_calibrated,treedata, mc.cores = cores)
## Null simulation using Dunn et al. function
## Changing random seed number
set.seed(123)
gene_trees_sim_null_8520 <- foreach( tree=gene_trees_calibrated_new ) %dopar%
sim_tau( tree )
gene_trees_sim_null_8520 %<>% add_pics_to_trees()
## OC simulation using Dunn et al. function
set.seed(123)
gene_trees_sim_oc_8520 <- foreach( tree=gene_trees_calibrated_new ) %dopar%
sim_tau( tree, dup_adjust=2)
gene_trees_sim_oc_8520 %<>% add_pics_to_trees()
## Making a dataframe by combining data slots of all these 8520 trees
sim_null_contrast_8520_new <- bind_rows(lapply(gene_trees_sim_null_8520,summary_function))
sim_oc_contrast_8520_new <- bind_rows(lapply(gene_trees_sim_oc_8520,summary_function))
## Extracting the columns of event and PIC data from null and OC simulation result of trees with phylogenetic signals
null_data_8520_new <- na.omit(sim_null_contrast_8520_new[c(6,20)])
null_data_8520_new$label <- "Null simulation"
OC_data_8520_new <- na.omit(sim_oc_contrast_8520_new[c(6,20)])
OC_data_8520_new$label <- "OC simulation"
## Extracting speciation and duplication PICs from null and OC simulation of all calibrated trees
speciation_null_8520 <- null_data_8520_new$pic[which(null_data_8520_new$Event=="Speciation")]
duplication_null_8520 <- null_data_8520_new$pic[which(null_data_8520_new$Event=="Duplication")]
speciation_oc_8520 <- OC_data_8520_new$pic[which(OC_data_8520_new$Event=="Speciation")]
duplication_oc_8520 <- OC_data_8520_new$pic[which(OC_data_8520_new$Event=="Duplication")]
## Performing Wilcoxon tests on null and OC simulation data of all calibrated trees
wilcox_2_tailed_null_8520_new <- two_tailed_wilcox(speciation_null_8520,duplication_null_8520)
wilcox_2_tailed_oc_8520_new<- two_tailed_wilcox(speciation_oc_8520,duplication_oc_8520)
```
```{r Fig_S1, dpi=400,fig.width=8, fig.height=7, fig.cap= paste("S1 Fig: Repeating simulations on all calibrated trees with different random seed number. *P* values are from Wilcoxon two-tailed tests. Simulations with different seed number did not change the trend of results as observed in Fig 1A.")}
## Dataframe is built based on 2896 trees with phylogenetic signals
simulation_8520_new <- rbind(null_data_8520_new, OC_data_8520_new)
### Generating plot on simulation data for all calibrated trees
dodge <- position_dodge(width = 0.51)
ggplot(simulation_8520_new,aes(x=label, y=pic, fill=Event)) +
geom_boxplot(width=0.5,outlier.colour=NA, position=dodge, notch=T) +
xlab(NULL) +
ylab(expression(bold("Phylogenetic Independent Contrasts (PICs)"))) +
coord_cartesian(ylim=c(0, 0.05)) +
theme_classic()+
theme(legend.title=element_blank(),legend.position=c(0.9,0.9),legend.text=element_text(size=8)) +
theme(axis.text.y= element_text(size=10)) +
theme(axis.text.x = element_text(size=10,face="bold"))+
ggtitle(paste0("Plot on 8520 calibrated trees"))+
theme(plot.title = element_text(face = "bold"))+
annotate("text", x = 1.04, y = 0.035, label= wilcox_2_tailed_null_8520_new, fontface=4) +
annotate("text", x = 1.98, y = 0.035, label= wilcox_2_tailed_oc_8520_new, fontface=4)
```
```{r Fig_S2, dpi=400,fig.width=8, fig.height=6, fig.cap= paste("S2 Fig: An example of a calibrated gene trees used by Dunn et al. in their study. The blue nodes represent the speciation nodes while the red nodes are duplication nodes. For nodes with no number the event is assigned as NA by the authors [@RN760]. Since, the duplication time is unknown, the tree is calibrated based on speciation node ages. Notice that this gives an unreasonably large estimated date for the ancient duplication, resulting in higher expected variance and abnormally low contrast for the corresponding node event. This example is to demonstrate that it is often risky to use such nodes to infer the result of phylogenetic method that intends to study the effect of gene duplication.")}
## One example tree
tree <- gene_trees_pic[[247]]
ntips <- length(tree@phylo$tip.label) ## Number of tips for the tree
##Identifying internal speciation and duplication nodes from the 'tree@data' object
nodes_duplication <- tree@data$node[which(tree@data$Event=="Duplication")]
nodes_speciation <- tree@data$node[which(tree@data$Event=="Speciation" & tree@data$node > ntips)]
## Plotting the tree
plotTree(tree@phylo, mar=c(5,2,2,2))
axisPhylo()
nodelabels(node=nodes_speciation,frame = "none", col="blue", font=2)
nodelabels(node=nodes_duplication,frame = "none", col="red", font=2)
mtext("An example of time calibrated trees for two different events",font=2,side=3, line=-1, outer = TRUE)
mtext("Million Years (My)",font=2,side=1, line=-2, outer = TRUE)
```
```{r More_analysis_trees_with_strong_phylogenetic_signal,eval=T, echo=F, message=F, warning=F}
## Removing null and OC simulation trees associated with negative branch lengths
gene_trees_pic1_sim_null <- gene_trees_sim_null[-c(trees_negative)]
gene_trees_pic1_sim_oc <- gene_trees_sim_oc[-c(trees_negative)]
## Index number of trees with strong phylogenetic signal
index_signal_tree <- signal_tree$tree_number[which(signal_tree$Pvalue_Blomberg< 0.05 & signal_tree$K_effecient > 0.551)]
## Collecting null and OC simulation trees associated with phylogenetic signals (P <0.05)
tree_sim_null_1135 <- gene_trees_pic1_sim_null[c(index_signal_tree)]
tree_sim_oc_1135 <- gene_trees_pic1_sim_oc[c(index_signal_tree)]
#gene_trees_sim_null_test <- foreach( tree=tree_sim_null_1135 ) %dopar% sim_tau( tree )
#gene_trees_sim_null_test %<>% add_pics_to_trees()
## Making a dataframe by combining data slots of all these 1135 trees
sim_null_contrast_1135 <- bind_rows(lapply(tree_sim_null_1135,summary_function))
sim_oc_contrast_1135 <- bind_rows(lapply(tree_sim_oc_1135,summary_function))
## Extracting the columns of event and PIC data from null and OC simulation result of trees with phylogenetic signals
null_data.1135 <- na.omit(sim_null_contrast_1135[c(6,20)])
null_data.1135$label <- "Null simulation"
OC_data.1135 <- na.omit(sim_oc_contrast_1135[c(6,20)])
OC_data.1135$label <- "OC simulation"
## Extracting speciation and duplication PICs from null and OC simulation of trees with phylogenetic signals
speciation_null_1135 <- null_data.1135$pic[which(null_data.1135$Event=="Speciation")]
duplication_null_1135 <- null_data.1135$pic[which(null_data.1135$Event=="Duplication")]
speciation_oc_1135 <- OC_data.1135$pic[which(OC_data.1135$Event=="Speciation")]
duplication_oc_1135 <- OC_data.1135$pic[which(OC_data.1135$Event=="Duplication")]
## Performing Wilcoxon tests on null and OC simulation data of trees with strong phylogenetic signals
wilcox_2_tailed_null_1135 <- two_tailed_wilcox(speciation_null_1135,duplication_null_1135)
wilcox_2_tailed_oc_1135 <- two_tailed_wilcox(speciation_oc_1135,duplication_oc_1135)
```
```{r Fig_S3, dpi=400,fig.width=8, fig.height=7, fig.cap= paste("S3 Fig: Simulation analyses on 1135 trees with strong phylogenetic signals. *P* values are from Wilcoxon two-tailed tests. Dunn et al. used a cutoff of K > 0.551 to identify trees with strong phylogenetic signals. However, trees with higher K statistic can reflect trees with no phylogenetic signal when corresponding *P* values are non-significant. Considering both K statistic and *P* value, we found similar trends as was observed with 2082 trees.")}