-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathpairwise_infections_two_strain_cross 3.R
2492 lines (1967 loc) · 116 KB
/
pairwise_infections_two_strain_cross 3.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#************* Pairwise coinfection analyses *************#
#### 1. Importing Data and Initial Preparation of Data Frame
#### 2. Antigenic Segment Processing
#### 3. Coinfection Supernatant Titers
#### 4. Population Genetic Statistics
#### 5. Testing Strain Specificity
# Pairwise coinfection analyses Script for Influenza LMGSeq
# Script: pairwise_coinfections.R
# This file is an R script that analyzes strain assignment data from a LMGSeq experiment
# Requires running demultiplexing.sh and amplicon_curation_strain_assignment.sh to generate output files for *pairwise coinfections*:
# ./demultiplexing.sh runA "shared/cross_list_runA_pairwise.txt" pairwise_infections
# ./amplicon_curation_strain_assignment.sh runA "shared/cross_list_runA_pairwise.txt" pairwise_infections
#This script is part of the following manuscript:
#Influenza A virus reassortment is strain dependent
#Kishana Y. Taylor | Ilechukwu Agu | Ivy José | Sari Mäntynen | A.J. Campbell | Courtney Mattson | Tsui-wen Chou | Bin Zhou | David Gresham | Elodie Ghedin | Samuel L. Díaz Muñoz
#AFTER SENDING DRAFT - Search for this below
#Load required libraries
library(dplyr)
library(ggplot2)
library(tidyr)
library(reshape2)
library(readr)
library(gridExtra)
library(ggthemes)
# 1. Importing Data and Initial Preparation of Data Frame ----
# List output file names.
# NOTE: list.files() takes a regular expression, not a shell glob; the original
# pattern "*_strain_database17_98_locus.txt" only matched by accident. Anchor
# the suffix and escape the dot so exactly the intended output files match.
file_names <- list.files("outputs/pairwise_infections", "_strain_database17_98_locus\\.txt$", full.names = TRUE)
# Read every output file and stack them into a single data frame.
# lapply + do.call(rbind, ...) avoids the O(n^2) grow-by-rbind-in-a-loop pattern.
two_strain_database17_98_locus <- do.call(
  rbind,
  lapply(file_names, read.table, quote = "\"", comment.char = "")
)
# Assign names to columns
colnames(two_strain_database17_98_locus) <- c("cross_sample_locus", "total_reads", "majority_assigned_reads", "majority_strain_locus")
# Proportion of reads assigned to the majority strain (one strain of the coinfection or the other)
two_strain_database17_98_locus <- mutate(two_strain_database17_98_locus, proportion_assigned = majority_assigned_reads/total_reads)
# Quick QC plot: relationship between total reads and proportion assigned to one strain.
# (qplot() is deprecated as of ggplot2 3.4.0; use ggplot() directly.)
ggplot(two_strain_database17_98_locus, aes(x = proportion_assigned, y = total_reads)) + geom_point()
# Clean up pipeline file-path text left in the identifiers
cross_sample_locus <- gsub("usearch/all/","", two_strain_database17_98_locus$cross_sample_locus)
two_strain_database17_98_locus$cross_sample_locus <- gsub("_98_merged.b6","", cross_sample_locus)
# Split cross_sample_locus into cross / sample / locus columns, keeping the original
two_strain_database17_98_locus <- separate(two_strain_database17_98_locus, cross_sample_locus, c("cross", "sample", "locus"), sep = "_", remove=FALSE)
# Create a cross_sample column to identify unique samples more easily
two_strain_database17_98_locus <- unite(two_strain_database17_98_locus, cross_sample, c("cross", "sample"), sep = "_", remove=FALSE)
# Split majority_strain_locus into the called strain and the called locus
two_strain_database17_98_locus <- separate(two_strain_database17_98_locus, majority_strain_locus, c("majority_strain", "majority_locus"), sep = "_", remove=FALSE)
# Check number of rows in two_strain_database17_98_locus
nrow(two_strain_database17_98_locus)
#[1] 7538
# First genotype plot
ggplot(two_strain_database17_98_locus, aes(x = locus, y = sample)) + geom_point(aes(col=majority_strain, alpha=proportion_assigned), shape=15, size = 6) + theme(axis.text.x = element_text(angle=90)) + facet_wrap(~ cross)
# Note that this plot has two loci for HA and two for NA. This is corrected in
# section 2 to produce proper single HA and NA strain calls.
no_subtype_processing_plot <- ggplot(two_strain_database17_98_locus, aes(x = locus, y = sample)) + geom_point(aes(col=majority_strain, alpha=proportion_assigned), shape=15, size = 6) + theme(axis.text.x = element_text(angle=90)) + facet_wrap(~ cross)
# Add information on what each cross (experimental coinfection) is. The data
# sheet is included in the repo, so read it from a repository-relative path
# rather than an absolute user-specific one.
cross_data_runA <- read.csv("data/cross_data_runA.csv")
# Build the cross identifier used as the join key (e.g. "cross1")
cross_data_runA <- mutate(cross_data_runA, cross = paste0("cross", cross_id))
# Strain-combination column for convenient facetting; order is strainA_strainB
cross_data_runA <- mutate(cross_data_runA, strainAB = paste(strainA, strainB, sep = "_"))
# Join sample information
two_strain_database17_98_locus <- left_join(two_strain_database17_98_locus, cross_data_runA, by = "cross")
# Arrange facets by strain
ggplot(two_strain_database17_98_locus, aes(x = locus, y = sample)) + geom_point(aes(col=majority_strain, alpha=proportion_assigned), shape=15, size = 6) + theme(axis.text.x = element_text(angle=90)) + facet_grid(strainA ~ strainB)
# Summarise number of reads per cross
two_strain_database17_98_locus %>% group_by(strainAB) %>%
  summarise(
    total_cross_reads = sum(total_reads)
  )
#strainAB total_cross_reads
#<chr> <int>
#1 CA09_HK68 1,154,666
#2 CA09_PAN99 664,562
#3 CA09_SI86 907,760
#4 CA09_TX12 1,128,965
#5 HK68_PAN99 1,197,359
#6 HK68_SI86 1,053,568
#7 HK68_TX12 1,383,315
#8 PAN99_SI86 891,395
#9 PAN99_TX12 1,544,278
#10 SI86_TX12 221,086
# Average number of reads per sample in each cross
reads_sample <- two_strain_database17_98_locus %>% group_by(cross_sample, cross) %>%
  summarise(
    total_cross_reads = sum(total_reads)
  )
summary(reads_sample$total_cross_reads)
#Min. 1st Qu. Median Mean 3rd Qu. Max.
#44 5595 10650 10570 15142 33184
# Summarise number of reads per locus
two_strain_database17_98_locus %>% group_by(strainAB, locus) %>%
  summarise(
    total_locus_reads = sum(total_reads),
    average_locus_reads = mean(total_reads)
  )
# 2. Antigenic Segment Processing ----
# This section uses information from each of two PCR loci (HA3a, HA1b, NA1c, NA3a) for each antigenic segment (HA and NA)
# to decide which strain to assign to that given segment in each progeny isolate clone (sample).
# Aside from read data, we also used information from control LMGSeq runs (including single-strain LMGSeq,
# to establish thresholds for how the primers behave with just one strain background).
# Per the comments below: the long amplicons (NA1c, HA1b) are under-represented by Illumina and the short
# ones (NA3a, HA3a) are over-represented, which motivates the asymmetric read thresholds used here.
#Total rows in data frame before antigenic-segment correction
nrow(two_strain_database17_98_locus)
#[1] 7538
#Number of samples typed at each locus, per strain pair
locus_by_cross <- two_strain_database17_98_locus %>% group_by(strainAB, locus) %>%
  summarise(
    number_samples = length(cross_sample),
  )
#NA (neuraminidase) loci - interactive inspection; grep matches locus names containing "NA" (NA1c, NA3a)
View(locus_by_cross[grep("NA", locus_by_cross$locus), ])
#HA (hemagglutinin) loci (HA1b, HA3a)
View(locus_by_cross[grep("HA", locus_by_cross$locus), ])
## 2.1 NA Segment ----
#How many rows (sample/locus calls) involve an NA locus?
nrow(two_strain_database17_98_locus[grep("NA", two_strain_database17_98_locus$locus), ])
#[1] 1300
#Put them in a data frame
na <- two_strain_database17_98_locus[grep("NA", two_strain_database17_98_locus$locus), ]
#Count how many NA loci were called per sample (1 = unambiguous; 2 = both NA1c and NA3a called)
na_by_sample <- group_by(na, cross_sample) %>%
  summarise(
    count = n()
  )
#How many samples have 2 NA loci called?
nrow(subset(na_by_sample, count == 2))
#[1] 347
#Join that count back into the na data frame
na <- left_join(na, na_by_sample)
#How many rows have 2 loci called
nrow(subset(na, count == 2))
#[1] 694
#Twice the number above, because this counts rows (two per sample), not samples
#Subset to the ambiguous samples only; single-locus samples need no correction
na_two <- subset(na, count == 2)
#Sanity check: there should be no CA09_SI86 or HK68_PAN99 samples because those pairs share the same NA subtype
filter(na_two, strainAB == "CA09_SI86" | strainAB == "HK68_PAN99")
#<0 rows> (or 0-length row.names)
#Looks good.
#Now down the line
#A single rule for all NA1c/NA3a conflicts, based on manual inspection:
#sum majority reads per sample per locus, then pivot so each sample is one row with an NA1c and an NA3a column
na_two_wider <- na_two %>% group_by(cross_sample, locus) %>%
  summarise(
    majority_assigned_reads = sum(majority_assigned_reads),
  )
#Check pivot with View() before implementing
#View(pivot_wider(na_two_wider, names_from = locus, values_from = majority_assigned_reads))
#Implement
na_two_wider <- pivot_wider(na_two_wider, names_from = locus, values_from = majority_assigned_reads)
#Number of rows
nrow(na_two_wider)
#347
#So one row per sample
#Reads per sample at the NA1c locus vs the NA3a locus
ggplot(na_two_wider, aes(x = NA1c, y = NA3a)) + geom_point()
#Strategy: build a "remove" data frame keyed by 'cross_sample_locus'; that key is later
# used to drop the losing locus call from the larger database
#Starting rule: NA3a < 100 reads is given to NA1c, and NA1c < 3 reads is given to NA3a
#But first: wherever NA1c is equal or majority, assign NA1c (NA1c is the longer, rarer locus; NA3a is short and heavily favored by Illumina)
#View(subset(na_two_wider, NA3a < NA1c))
nrow(subset(na_two_wider, NA3a <= NA1c))
#141
#Mark the NA3a call in those samples for removal
#These two vectors accumulate removal keys across every step below - order matters
cross_sample <- subset(na_two_wider, NA3a <= NA1c)$cross_sample
cross_sample_locus <- paste(subset(na_two_wider, NA3a <= NA1c)$cross_sample, "NA3a", sep = "_")
#Make temp remove data frame
remove_na <- data.frame(cross_sample,cross_sample_locus)
nrow(remove_na)
#141
#Drop the already-decided samples from na_two_wider, whittling it down step by step
#View(anti_join(na_two_wider, remove_na)) #206 looks good!
na_two_wider <- anti_join(na_two_wider, remove_na)
#Among the remaining samples (where NA3a > NA1c), assign the rest in stages.
#NOTE: each anti_join shrinks na_two_wider before the next condition is applied, so the
# conditions below only see samples not yet decided.
#First: samples with NA1c < 3 reads do not qualify for NA1c, so remove their NA1c call (assign to NA3a)
cross_sample <- c(cross_sample, subset(na_two_wider, NA1c < 3)$cross_sample)
cross_sample_locus <- c(cross_sample_locus, paste(subset(na_two_wider, NA1c < 3)$cross_sample, "NA1c", sep = "_"))
remove_na <- data.frame(cross_sample,cross_sample_locus)
nrow(remove_na)
#201
#Again remove the decided samples from na_two_wider
#View(anti_join(na_two_wider, remove_na)) #146, moving down the line
na_two_wider <- anti_join(na_two_wider, remove_na)
#Second: samples meeting the NA3a criterion are assigned to NA3a, so their NA1c call is removed
#View(subset(na_two_wider, NA3a > 100))
nrow(subset(na_two_wider, NA3a > 100))
#141
#Append to the removal keys
cross_sample <- c(cross_sample, subset(na_two_wider, NA3a > 100)$cross_sample)
cross_sample_locus <- c(cross_sample_locus, paste(subset(na_two_wider, NA3a > 100)$cross_sample, "NA1c", sep = "_"))
#Build remove_na df again
remove_na <- data.frame(cross_sample,cross_sample_locus)
nrow(remove_na)
#342
#Remove those from na_two_wider
#View(anti_join(na_two_wider, remove_na)) #5, just these left!
na_two_wider <- anti_join(na_two_wider, remove_na)
#Finally, the remaining 5 samples are toss-ups - given to NA3a, so their NA1c calls are removed
#No subsetting needed
cross_sample <- c(cross_sample, na_two_wider$cross_sample)
cross_sample_locus <- c(cross_sample_locus, paste(na_two_wider$cross_sample, "NA1c", sep = "_"))
#Build remove_na df for a final time
remove_na <- data.frame(cross_sample,cross_sample_locus)
nrow(remove_na)
#347 goal!!! - exactly one removal per double-called sample
#Quick visual sanity check: rebuild the wide table and show which locus each sample loses
View(
  na_two %>% group_by(cross_sample, locus) %>%
    summarise(
      majority_assigned_reads = sum(majority_assigned_reads),
    ) %>% pivot_wider(names_from = locus, values_from = majority_assigned_reads) %>% left_join(remove_na)
)
#Gosh I love the tidyverse!
#Looks good enough especially at the extremes
#Now remove these rows from the big data frame
#First check what our big data frame looks like before modifying
nrow(two_strain_database17_98_locus)
#7538
#And recall how many of the samples had duplicate NA calls
nrow(subset(na_by_sample, count == 2))
#[1] 347
#That is the number of samples, so there were 347*2 = 694 rows assoc w/ 2 NA's;
# removing one row per sample should leave:
nrow(two_strain_database17_98_locus) - nrow(subset(na_by_sample, count == 2))
#7191
#Preview the removal before changing anything
#View(two_strain_database17_98_locus[!two_strain_database17_98_locus$cross_sample_locus %in% remove_na$cross_sample_locus, ])
#Beautiful, isn't it?
#Actually change data frame
two_strain_database17_98_locus <- two_strain_database17_98_locus[!two_strain_database17_98_locus$cross_sample_locus %in% remove_na$cross_sample_locus, ]
#Verify: every sample now has exactly one NA locus
filter(two_strain_database17_98_locus, locus == "NA3a" | locus == "NA1c") %>%
  group_by(cross_sample) %>%
  summarise(
    count = n()
  )
#All have only 1 NA locus
#Collapse the locus names NA1c/NA3a into a single "NA" segment label
two_strain_database17_98_locus$locus[grep("NA", two_strain_database17_98_locus$locus)] <- "NA"
#DONE!!!!!
##### End NA processing
## 2.2 HA Segment ----
#Same approach as the NA segment (section 2.1), with HA-specific thresholds.
#How many rows (sample/locus calls) involve an HA locus?
nrow(two_strain_database17_98_locus[grep("HA", two_strain_database17_98_locus$locus), ])
#[1] 1124
#Put them in a data frame
ha <- two_strain_database17_98_locus[grep("HA", two_strain_database17_98_locus$locus), ]
#Count how many HA loci were called per sample (1 = unambiguous; 2 = both HA1b and HA3a called)
ha_by_sample <- group_by(ha, cross_sample) %>%
  summarise(
    count = n()
  )
#How many samples have 2 HA loci called?
nrow(subset(ha_by_sample, count == 2))
#[1] 235
#Join that count back into the ha data frame
ha <- left_join(ha, ha_by_sample)
#How many rows have 2 loci called
nrow(subset(ha, count == 2))
#[1] 470
#Twice the number above, because this counts rows (two per sample), not samples
#Subset to the ambiguous samples only; single-locus samples need no correction
ha_two <- subset(ha, count == 2)
#Sanity check (same as NA): there should be no CA09_SI86 or HK68_PAN99 samples because those pairs are same-subtype
filter(ha_two, strainAB == "CA09_SI86" | strainAB == "HK68_PAN99")
#<0 rows> (or 0-length row.names)
#Looks good.
#Now down the line
#Sum majority reads per sample per locus, then pivot so each sample is one row with an HA1b and an HA3a column
ha_two_wider <- ha_two %>% group_by(cross_sample, locus) %>%
  summarise(
    majority_assigned_reads = sum(majority_assigned_reads),
  )
#View
#View(pivot_wider(ha_two_wider, names_from = locus, values_from = majority_assigned_reads))
#Implement
ha_two_wider <- pivot_wider(ha_two_wider, names_from = locus, values_from = majority_assigned_reads)
#Number of rows
nrow(ha_two_wider)
#235
#So one row per sample
#Reads per sample at the HA1b locus vs the HA3a locus
ggplot(ha_two_wider, aes(x = HA1b, y = HA3a)) + geom_point()
#Strategy: build a "remove" data frame keyed by 'cross_sample_locus'; that key is later
# used to drop the losing locus call from the larger database
#First: wherever HA1b is equal or majority, mark the HA3a locus for removal (HA3a is a small locus heavily favored by Illumina sequencing, and yes, David did warn me about this when designing my primers, but there's a lot of variability!)
#View(subset(ha_two_wider, HA3a <= HA1b))
nrow(subset(ha_two_wider, HA3a <= HA1b))
#36
#Mark the HA3a call in those samples for removal
#These two vectors accumulate removal keys across every step below - order matters
cross_sample <- subset(ha_two_wider, HA3a <= HA1b)$cross_sample
cross_sample_locus <- paste(subset(ha_two_wider, HA3a <= HA1b)$cross_sample, "HA3a", sep = "_")
#Make temp remove data frame
remove_ha <- data.frame(cross_sample,cross_sample_locus)
nrow(remove_ha)
#36
#Drop the already-decided samples from ha_two_wider, whittling it down step by step
#View(anti_join(ha_two_wider, remove_ha)) #199 made a small dent
ha_two_wider <- anti_join(ha_two_wider, remove_ha)
#Among the remaining samples (where HA3a > HA1b), assign the rest in stages.
#NOTE: each anti_join shrinks ha_two_wider before the next condition is applied.
#Controls show that the HA1b locus can't really be amplified in H3 strains, but very conservatively
# singleton HA1b's are excluded across the board
#View(subset(ha_two_wider, HA1b == 1))
nrow(subset(ha_two_wider, HA1b == 1))
#47
#First, remove singleton HA1b's
cross_sample <- c(cross_sample, subset(ha_two_wider, HA1b == 1)$cross_sample)
cross_sample_locus <- c(cross_sample_locus, paste(subset(ha_two_wider, HA1b == 1)$cross_sample, "HA1b", sep = "_"))
remove_ha <- data.frame(cross_sample,cross_sample_locus)
nrow(remove_ha)
#83
#Again remove the decided samples from ha_two_wider
#View(anti_join(ha_two_wider, remove_ha)) #152, making progress now
ha_two_wider <- anti_join(ha_two_wider, remove_ha)
#Second: accept all HA1b's > 2 when HA3a < 1000, i.e. remove HA3a's under 1000 reads.
#Again, HA1b is a large amplicon disfavored by Illumina and it doesn't really amplify from H3 samples
#View(subset(ha_two_wider, HA3a < 1000))
nrow(subset(ha_two_wider, HA3a < 1000))
#127
#Append to the removal keys
cross_sample <- c(cross_sample, subset(ha_two_wider, HA3a < 1000)$cross_sample)
cross_sample_locus <- c(cross_sample_locus, paste(subset(ha_two_wider, HA3a < 1000)$cross_sample, "HA3a", sep = "_"))
#Build remove_ha df again
remove_ha <- data.frame(cross_sample,cross_sample_locus)
nrow(remove_ha)
#210
#Remove those from ha_two_wider
#View(anti_join(ha_two_wider, remove_ha)) #25, just these left!
ha_two_wider <- anti_join(ha_two_wider, remove_ha)
#The remaining 25 are pretty clear HA3a's (a few toss-ups near 1000 HA3a reads), so remove their HA1b calls
#No subsetting needed
cross_sample <- c(cross_sample, ha_two_wider$cross_sample)
cross_sample_locus <- c(cross_sample_locus, paste(ha_two_wider$cross_sample, "HA1b", sep = "_"))
#Build remove_ha df for a final time
remove_ha <- data.frame(cross_sample,cross_sample_locus)
nrow(remove_ha)
#235 goal!!! - exactly one removal per double-called sample
#Quick visual sanity check: rebuild the wide table and show which locus each sample loses
View(
  ha_two %>% group_by(cross_sample, locus) %>%
    summarise(
      majority_assigned_reads = sum(majority_assigned_reads),
    ) %>% pivot_wider(names_from = locus, values_from = majority_assigned_reads) %>% left_join(remove_ha)
)
#Gosh I love the tidyverse!
#Looks good enough especially at the extremes
#Now remove these rows from the big data frame
#First check what our big data frame looks like before modifying
nrow(two_strain_database17_98_locus)
#7191
#And recall how many of the samples had duplicate HA calls
nrow(subset(ha_by_sample, count == 2))
#[1] 235
#That is the number of samples, so there were 235*2 = 470 rows assoc w/ 2 HA's;
# removing one row per sample should leave:
nrow(two_strain_database17_98_locus) - nrow(subset(ha_by_sample, count == 2))
#6956
#Preview the removal before changing anything
#View(two_strain_database17_98_locus[!two_strain_database17_98_locus$cross_sample_locus %in% remove_ha$cross_sample_locus, ])
#Beautiful, isn't it?
#Actually change data frame
two_strain_database17_98_locus <- two_strain_database17_98_locus[!two_strain_database17_98_locus$cross_sample_locus %in% remove_ha$cross_sample_locus, ]
#Verify: every sample now has exactly one HA locus
filter(two_strain_database17_98_locus, locus == "HA3a" | locus == "HA1b") %>%
  group_by(cross_sample) %>%
  summarise(
    count = n()
  )
#All have only 1 HA locus
#Collapse the locus names HA1b/HA3a into a single "HA" segment label
two_strain_database17_98_locus$locus[grep("HA", two_strain_database17_98_locus$locus)] <- "HA"
##### End HA processing
# 3. Quality measures of strain assignments and calculation of reassortment frequencies
# How clean are the strain assignments overall?
mean(two_strain_database17_98_locus$proportion_assigned)
#[1] 0.9416985
sd(two_strain_database17_98_locus$proportion_assigned)
#[1] 0.1118182
#Pretty good overall
# Fraction of sample/locus combinations whose majority-strain proportion clears
# each cutoff (same quantity as nrow(subset(..., proportion_assigned > c)) / nrow(...))
sum(two_strain_database17_98_locus$proportion_assigned > 0.80) / nrow(two_strain_database17_98_locus)
#0.8858539
sum(two_strain_database17_98_locus$proportion_assigned > 0.75) / nrow(two_strain_database17_98_locus)
#0.9097182
sum(two_strain_database17_98_locus$proportion_assigned > 0.70) / nrow(two_strain_database17_98_locus)
#0.93387
sum(two_strain_database17_98_locus$proportion_assigned > 0.65) / nrow(two_strain_database17_98_locus)
#0.9544278
# Histogram of the proportion of reads assigned to one strain (expected to sit near 1,
# since plaques should be clonal; sequencing errors at low read counts pull it lower)
ggplot(two_strain_database17_98_locus, aes(x = proportion_assigned)) + geom_histogram()
# First reassortment plot after correcting the antigenic segments
ggplot(two_strain_database17_98_locus, aes(x = locus, y = sample)) +
  geom_point(aes(col = majority_strain, alpha = proportion_assigned), shape = 15, size = 6) +
  facet_wrap(~ cross) +
  theme(axis.text.x = element_text(angle = 90))
#Rename data frame (shorter working name for the corrected per-locus table)
controls_df <- two_strain_database17_98_locus
#Per-sample summary:
# total_segments = number of typed loci for the sample;
# max_majority_strain = largest number of loci assigned to any single strain in the sample
# (equals total_segments only when every typed segment comes from one parent, i.e. non-reassortant)
controls_df_loci_numbers <- controls_df %>%
  group_by(cross_sample) %>%
  summarise(
    total_segments = length(locus),
    max_majority_strain = max(table(majority_strain))
  )
#Attach the per-sample numbers back onto every per-locus row
controls_df <- right_join(controls_df, controls_df_loci_numbers)
#Sorted by parentals, limited to complete genotypes (all 8 segments typed, total_segments > 7)
ggplot(subset(controls_df, total_segments > 7), aes(x = locus, y = reorder(sample, max_majority_strain))) + geom_point(aes(col=majority_strain, alpha=proportion_assigned), shape=15, size = 6) + theme(axis.text.x = element_text(angle=90)) + facet_wrap(~ cross, scales = "free_y")
#Complete genotypes, faceted as a pairwise strainA x strainB grid
ggplot(subset(controls_df, total_segments > 7), aes(x = locus, y = sample)) + geom_point(aes(col=majority_strain, alpha=proportion_assigned), shape=15, size = 6) + theme(axis.text.x = element_text(angle=90), axis.text.y = element_blank()) + facet_grid(strainA ~ strainB, scales = "free_y")
#Pairwise grid, all samples
ggplot(controls_df, aes(x = locus, y = sample)) + geom_point(aes(col=majority_strain, alpha=proportion_assigned), shape=15, size = 6) + theme(axis.text.x = element_text(angle=90), axis.text.y = element_blank()) + facet_grid(strainA ~ strainB, scales = "free_y")
#Not ordered by parentals/reassortants, faceted by cross
ggplot(controls_df, aes(x = locus, y = sample)) + geom_point(aes(col=majority_strain, alpha=proportion_assigned), shape=15, size = 6) + theme(axis.text.x = element_text(angle=90)) + facet_wrap(~ cross)
#Not ordered, pairwise grid
ggplot(controls_df, aes(x = locus, y = sample)) + geom_point(aes(col=majority_strain, alpha=proportion_assigned), shape=15, size = 6) + theme(axis.text.x = element_text(angle=90)) + theme(axis.text.y = element_blank()) + facet_grid(strainA ~ strainB, scales = "free_y")
#Not ordered, at least 7 segments typed
ggplot(subset(controls_df, total_segments > 6), aes(x = locus, y = sample)) + geom_point(aes(col=majority_strain, alpha=proportion_assigned), shape=15, size = 6) + theme(axis.text.x = element_text(angle=90)) + theme(axis.text.y = element_blank()) + facet_grid(strainA ~ strainB, scales = "free_y")
#Figure 2A draft - all sample genotypes, segments ordered in canonical influenza genome order
ggplot(controls_df, aes(x = factor(locus, level = c('PB2f', 'PB1c', 'PAc', 'HA', 'NPd', 'NA', 'Mg', 'NS1d')), y = sample)) + geom_point(aes(col=majority_strain), shape=15, size = 6) + theme(axis.text.x = element_text(angle=90)) + xlab("Segment") + ylab("Plaque Isolate") + theme(axis.text.y = element_blank()) + facet_grid(strainA ~ strainB, scales = "free_y")
#Final Figure 2A, spruced up re: Elodie's comments (Tufte theme, larger text, no legend)
ggplot(controls_df, aes(x = factor(locus, level = c('PB2f', 'PB1c', 'PAc', 'HA', 'NPd', 'NA', 'Mg', 'NS1d')), y = sample)) + geom_point(aes(col=majority_strain), shape=15, size = 6) + xlab("Segment") + ylab("Plaque Isolate") + facet_grid(strainA ~ strainB, scales = "free_y") + theme_tufte() + theme(text = element_text(size = 20, family="Helvetica")) + theme(axis.text.x = element_text(angle=90, size = 17)) + theme(axis.text.y = element_blank()) + theme(strip.text = element_text(face = "bold")) + theme(legend.position = "none")
ggsave("outputs/figure2A.pdf", width = 8, height = 11)
#Now let's generate some numbers!
#Flag each sample as parental: every typed segment assigned to a single strain.
#max_majority_strain and total_segments are constant within a sample, so collapse with
#first() to keep summarise() at one row per group (the original produced one row per
#locus via the now-deprecated multi-row summarise and relied on distinct() to clean up).
parentals <- controls_df %>%
  group_by(cross_sample, cross, sample) %>%
  summarise(
    parental = first(max_majority_strain) == first(total_segments)
  )
parentals <- distinct(parentals)
#This table gives a quick look at the number of samples per cross that are parentals (i.e. not reassortant)
table(parentals$cross, parentals$parental)
#Group by cross_sample and count the number of distinct parental strains contributing segments
controls_df_number_parents <- controls_df %>%
  group_by(cross, cross_sample) %>%
  summarise(
    number_parents = n_distinct(majority_strain),
    max_majority_strain = max(table(majority_strain))
  )
nrow(subset(controls_df_number_parents, number_parents == 2))
#470
nrow(controls_df_number_parents)
#[1] 960
#Make a reassortant indicator column (1 = segments from both parents present)
controls_df_number_parents <- mutate(controls_df_number_parents, reassortant = ifelse(number_parents > 1, yes = 1, no = 0))
#Per-cross (i.e. experimental coinfection) statistics
cross_stats <- controls_df_number_parents %>%
  group_by(cross) %>%
  summarise(
    clones = n(),
    reassortants = sum(reassortant)
  )
#Add proportion reassortant to the per-cross stats
cross_stats <- mutate(cross_stats, prop_reassortant = reassortants / clones)
#Check whether restricting to samples with complete genotypes changes the estimates significantly
#Complete genotypes only (all 8 segments typed, total_segments > 7)
#Count distinct parental strains per sample, as before
controls_df_number_parents_complete <- subset(controls_df, total_segments > 7) %>%
  group_by(cross, cross_sample) %>%
  summarise(
    number_parents = n_distinct(majority_strain)
  )
nrow(subset(controls_df_number_parents_complete, number_parents == 2))
#229
nrow(controls_df_number_parents_complete)
#[1] 417
#Reassortant indicator column (1 = reassortant, 0 = parental)
controls_df_number_parents_complete <- mutate(controls_df_number_parents_complete, reassortant = ifelse(number_parents > 1, 1, 0))
#Per-cross statistics for the complete-genotype subset
cross_stats_complete <- controls_df_number_parents_complete %>%
  group_by(cross) %>%
  summarise(
    clones = n(),
    reassortants = sum(reassortant)
  )
#Add proportion reassortant
cross_stats_complete <- mutate(cross_stats_complete, prop_reassortant = reassortants / clones)
# A tibble: 10 × 4
#cross clones reassortants prop_reassortant
#<chr> <int> <dbl> <dbl>
#1 cross1 11 1 0.0909
#2 cross11 11 8 0.727
#3 cross14 88 64 0.727
#4 cross15 84 49 0.583
#5 cross18 32 32 1
#6 cross19 8 3 0.375
#7 cross2 12 0 0
#8 cross3 66 49 0.742
#9 cross7 82 10 0.122
#10 cross8 23 13 0.565
# Some of the reassortment estimates change, mostly where the sample size is substantially reduced
#Note that one cross becomes 100% reassortant; higher and lower estimates at the extremes
#Make a stats test that will check if the proportions with complete and incomplete genotypes differ
#Testing
#Testing: for each cross, compare the reassortment proportion estimated from
#complete genotypes only (cross_stats_complete) against the estimate from all
#genotypes including incomplete ones (cross_stats). Both data frames are
#summarised by cross, so row i refers to the same cross in each.
#Recorded p-values for rows 1-10:
# 0.8293, 0.1322, 0.9826, 0.7837, 0.2618, 0.2817, 1, 0.7007, 0.9658, 0.2919
#(Original code repeated the same call ten times with hard-coded indices;
# a loop over the rows is equivalent and removes the copy-paste.)
for (i in seq_len(nrow(cross_stats_complete))) {
  print(prop.test(
    c(cross_stats_complete$reassortants[i], cross_stats$reassortants[i]),
    c(cross_stats_complete$clones[i], cross_stats$clones[i])
  ))
}
#All p-values greater than 0.1322, many close to 1
#Going forward with allowing incomplete genotypes, because this is conservative with respect to reassortment: missing segments decrease the chance of detecting a reassortant
#Genotype overview: one panel per cross, plaque isolates (rows) by locus
#(columns), colored by the majority parental strain assigned to each segment;
#transparency encodes the proportion of reads assigned
ggplot(controls_df, aes(x = locus, y = reorder(sample, max_majority_strain))) + geom_point(aes(col=majority_strain, alpha=proportion_assigned), shape=15, size = 6) + theme(axis.text.x = element_text(angle=90)) + facet_wrap(~ cross)
#Plot reassortment proportions per cross
ggplot(cross_stats, aes(x = cross, y = prop_reassortant)) + geom_col()
#Same but reorder bars by height
ggplot(cross_stats, aes(x = reorder(cross, prop_reassortant), y = prop_reassortant)) + geom_col()
#Same but reorder by height with a reference line at 40% reassortment (grey)
#and one at the theoretical free-reassortment expectation 254/256 = 0.9921875 (red)
ggplot(cross_stats, aes(x = reorder(cross, prop_reassortant), y = prop_reassortant)) + geom_col() + geom_hline(yintercept = 0.40, linetype = 2, color = "grey", alpha = 0.75) + geom_hline(yintercept = 0.9921875, linetype = 2, color = "red", alpha = 0.75)
#Add strainAB (strain pair) information to the cross_stats data frame
cross_stats <- left_join(cross_stats, cross_data_runA, by = "cross")
#Same plot, reordered by height, with strain-pair labels on the x axis
ggplot(cross_stats, aes(x = reorder(strainAB, prop_reassortant), y = prop_reassortant)) + geom_col() + ylab("Proportion of Reassortant Plaque Isolates") + xlab("Strains in Experimental Coinfection") + geom_hline(yintercept = 0.40, linetype = 2, color = "grey", alpha = 0.75) + geom_hline(yintercept = 0.9921875, linetype = 2, color = "red", alpha = 0.75)
#Publication-quality version (theme_tufte from ggthemes). This is the final Figure 2B
ggplot(cross_stats, aes(x = reorder(strainAB, prop_reassortant), y = prop_reassortant)) + geom_col() + ylab("Proportion of Reassortant Plaque Isolates") + xlab("Strains in Experimental Coinfection") + geom_hline(yintercept = 0.40, linetype = 2, color = "grey", alpha = 0.75) + geom_hline(yintercept = 0.9921875, linetype = 2, color = "red", alpha = 0.75) + theme_tufte() + theme(text = element_text(size = 20, family="Helvetica")) + theme(axis.text.x = element_text(size = 12)) + theme(legend.position = "none") + theme(panel.grid.major.y = element_line(color = "lightgray",size = 0.5))
ggsave("outputs/figure2B.pdf", width = 12, height = 8.5)
#Now more stats across all crosses: distribution of per-cross reassortment proportions
mean(cross_stats$prop_reassortant)
#[1] 0.4895833
sd(cross_stats$prop_reassortant)
#[1] 0.3007834
min(cross_stats$prop_reassortant)
#[1] 0.04166667
max(cross_stats$prop_reassortant)
#[1] 0.9270833
#Let's calculate some stats on the expected vs observed number of reassortants
#Reassortment as success. The free-reassortment expectation is 254/256: with 8 segments
#there are 2^8 = 256 unique genome constellations, minus the two parental genotypes
#So first let's do an example "manually" with CA09_SI86/cross18 (89 reassortants, 7 parentals, 96 clones)
binom.test(c(89, 7), p = 254/256)
#Exact binomial test
#data: c(89, 7)
#number of successes = 89, number of trials = 96, p-value = 1.153e-05
#alternative hypothesis: true probability of success is not equal to 0.9921875
#95 percent confidence interval:
# 0.8555203 0.9701822
#sample estimates:
# probability of success
#0.9270833
#In the highest-reassortment experimental coinfection we are numerically close to "perfect" or random reassortment (0.9921875 versus 0.9270833),
# but it was statistically significantly lower. So random reassortment is rejected
#Attach the per-cross summary stats (prop_reassortant etc.) to every clone row
controls_df <- right_join(controls_df, cross_stats)
#Does strain identity affect the proportion of reassortants?
ggplot(controls_df, aes(x = cross, y = prop_reassortant)) + geom_point(aes(color = strainA))
#Does strain identity affect the proportion of reassortants? Heatmap attempt
ggplot(controls_df, aes(x = strainA, y = strainB, fill= prop_reassortant)) + geom_tile()
#Heatmap! This is a useful way to visualize reassortment by strain, but perhaps not as intuitive as the final figure (see below)
ggplot(controls_df, aes(x = strainA, y = strainB, fill= prop_reassortant)) + geom_tile() + geom_text(aes(label = round(prop_reassortant, 2))) + theme(legend.position = "none") + scale_fill_gradient(low = "white", high = "red")
#Let's calculate the mean and SD of prop_reassortant over all crosses that
#involve each strain (each cross contributes to both of its two strains, so
#values are "double counted" across strains).
#Recorded values:
# SI86:  mean 0.6484375, sd 0.2102989
# TX12:  mean 0.4192708, sd 0.3128904
# PAN99: mean 0.4921875, sd 0.2761727
# HK68:  mean 0.3515625, sd 0.3261348
# CA09:  mean 0.5364583, sd 0.3866347
#(Original code repeated the mean/sd pair five times and contained a stray
# "la" token before one output comment, which would error with
# "object 'la' not found"; the loop below is the de-duplicated, fixed form.)
for (strain in c("SI86", "TX12", "PAN99", "HK68", "CA09")) {
  strain_props <- subset(cross_stats, strainA == strain | strainB == strain,
                         select = prop_reassortant, drop = TRUE)
  cat(strain, ": mean =", mean(strain_props), ", sd =", sd(strain_props), "\n")
}
#Alternative to heatmap, potential new Figure 3A
#This is just a spreadsheet that lists reassortment frequencies for each strain
#(note they are "double counted": each cross appears once for each of its two strains)
cross_stats_manual <- read.csv("cross_stats_manual.csv")
View(cross_stats_manual)
ggplot(cross_stats_manual, aes(x = strain, y = prop_reassortant)) + geom_point()
#This is Figure 4A, and nicely shows how some strains tend to produce higher reassortment frequencies (see SI86 versus CA09 for an extreme visual example). Stats test follows below
ggplot(cross_stats_manual, aes(x = strain, y = prop_reassortant)) + geom_boxplot() + geom_point(aes(colour = as.factor(prop_reassortant), size = 8)) + scale_color_brewer(palette = 'Set3') + xlab("Strain") + ylab("Proportion of Reassortant Plaque Isolates") + theme(legend.position = "none") + theme_tufte() + theme(text = element_text(size = 20, family="Helvetica")) + theme(axis.text.x = element_text(size = 12)) + theme(legend.position = "none") + theme(panel.grid.major.y = element_line(color = "lightgray",size = 0.5))
#START HERE NOV 16, 2022
#Let's do a quick proportion test on the reassortment data
#This test checks whether the reassortment proportion is equal across all 10 crosses at once
prop.test(cross_stats$reassortants, cross_stats$clones)
# 10-sample test for equality of proportions without continuity correction
#data: cross_stats$reassortants out of cross_stats$clones
#X-squared = 312.8, df = 9, p-value < 2.2e-16
#alternative hypothesis: two.sided
#sample estimates:
# prop 1 prop 2 prop 3 prop 4 prop 5 prop 6 prop 7 prop 8 prop 9 prop 10
#0.16666667 0.43750000 0.73958333 0.61458333 0.92708333 0.63541667 0.04166667 0.78125000 0.13541667 0.41666667
#Pairwise cross-vs-cross comparisons, Holm-adjusted for multiple testing
pairwise.prop.test(cross_stats$reassortants, cross_stats$clones)
#Pairwise comparisons using Pairwise comparison of proportions
#data: cross_stats$reassortants out of cross_stats$clones
# 1 2 3 4 5 6 7 8 9
#2 0.00153 - - - - - - - -
#3 1.7e-13 0.00076 - - - - - - -
#4 1.5e-08 0.18673 0.53732 - - - - - -
#5 < 2e-16 3.3e-11 0.01591 1.6e-05 - - - - -
#6 2.8e-09 0.11943 0.80562 1.00000 5.6e-05 - - - -
#7 0.11943 1.1e-08 < 2e-16 3.9e-15 < 2e-16 5.2e-16 - - -
#8 2.0e-15 5.3e-05 1.00000 0.18392 0.10997 0.31207 < 2e-16 - -
#9 1.00000 0.00017 4.0e-15 6.1e-10 < 2e-16 1.0e-10 0.31207 < 2e-16 -
#10 0.00443 1.00000 0.00024 0.11943 5.5e-12 0.05766 5.0e-08 1.4e-05 0.00054
#P value adjustment method: holm
#Non-significant pairs: 1x7, 1X9, 2x4, 2x6, 2x10, 3x4, 3x6, 3x8, 4x6, 4x8, 4x10, 5x8, 6x8, 6x10, 7x9 = 15 out of the 45 possible pairs
#Map the row numbers above to strain pairs
select(cross_stats, c(prop_reassortant, strainAB))
#1 0.167 HK68_TX12
#2 0.438 CA09_PAN99
#3 0.740 CA09_TX12
#4 0.615 PAN99_SI86
#5 0.927 CA09_SI86
#6 0.635 SI86_TX12
#7 0.0417 CA09_HK68
#8 0.781 HK68_PAN99
#9 0.135 PAN99_TX12
#10 0.417 HK68_SI86
#Bring up ANOVA analysis, buried below
#Import matrix with crosses and strains for an ANOVA
cross_data_runA_matrix <- read.csv("~/Dropbox/mixtup/Documentos/ucdavis/papers/influenza_GbBSeq_human/influenza_GbBSeq/data/cross_data_runA_matrix.csv")
#Columns 2:6 hold the per-strain indicator columns (one column per strain, one row per cross)
cross_matrix <- as.matrix(cross_data_runA_matrix[2:6])
#Now add strain information to cross_stats
cross_stats <- right_join(cross_stats, unique(subset(controls_df, select = c(cross, strainA, strainB))))
table(cross_stats$strainA, cross_stats$prop_reassortant)
#Linear model: reassortment proportion as a function of strain membership
#(cross_matrix rows must be in the same cross order as cross_stats)
model5 = lm(prop_reassortant ~ cross_matrix-1, cross_stats)
#Adding -1 to get all coefficients (assuming intercept doesn't make sense because there is no reassortment without strains)
summary(model5)
#Call:
# lm(formula = prop_reassortant ~ cross_matrix - 1, data = cross_stats)
#Residuals:
# 1 2 3 4 5 6 7 8 9 10
#-0.18750 0.07986 -0.12500 0.09375 0.23264 0.11111 0.02778 -0.07986 -0.21875 0.06597
#Coefficients:
# Estimate Std. Error t value Pr(>|t|)
#cross_matrixCA09 0.008681 0.105766 0.082 0.93777
#cross_matrixHK68 0.348958 0.105766 3.299 0.02149 *
# cross_matrixPAN99 0.515625 0.105766 4.875 0.00457 **
# cross_matrixSI86 0.345486 0.105766 3.267 0.02228 *
# cross_matrixTX12 0.005208 0.105766 0.049 0.96263
#---
# Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#Residual standard error: 0.1958 on 5 degrees of freedom
#Multiple R-squared: 0.9403, Adjusted R-squared: 0.8806
#F-statistic: 15.75 on 5 and 5 DF, p-value: 0.004435
#ANOVA Table
#Bring plots together
#Requires package GridExtra
summary(model5)
as.data.frame(exp(coef(model5))) #exp doesn't make sense here (coefficients are already on the proportion scale, not a log/logit link)
as.data.frame(coef(model5))
#subset(summary(model5)$coef, "Pr(>|z|)" < 0.05)
#Tidy the model5 coefficient table for display
model5_coef <- data.frame(summary(model5)$coef)
colnames(model5_coef) <- c("Coef", "Std Error", "t value", "p")
model5_coef
#Render the coefficient table as a grid grob (requires gridExtra)
model5_coef_rtable <- tableGrob(round(model5_coef, digits=3))
grid.newpage()
grid.draw(model5_coef_rtable)
#Pass the grob, not the raw data frame: grid.arrange() only accepts grobs and
#errors on a data.frame (the original called grid.arrange(model5_coef))
grid.arrange(model5_coef_rtable)
#Now lets look at H1N1 vs H3N2
#Plot by subtype
ggplot(controls_df, aes(x = subtype, y = prop_reassortant)) + geom_point() + geom_boxplot(alpha = 0.25)
#Does intra- vs inter-subtype coinfection matter? This is Figure 4
ggplot(controls_df, aes(x = same_subtype, y = prop_reassortant)) + xlab("Are coinfecting strains the same subtype?") + ylab("Proportion of Reassortant Plaque Isolates") + geom_point(aes(color=cross, size = 7)) + geom_boxplot(alpha = 0.25) + theme(legend.position = "none") + theme_tufte() + theme(text = element_text(size = 20, family="Helvetica")) + theme(axis.text.x = element_text(size = 12)) + theme(legend.position = "none") + theme(panel.grid.major.y = element_line(color = "lightgray",size = 0.5))
#NOTE(review): this ANOVA is run on the clone-level controls_df, so the per-cross
#proportions are repeated for every clone, inflating the degrees of freedom
summary.lm(aov(prop_reassortant ~ subtype, controls_df))
#Call:
#aov(formula = prop_reassortant ~ subtype, data = controls_df)
#Residuals:
# Min 1Q Median 3Q Max
#-0.4485 -0.1984 0.0000 0.1452 0.4161
#Coefficients:
# Estimate Std. Error t value Pr(>|t|)
#(Intercept) 0.927083 0.009296 99.73 <2e-16 ***
# subtypeH3N2 -0.561963 0.010641 -52.81 <2e-16 ***
# subtypeH3N2/H1NI -0.436892 0.010020 -43.60 <2e-16 ***
# ---
# Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#Residual standard error: 0.2404 on 6953 degrees of freedom
#Multiple R-squared: 0.2864, Adjusted R-squared: 0.2862
#F-statistic: 1395 on 2 and 6953 DF, p-value: < 2.2e-16
#This should be a t-test
summary.lm(aov(prop_reassortant ~ same_subtype, controls_df))
#Call:
#aov(formula = prop_reassortant ~ same_subtype, data = controls_df)
#Residuals:
# Min 1Q Median 3Q Max
#-0.4485 -0.3316 0.1244 0.2494 0.4288
#Coefficients:
# Estimate Std. Error t value Pr(>|t|)
#(Intercept) 0.490191 0.004427 110.73 <2e-16 ***
# same_subtypeY 0.008057 0.006948 1.16 0.246
#---
# Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#Residual standard error: 0.2846 on 6954 degrees of freedom
#Multiple R-squared: 0.0001933, Adjusted R-squared: 4.956e-05
#F-statistic: 1.345 on 1 and 6954 DF, p-value: 0.2462
#Now t-test on the unique per-cross proportions (one value per cross)
#NOTE(review): as.vector() on a data.frame returns it essentially unchanged, so
#intrasubtype/intersubtype are one-column data frames, not plain vectors; the
#recorded t.test output below shows the call ran — confirm coercion is intended
intrasubtype <- as.vector(unique(subset(controls_df, same_subtype == "Y", select = "prop_reassortant")))
intersubtype <- as.vector(unique(subset(controls_df, same_subtype == "N", select = "prop_reassortant")))
t.test(intrasubtype, intersubtype)
#Welch Two Sample t-test
#data: intrasubtype and intersubtype
#t = 0.094822, df = 4.4789, p-value = 0.9285
#alternative hypothesis: true difference in means is not equal to 0
#95 percent confidence interval:
# -0.5878083 0.6312111
#sample estimates:
# mean of x mean of y
#0.5026042 0.480902
mean(intrasubtype$prop_reassortant)
#[1] 0.5026042
sd(intrasubtype$prop_reassortant)
#[1] 0.4104902
mean(intersubtype$prop_reassortant)
#[1] 0.4809028
sd(intersubtype$prop_reassortant)
#[1] 0.2480319
#Now let's look at genetic similarity, pairwise genetic identity (PID)
#na.strings = "" keeps the neuraminidase segment name "NA" as the string "NA"
#instead of read.csv's default of treating "NA" as missing
pairwise_genetic_identity <- read.csv("~/Dropbox/mixtup/Documentos/ucdavis/papers/influenza_GbBSeq_human/influenza_GbBSeq/data/pairwise_genetic_identity.csv", na.strings="")
#Make a strainAB column; strainB comes first so the key matches cross_stats
#(removed the trailing empty argument in c(...), which is fragile and errors if
# the vector is ever evaluated outside tidyselect)
pairwise_genetic_identity <- unite(pairwise_genetic_identity, strainAB, c("strainB", "strainA"), sep = "_", remove = FALSE)
#Let's just calculate some basic stats on pairwise genetic identity
#First: average percent identity across all eight segments, per strain pair
average_pid <- group_by(pairwise_genetic_identity, strainAB) %>%
  summarise(count = n(),
            average_pid = mean(pairwise_id)
  )
# A tibble: 10 × 3
#strainAB count average_pid
#<chr> <int> <dbl>
#1 CA09_HK68 8 78.1
#2 CA09_PAN99 8 77.6
#3 CA09_SI86 8 82.8
#4 CA09_TX12 8 77.0
#5 HK68_PAN99 8 93.1
#6 HK68_SI86 8 82.1
#7 HK68_TX12 8 91.3
#8 PAN99_SI86 8 80.4
#9 PAN99_TX12 8 96.4
#10 SI86_TX12 8 81.0
#Second, average percent identity
#Second: average percent identity excluding the antigenic segments (HA and NA), per strain pair
average_pid_no_antigenic <- group_by(subset(pairwise_genetic_identity, segment != "HA" & segment != "NA"), strainAB) %>%
  summarise(count = n(),
            average_pid_no_antigenic = mean(pairwise_id)
  )
# A tibble: 10 × 3
#strainAB count average_pid_no_antigenic
#<chr> <int> <dbl>
#1 CA09_HK68 6 86.6
#2 CA09_PAN99 6 85.6
#3 CA09_SI86 6 84.2
#4 CA09_TX12 6 85.0
#5 HK68_PAN99 6 94.2
#6 HK68_SI86 6 92.1
#7 HK68_TX12 6 92.8
#8 PAN99_SI86 6 89.3
#9 PAN99_TX12 6 97.0
#10 SI86_TX12 6 88.4
#Merge the two PID summaries by the strainAB key. A keyed join is safer than
#the original cbind() + positional column selection, which silently assumed
#both tibbles had identical row order.
average_pid <- left_join(average_pid, average_pid_no_antigenic, by = "strainAB")
average_pid <- average_pid[, c("strainAB", "average_pid", "average_pid_no_antigenic")]
#Attach PID summaries to the per-cross stats
cross_stats <- right_join(cross_stats, average_pid)
#Now let's test for correlation between PID and reassortment rate
ggplot(cross_stats, aes(x = prop_reassortant, y = average_pid)) + geom_point(aes(colour = strainAB), size = 6) + geom_smooth(method = "lm") + theme(text = element_text(size = 20, family="Helvetica")) + theme(axis.text.x = element_text(size = 15)) + theme(panel.grid.major.y = element_line(color = "lightgray",size = 0.5))
#Linear regression: per-cross reassortment proportion vs average pairwise identity
reassortment_pid_model <- lm(prop_reassortant ~ average_pid, cross_stats)
summary(reassortment_pid_model)
#Call:
# lm(formula = prop_reassortant ~ average_pid, data = cross_stats)
#Residuals:
# Min 1Q Median 3Q Max
#-0.50354 -0.20585 0.00032 0.16816 0.42617