# control_infections_two_strain_cross2.R
################ Control Experiment Infections ################
#### 1. Data Sources and Preparing Data
#### 2.
#### 3. Coinfection Supernatant Titers
#### 4. Population Genetic Statistics
#### 5. Testing Strain Specificity
#AFTER SENDING DRAFT - Search for this below
#This script analyses the control coinfection experiments used to test the GbBSeq approach
## Experiments
#We conducted several control experiments:
# 1. Biological replicates of HK68/CA09, HK68/PAN99, and CA09/PAN99:
#-HK68/CA09: Run A Cross 2 and 4
#-HK68/PAN99: Run A Cross 3 and 5
#-CA09/PAN99: Run A Cross 11, 13, and 17
#2. Varying MOI of HK68/CA09 (these strains are plaque purified)
#-Run A Cross 2 MOI = 10
#-Run A Cross 10 MOI = 0.01
#-Run A Cross 6 MOI = 1
#To Do April 7, 2022 START HERE
# 1. Reassortment plots
# 2. Stat tests on these plots (prop tests on plots, exponential fit on MOI plot)
# 3. Down-sampling unique genotypes on replicate plots
# 4. **Note missing cross 15 file from Crick**
#Load libraries
library(dplyr)
library(ggplot2)
library(tidyr)
library(reshape2)
library(readr)
library(ggthemes)  #needed for theme_tufte() in the figure code below
#### 1. Data Sources and Preparing Data ####
#Read files
file_names <- list.files("outputs/control_infections", "*_strain_database17_98_locus.txt", full.names = T)
#file_names <- file_names[c(3,4)]
#Going to append each file's table into one combined data frame
two_strain_database17_98_locus <- NULL
for (file in file_names) {
df <- read.table(file, quote="\"", comment.char="")
two_strain_database17_98_locus <- rbind(two_strain_database17_98_locus, df)
}
#Assign names to columns
colnames(two_strain_database17_98_locus) <- c("cross_sample_locus", "total_reads", "majority_assigned_reads", "majority_strain_locus")
#Calculate Proportions
two_strain_database17_98_locus <- mutate(two_strain_database17_98_locus, proportion_assigned = majority_assigned_reads/total_reads)
#Quick quality control plot
ggplot(two_strain_database17_98_locus, aes(x = proportion_assigned, y = total_reads)) + geom_point()
cross_sample_locus <- gsub("usearch/all/","", two_strain_database17_98_locus$cross_sample_locus)
two_strain_database17_98_locus$cross_sample_locus <- gsub("_98_merged.b6","", cross_sample_locus)
#Then split cross sample locus
two_strain_database17_98_locus <- separate(two_strain_database17_98_locus, cross_sample_locus, c("cross", "sample", "locus"), sep = "_", remove=FALSE)
#Now create a cross_sample column to identify unique samples more easily
two_strain_database17_98_locus <- unite(two_strain_database17_98_locus, cross_sample, c("cross", "sample"), sep = "_", remove=FALSE)
#Now split majority strain and majority locus
two_strain_database17_98_locus <- separate(two_strain_database17_98_locus, majority_strain_locus, c("majority_strain", "majority_locus"), sep = "_", remove=FALSE)
#Check number of rows in two_strain_database17_98_locus
nrow(two_strain_database17_98_locus)
#[1] 6503
#Let's try our first plot
ggplot(two_strain_database17_98_locus, aes(x = locus, y = sample)) + geom_point(aes(col=majority_strain, alpha=proportion_assigned), shape=15, size = 6) + theme(axis.text.x = element_text(angle=90)) + facet_wrap(~ cross)
no_subtype_processing_plot <- ggplot(two_strain_database17_98_locus, aes(x = locus, y = sample)) + geom_point(aes(col=majority_strain, alpha=proportion_assigned), shape=15, size = 6) + theme(axis.text.x = element_text(angle=90)) + facet_wrap(~ cross)
#Okay, now add the information on what each cross is about
cross_data_runA <- read.csv("~/Dropbox/mixtup/Documentos/ucdavis/papers/influenza_GbBSeq_human/influenza_GbBSeq/data/cross_data_runA.csv")
#Prepare the cross_data_runA df for a merge with the big data set, so we can plot based on infection conditions
cross_data_runA <- mutate(cross_data_runA, cross = paste("cross", cross_id, sep = ""))
#Make a strain combination column for more convenient faceting
cross_data_runA <- mutate(cross_data_runA, strainAB = paste(strainA, strainB, sep = "_"))
#Join sample information
two_strain_database17_98_locus <- left_join(two_strain_database17_98_locus, cross_data_runA, by = "cross")
#Arrange facets by strain combination
ggplot(two_strain_database17_98_locus, aes(x = locus, y = sample)) + geom_point(aes(col=majority_strain, alpha=proportion_assigned), shape=15, size = 6) + theme(axis.text.x = element_text(angle=90)) + facet_wrap(~ strainAB)
#Summarise number of reads per cross
two_strain_database17_98_locus %>% group_by(cross) %>%
summarise(
total_cross_reads = sum(total_reads),
)
# A tibble: 9 x 2
#cross total_cross_reads
#<chr> <int>
#cross10 865786
#cross11 664352
#cross13 740611
#cross17 1199638
#cross2 1152974
#cross3 1197359
#cross4 859464
#cross5 1176569
#cross6 583357
#Summarise average number of reads per sample in each cross
reads_sample <- two_strain_database17_98_locus %>% group_by(cross_sample, cross) %>%
summarise(
total_cross_reads = sum(total_reads),
)
summary(reads_sample$total_cross_reads)
#Min. 1st Qu. Median Mean 3rd Qu. Max.
#30 5205 9378 9769 13690 24135
#Summarise number of reads per locus
reads_locus <- two_strain_database17_98_locus %>% group_by(locus) %>%
summarise(
total_locus_reads = sum(total_reads),
average_locus_reads = mean(total_reads),
)
summary(reads_locus$average_locus_reads)
#Min. 1st Qu. Median Mean 3rd Qu. Max.
#14.63 121.79 686.32 1261.37 2575.73 3323.07
mean(reads_locus$average_locus_reads)
#1261.366
sd(reads_locus$average_locus_reads)
#1353.948
#Looking at the plots, need to clean up HA/NA for SI x HK, which is intersubtype
#For now, just going to take any HA1b or NA1c as evidence that HA should be the one
#Need to make this into a function (a hedged sketch follows the End HA marker below):
#################### New Antigenic Segment Processing - Refer ha_na.Rmd ####################
#Total Rows in DF
nrow(two_strain_database17_98_locus)
#[1] 6503
#Locus by cross
locus_by_cross <- two_strain_database17_98_locus %>% group_by(strainAB, locus) %>%
summarise(
number_samples = length(cross_sample),
)
#NA's
View(locus_by_cross[grep("NA", locus_by_cross$locus), ])
#HA's
View(locus_by_cross[grep("HA", locus_by_cross$locus), ])
##### First NA #####
#Let's check how many samples have any NA segments
nrow(two_strain_database17_98_locus[grep("NA", two_strain_database17_98_locus$locus), ])
#[1] 1193
#Put them in a data frame
na <- two_strain_database17_98_locus[grep("NA", two_strain_database17_98_locus$locus), ]
#Now check how many NA's called per sample/cross
na_by_sample <- group_by(na, cross_sample) %>%
summarise(
count = n()
)
#How many have 2 NA loci called?
nrow(subset(na_by_sample, count == 2))
#[1] 331
#Join that information into the na data frame
na <- left_join(na, na_by_sample)
#How many have 2 loci called
nrow(subset(na, count == 2))
#[1] 662
#Remember this is twice the number above, because here we are counting rows rather than samples
#Let's subset to these only, the others we can leave untouched
na_two <- subset(na, count == 2)
#First, there should be no HK68_PAN99
filter(na_two, strainAB == "HK68_PAN99")
#<0 rows> (or 0-length row.names)
#Looks good.
#Now down the line
#Using a single rule for all NA1/NA3 decisions based on manual inspection
na_two_wider <- na_two %>% group_by(cross_sample, locus) %>%
summarise(
majority_assigned_reads = sum(majority_assigned_reads),
)
#View
#View(pivot_wider(na_two_wider, names_from = locus, values_from = majority_assigned_reads))
#Implement
na_two_wider <- pivot_wider(na_two_wider, names_from = locus, values_from = majority_assigned_reads)
#Number of rows
nrow(na_two_wider)
#331
#So one row per sample
ggplot(na_two_wider, aes(x = NA1c, y = NA3a)) + geom_point()
#So our strategy is to add a "remove" data frame, which will have a 'cross_sample_locus' column,
# that will then be used as a key to remove rows from the larger database
#Our starting rule: samples with NA3a < 100 should go to NA1c, and samples with NA1c < 3 should go to NA3a
#But first, let's just agree that any where NA1c is equal or majority, should be NA1c
#View(subset(na_two_wider, NA3a < NA1c))
nrow(subset(na_two_wider, NA3a <= NA1c))
#10
#So go ahead and mark the NA3a's in those samples for removal
#Create a data frame that I can then join to na_two_wider
cross_sample <- subset(na_two_wider, NA3a <= NA1c)$cross_sample
cross_sample_locus <- paste(subset(na_two_wider, NA3a <= NA1c)$cross_sample, "NA3a", sep = "_")
#Make temp remove data frame
remove_na <- data.frame(cross_sample,cross_sample_locus)
nrow(remove_na)
#10
#Now remove those from the na_two_wider data frame so I can whittle things down
#View(anti_join(na_two_wider, remove_na)) #321 looks good!
na_two_wider <- anti_join(na_two_wider, remove_na)
#Now among the ones where NA3a is greater than NA1c, let's assign remaining:
#We will iteratively add to the remove DF as we go through each condition; may circle back to this later
#First, flag those that do not meet the criteria to assign to NA1c and add them to the remove list
cross_sample <- c(cross_sample, subset(na_two_wider, NA1c < 3)$cross_sample)
cross_sample_locus <- c(cross_sample_locus, paste(subset(na_two_wider, NA1c < 3)$cross_sample, "NA1c", sep = "_"))
remove_na <- data.frame(cross_sample,cross_sample_locus)
nrow(remove_na)
#168
#Now again remove those from the na_two_wider data frame so I can whittle things down
#View(anti_join(na_two_wider, remove_na)) #163, moving down the line
na_two_wider <- anti_join(na_two_wider, remove_na)
#Second, now for those samples that meet the criteria to assign to NA3a, I need to add NA1c to the remove list for those loci
#View(subset(na_two_wider, NA3a > 100))
nrow(subset(na_two_wider, NA3a > 100))
#159
#Add recursively
cross_sample <- c(cross_sample, subset(na_two_wider, NA3a > 100)$cross_sample)
cross_sample_locus <- c(cross_sample_locus, paste(subset(na_two_wider, NA3a > 100)$cross_sample, "NA1c", sep = "_"))
#Build remove_na df again
remove_na <- data.frame(cross_sample,cross_sample_locus)
nrow(remove_na)
#327
#Now for prob final time remove those from the na_two_wider data frame so I can whittle things down
#View(anti_join(na_two_wider, remove_na)) #4, just these left!
na_two_wider <- anti_join(na_two_wider, remove_na)
#Finally, these remaining 4 are clearly NA1c, so we need to mark their NA3a rows for removal
#No subsetting needed
cross_sample <- c(cross_sample, na_two_wider$cross_sample)
cross_sample_locus <- c(cross_sample_locus, paste(na_two_wider$cross_sample, "NA3a", sep = "_"))
#Build remove_na df for a final time
remove_na <- data.frame(cross_sample,cross_sample_locus)
nrow(remove_na)
#331 goal!!!
#Now let's do a quick sanity check, visually with view
View(
na_two %>% group_by(cross_sample, locus) %>%
summarise(
majority_assigned_reads = sum(majority_assigned_reads),
) %>% pivot_wider(names_from = locus, values_from = majority_assigned_reads) %>% left_join(remove_na)
)
#Gosh I love the tidyverse!
#Looks good enough especially at the extremes
#Now let's remove these offending rows from the big data frame!
#First check what our big data frame looks like before modifying
nrow(two_strain_database17_98_locus)
#6503
#And recall how many of the samples had duplicate NA calls
nrow(subset(na_by_sample, count == 2))
#[1] 331
#Recall that is the number of samples, so I had 331*2 = 662 rows in the data frame assoc w/ 2 NA's;
# we remove one of the two NA rows per sample (331 rows), so want to get that down to this amount:
nrow(two_strain_database17_98_locus) - nrow(subset(na_by_sample, count == 2))
#6172
#Prepare to actually remove rows
#View(two_strain_database17_98_locus[!two_strain_database17_98_locus$cross_sample_locus %in% remove_na$cross_sample_locus, ])
#Beautiful, isn't it?
#Actually change data frame
two_strain_database17_98_locus <- two_strain_database17_98_locus[!two_strain_database17_98_locus$cross_sample_locus %in% remove_na$cross_sample_locus, ]
#Now check
filter(two_strain_database17_98_locus, locus == "NA3a" | locus == "NA1c") %>%
group_by(cross_sample) %>%
summarise(
count = n()
)
#All have only 1 NA locus
#Now rename all the NA loci
two_strain_database17_98_locus$locus[grep("NA", two_strain_database17_98_locus$locus)] <- "NA"
#DONE!!!!!
##### End NA #####
##### Second HA #####
#Let's check how many samples have any HA segments
nrow(two_strain_database17_98_locus[grep("HA", two_strain_database17_98_locus$locus), ])
#[1] 843
#Put them in a data frame
ha <- two_strain_database17_98_locus[grep("HA", two_strain_database17_98_locus$locus), ]
#Now check how many HA's called per sample/cross
ha_by_sample <- group_by(ha, cross_sample) %>%
summarise(
count = n()
)
#How many have 2 HA loci called?
nrow(subset(ha_by_sample, count == 2))
#[1] 52
#Join that information into the ha data frame
ha <- left_join(ha, ha_by_sample)
#How many have 2 loci called
nrow(subset(ha, count == 2))
#[1] 104
#Remember this is twice the number above, because here we are counting rows rather than samples
#Let's subset to these only, the others we can leave untouched
ha_two <- subset(ha, count == 2)
#First, there should be no CA09_SI86 samples or HK68_PAN99
filter(ha_two, strainAB == "HK68_PAN99")
#<0 rows> (or 0-length row.names)
#Looks good.
#Now down the line
#Using a single rule for all HA1/HA3 decisions based on manual inspection
ha_two_wider <- ha_two %>% group_by(cross_sample, locus) %>%
summarise(
majority_assigned_reads = sum(majority_assigned_reads),
)
#View
#View(pivot_wider(ha_two_wider, names_from = locus, values_from = majority_assigned_reads))
#Implement
ha_two_wider <- pivot_wider(ha_two_wider, names_from = locus, values_from = majority_assigned_reads)
#Number of rows
nrow(ha_two_wider)
#52
#So one row per sample
ggplot(ha_two_wider, aes(x = HA1b, y = HA3a)) + geom_point()
#So our strategy is to add a "remove" data frame, which will have a 'cross_sample_locus' column,
# that will then be used as a key to remove rows from the larger database
#First, let's just agree that any sample where HA1b is equal or majority, the HA3a locus should be marked for removal
#View(subset(ha_two_wider, HA3a <= HA1b))
nrow(subset(ha_two_wider, HA3a <= HA1b))
#2
#So go ahead and mark the HA3a's in those samples for removal
#Create a data frame that I can then join to ha_two_wider
cross_sample <- subset(ha_two_wider, HA3a <= HA1b)$cross_sample
cross_sample_locus <- paste(subset(ha_two_wider, HA3a <= HA1b)$cross_sample, "HA3a", sep = "_")
#Make temp remove data frame
remove_ha <- data.frame(cross_sample,cross_sample_locus)
nrow(remove_ha)
#2
#Now remove those from the ha_two_wider data frame so I can whittle things down
#View(anti_join(ha_two_wider, remove_ha)) #50 made a small dent
ha_two_wider <- anti_join(ha_two_wider, remove_ha)
#Now among the ones where HA3a is greater than HA1b, let's assign remaining:
#We will iteratively add to the remove DF as we go through each condition; may circle back to this later
#Controls show that HA1b can't really be amplified by HA3a's. But very conservatively I will exclude
# singleton HA1b's across the board
#View(subset(ha_two_wider, HA1b == 1))
nrow(subset(ha_two_wider, HA1b == 1))
#31
#First, remove singleton HA1b's
cross_sample <- c(cross_sample, subset(ha_two_wider, HA1b == 1)$cross_sample)
cross_sample_locus <- c(cross_sample_locus, paste(subset(ha_two_wider, HA1b == 1)$cross_sample, "HA1b", sep = "_"))
remove_ha <- data.frame(cross_sample,cross_sample_locus)
nrow(remove_ha)
#33
#Now again remove those from the ha_two_wider data frame so I can whittle things down
#View(anti_join(ha_two_wider, remove_ha)) #33, making progress now
ha_two_wider <- anti_join(ha_two_wider, remove_ha)
#Second, will accept all HA1b's >2, when HA3a's are < 1000. So remove HA3a's under 1000.
#View(subset(ha_two_wider, HA3a < 1000))
nrow(subset(ha_two_wider, HA3a < 1000))
#8
#Add recursively
cross_sample <- c(cross_sample, subset(ha_two_wider, HA3a < 1000)$cross_sample)
cross_sample_locus <- c(cross_sample_locus, paste(subset(ha_two_wider, HA3a < 1000)$cross_sample, "HA3a", sep = "_"))
#Build remove_ha df again
remove_ha <- data.frame(cross_sample,cross_sample_locus)
nrow(remove_ha)
#41
#Now for prob final time remove those from the ha_two_wider data frame so I can whittle things down
#View(anti_join(ha_two_wider, remove_ha)) #11, just these left!
ha_two_wider <- anti_join(ha_two_wider, remove_ha)
#These are pretty clear HA3a's; maaaaybe one or two potential toss-ups close to 1000 HA3a reads, but giving them to HA3a
#No subsetting needed
cross_sample <- c(cross_sample, ha_two_wider$cross_sample)
cross_sample_locus <- c(cross_sample_locus, paste(ha_two_wider$cross_sample, "HA1b", sep = "_"))
#Build remove_ha df for a final time
remove_ha <- data.frame(cross_sample,cross_sample_locus)
nrow(remove_ha)
#52 goal!!!
#Now let's do a quick sanity check, visually with view
View(
ha_two %>% group_by(cross_sample, locus) %>%
summarise(
majority_assigned_reads = sum(majority_assigned_reads),
) %>% pivot_wider(names_from = locus, values_from = majority_assigned_reads) %>% left_join(remove_ha)
)
#Gosh I love the tidyverse!
#Looks good enough especially at the extremes
#Now let's remove these offending rows from the big data frame!
#First check what our big data frame looks like before modifying
nrow(two_strain_database17_98_locus)
#6172
#And recall how many of the samples had duplicate HA calls
nrow(subset(ha_by_sample, count == 2))
#[1] 52
#Recall that is the number of samples, so I had 52*2 = 104 rows in the data frame assoc w/ 2 HA's;
# we remove one of the two HA rows per sample (52 rows), so want to get that down to this amount:
nrow(two_strain_database17_98_locus) - nrow(subset(ha_by_sample, count == 2))
#6120
#Prepare to actually remove rows
#View(two_strain_database17_98_locus[!two_strain_database17_98_locus$cross_sample_locus %in% remove_ha$cross_sample_locus, ])
#Beautiful, isn't it?
#Actually change data frame
two_strain_database17_98_locus <- two_strain_database17_98_locus[!two_strain_database17_98_locus$cross_sample_locus %in% remove_ha$cross_sample_locus, ]
#Now check
filter(two_strain_database17_98_locus, locus == "HA3a" | locus == "HA1b") %>%
group_by(cross_sample) %>%
summarise(
count = n()
)
#All have only 1 HA locus
#Now rename all the HA loci
two_strain_database17_98_locus$locus[grep("HA", two_strain_database17_98_locus$locus)] <- "HA"
##### End HA #####
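#The NA and HA clean-ups above follow the same pattern, so they could be wrapped in a
# helper like this (an untested sketch, not used below; thresholds mirror the manual
# rules, min_small = 3 / 2 and big_cutoff = 100 / 1000 for NA1c-NA3a and HA1b-HA3a,
# with boundary handling only approximate):
flag_duplicate_locus <- function(df, locus_small, locus_big, min_small, big_cutoff) {
  wide <- df %>%
    filter(locus %in% c(locus_small, locus_big)) %>%
    group_by(cross_sample, locus) %>%
    summarise(majority_assigned_reads = sum(majority_assigned_reads), .groups = "drop") %>%
    pivot_wider(names_from = locus, values_from = majority_assigned_reads) %>%
    filter(!is.na(.data[[locus_small]]) & !is.na(.data[[locus_big]]))
  small <- wide[[locus_small]]
  big <- wide[[locus_big]]
  #Decide which locus loses in each sample, mirroring the manual decision rules above
  losing <- ifelse(big <= small, locus_big,
            ifelse(small < min_small, locus_small,
            ifelse(big > big_cutoff, locus_small, locus_big)))
  #Return cross_sample_locus keys that could be dropped from the big data frame
  paste(wide$cross_sample, losing, sep = "_")
}
#e.g. flag_duplicate_locus(na_two, "NA1c", "NA3a", min_small = 3, big_cutoff = 100)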
#Now should be able to generate the plots for each of the conditions
#First, replicate of standard (using crosses to subset because it's a bit complicated)
ggplot(subset(two_strain_database17_98_locus, cross == "cross2" | cross == "cross4" | cross == "cross3" | cross == "cross5" | cross == "cross13" | cross == "cross17"), aes(x = locus, y = sample)) + geom_point(aes(col=majority_strain, alpha=proportion_assigned), shape=15, size = 6) + theme(axis.text.x = element_text(angle=90)) + facet_wrap(strainAB ~ cross, nrow = 3)
#Leaving out cross == "cross11" to make it pairs
#Second by MOI
ggplot(subset(two_strain_database17_98_locus, cross == "cross4" | cross == "cross10" | cross == "cross6"), aes(x = locus, y = sample)) + geom_point(aes(col=majority_strain, alpha=proportion_assigned), shape=15, size = 6) + theme(axis.text.x = element_text(angle=90)) + facet_wrap(strainAB ~ as.numeric(moi), ncol =3)
#-Run A Cross 4 MOI = 10
#-Run A Cross 10 MOI = 0.01
#-Run A Cross 6 MOI = 1
#Now run some numbers and stats tests
#t-test for trypsin no-trypsin (may need to control for strains)
#regression? ordered ANOVA for timepoints OR binomial regression? Reassortant/not
#Plot
ggplot(two_strain_database17_98_locus, aes(x = locus, y = sample)) + geom_point(aes(col=majority_strain, alpha=proportion_assigned), shape=15, size = 6) + theme(axis.text.x = element_text(angle=90)) + facet_wrap(~ cross)
controls_df <- two_strain_database17_98_locus
## Remove PB1 ##
nrow(controls_df)
#6120
nrow(filter(controls_df, locus != "PB1c"))
#5706
#Should not affect number of samples/clones
#Execute
controls_df <- filter(controls_df, locus != "PB1c")
## Remove PB1 ##
#First need to get the number of typed loci for each sample
controls_df_loci_numbers <- controls_df %>%
group_by(cross_sample) %>%
summarise(
total_segments = length(locus),
max_majority_strain = max(table(majority_strain))
)
controls_df <- right_join(controls_df, controls_df_loci_numbers)
#Now should be able to generate the plots for each of the conditions
#First, replicate of standard (using crosses to subset because it's a bit complicated)
ggplot(subset(controls_df, cross == "cross2" & total_segments > 6 | cross == "cross4" & total_segments > 6 | cross == "cross3" & total_segments > 6 | cross == "cross5" & total_segments > 6 | cross == "cross11" & total_segments > 6 | cross == "cross13" & total_segments > 6 | cross == "cross15" & total_segments > 6), aes(x = locus, y = sample)) + geom_point(aes(col=majority_strain, alpha=proportion_assigned), shape=15, size = 6) + theme(axis.text.x = element_text(angle=90)) + facet_wrap(strainAB ~ cross, nrow =3)
#Second by MOI
ggplot(subset(controls_df, cross == "cross4" & total_segments > 6 | cross == "cross10" & total_segments > 6 | cross == "cross6" & total_segments > 6), aes(x = locus, y = sample)) + geom_point(aes(col=majority_strain, alpha=proportion_assigned), shape=15, size = 6) + theme(axis.text.x = element_text(angle=90)) + facet_wrap(strainAB ~ as.numeric(moi), ncol =3)
#-Run A Cross 4 MOI = 10
#-Run A Cross 10 MOI = 0.01
#-Run A Cross 6 MOI = 1
#We lose a ton of samples; should look into removing PB1, and if it makes no difference, keep all the data
#Now sort by parentals
#First, replicate of standard (using crosses to subset because it's a bit complicated)
ggplot(subset(controls_df, cross == "cross2" | cross == "cross4" | cross == "cross3" | cross == "cross5" | cross == "cross11" | cross == "cross13" | cross == "cross17"), aes(x = locus, y = reorder(sample, max_majority_strain))) + geom_point(aes(col=majority_strain, alpha=proportion_assigned), shape=15, size = 6) + theme(axis.text.x = element_text(angle=90)) + facet_wrap(strainAB ~ cross, nrow =3)
#Second by MOI
ggplot(subset(controls_df, cross == "cross4" | cross == "cross10" | cross == "cross6"), aes(x = locus, y = reorder(sample, max_majority_strain))) + geom_point(aes(col=majority_strain, alpha=proportion_assigned), shape=15, size = 6) + theme(axis.text.x = element_text(angle=90)) + facet_wrap(strainAB ~ as.numeric(moi), ncol =3)
#-Run A Cross 2 MOI = 10
#-Run A Cross 10 MOI = 0.01
#-Run A Cross 6 MOI = 1
######### Using 8 segments ##############
#Now let's generate some numbers!
parentals <- subset(controls_df, total_segments > 7) %>%
group_by(cross_sample, cross, sample) %>%
summarise(
parental = max_majority_strain == total_segments,
#total_segments == max_majority_strain
)
parentals <- distinct(parentals)
table(parentals$cross, parentals$parental)
#Group by cross_sample and calculate number of parents for each
controls_df_number_parents <- controls_df %>%
group_by(cross, cross_sample) %>%
summarise(
number_parents = length(unique(majority_strain)),
#constellation = paste(majority_strain_locus, sep = ","),
max_majority_strain = max(table(majority_strain))
)
nrow(subset(controls_df_number_parents, number_parents == 2))
#305
nrow(controls_df_number_parents)
#[1] 864
#Make a reassortant column
controls_df_number_parents <- mutate(controls_df_number_parents, reassortant = ifelse(number_parents > 1, yes = 1, no = 0))
cross_stats <- controls_df_number_parents %>%
group_by(cross) %>%
summarise(
clones = length(number_parents),
reassortants = sum(reassortant)
#constellation = paste(majority_strain_locus, sep = ","),
#max_majority_strain = max(table(majority_strain))
)
#Add proportion reassortant to data frame
cross_stats <- mutate(cross_stats, prop_reassortant = reassortants / clones)
#Now add cross data
cross_stats <- left_join(cross_stats, cross_data_runA, by = "cross")
#Plot reassortment proportions
#First, replicate of standard (using crosses to subset because it's a bit complicated)
#Figure 1A
ggplot(subset(cross_stats, cross == "cross2" | cross == "cross4" | cross == "cross3" | cross == "cross5" | cross == "cross11" | cross == "cross13" | cross == "cross17"), aes(x = strainAB, y = prop_reassortant)) + xlab("Experimental Coinfection: Strain Combination") + ylab("Proportion of Reassortant Plaque Isolates") + geom_point(size = 3) + theme_tufte() + theme(text = element_text(size = 20, family="Helvetica")) + theme(axis.text.x = element_text(size = 12)) + theme(legend.position = "none") + theme(panel.grid.major.y = element_line(color = "lightgray",size = 0.5))
#Difference between reassortment rate estimates within each strain combination
reassortment_rate_replicate_estimate <- subset(cross_stats, cross == "cross2" | cross == "cross4" | cross == "cross3" | cross == "cross5" | cross == "cross11" | cross == "cross13" | cross == "cross17")
subset(reassortment_rate_replicate_estimate, strainAB == "CA09_HK68")
#0.156 - 0.0417 = 0.1143
subset(reassortment_rate_replicate_estimate, strainAB == "CA09_PAN99")
#0.5-0.438 = 0.062
#0.5 - 0.406 = 0.094
#0.438 - 0.406 = 0.032
subset(reassortment_rate_replicate_estimate, strainAB == "HK68_PAN99")
#0.781-0.771 = 0.01
#Average deviation
mean(c(0.1143, 0.062, 0.094, 0.032, 0.01))
#0.06246
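#A hedged sketch of the prop tests flagged in the to-do list at the top (illustrative
# only, not a reported result): compare the two CA09_HK68 replicate estimates with a
# two-sample test of equal proportions, using the counts already in cross_stats
ca_hk_reps <- subset(reassortment_rate_replicate_estimate, strainAB == "CA09_HK68")
prop.test(x = ca_hk_reps$reassortants, n = ca_hk_reps$clones)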
#Figure 1B
#Second by MOI
ggplot(subset(cross_stats, cross == "cross10" | cross == "cross6" | cross == "cross4"), aes(x = moi, y = prop_reassortant)) + xlab("Multiplicity of Infection") + ylab("Proportion of Reassortant Plaque Isolates") + scale_x_log10() + geom_point(size = 3) + geom_smooth(method = "lm", formula = y ~ exp(x), se = T) + theme_tufte() + theme(text = element_text(size = 20, family="Helvetica")) + theme(axis.text.x = element_text(size = 12)) + theme(legend.position = "none") + theme(panel.grid.major.y = element_line(color = "lightgray",size = 0.5))
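#A hedged sketch of the "exponential fit on MOI plot" to-do item (an illustrative model,
# not necessarily the fit drawn above): binomial regression of reassortant counts on
# log10(MOI), using the per-cross counts in cross_stats
moi_df <- subset(cross_stats, cross %in% c("cross4", "cross6", "cross10"))
moi_fit <- glm(cbind(reassortants, clones - reassortants) ~ log10(as.numeric(moi)),
               family = binomial, data = moi_df)
summary(moi_fit)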
######### Using 8 segments ##############
######### Using 7 segments ##############
#Now let's generate some numbers!
parentals <- subset(controls_df, total_segments > 6) %>%
group_by(cross_sample, cross, sample) %>%
summarise(
parental = max_majority_strain == total_segments,
#total_segments == max_majority_strain
)
parentals <- distinct(parentals)
table(parentals$cross, parentals$parental)
#Group by cross_sample and calculate number of parents for each
controls_df_number_parents <- controls_df %>%
group_by(cross, cross_sample) %>%
summarise(
number_parents = length(unique(majority_strain)),
#constellation = paste(majority_strain_locus, sep = ","),
max_majority_strain = max(table(majority_strain))
)
nrow(subset(controls_df_number_parents, number_parents == 2))
#305
nrow(controls_df_number_parents)
#[1] 864
#Make a reassortant column
controls_df_number_parents <- mutate(controls_df_number_parents, reassortant = ifelse(number_parents > 1, yes = 1, no = 0))
cross_stats <- controls_df_number_parents %>%
group_by(cross) %>%
summarise(
clones = length(number_parents),
reassortants = sum(reassortant)
#constellation = paste(majority_strain_locus, sep = ","),
#max_majority_strain = max(table(majority_strain))
)
#Add proportion reassortant to data frame
cross_stats <- mutate(cross_stats, prop_reassortant = reassortants / clones)
#Now add cross data
cross_stats <- left_join(cross_stats, cross_data_runA, by = "cross")
#Plot reassortment proportions
#First, replicate of standard (using crosses to subset because it's a bit complicated)
ggplot(subset(cross_stats, cross == "cross2" | cross == "cross4" | cross == "cross3" | cross == "cross5" | cross == "cross11" | cross == "cross13" | cross == "cross17"), aes(x = strainAB, y = prop_reassortant)) + geom_point()
#Second by MOI
ggplot(subset(cross_stats, cross == "cross10" | cross == "cross6" | cross == "cross4"), aes(x = as.factor(moi), y = prop_reassortant)) + geom_point()
######### Using 7 segments ##############
######### No PB1 ##############
#Now let's generate some numbers!
#Now let's go ahead and drop PB1
#How many rows?
nrow(controls_df)
#6120
nrow(filter(controls_df, locus != "PB1c"))
#5706
#Should not affect number of samples/clones
#Execute
controls_df <- filter(controls_df, locus != "PB1c")
parentals <- subset(controls_df, total_segments > 6) %>%
group_by(cross_sample, cross, sample) %>%
summarise(
parental = max_majority_strain == total_segments,
#total_segments == max_majority_strain
)
parentals <- distinct(parentals)
table(parentals$cross, parentals$parental)
#Group by cross_sample and calculate number of parents for each
controls_df_number_parents <- controls_df %>%
group_by(cross, cross_sample) %>%
summarise(
number_parents = length(unique(majority_strain)),
#constellation = paste(majority_strain_locus, sep = ","),
max_majority_strain = max(table(majority_strain))
)
nrow(subset(controls_df_number_parents, number_parents == 2))
#542
nrow(controls_df_number_parents)
#[1] 864
#Make a reassortant column
controls_df_number_parents <- mutate(controls_df_number_parents, reassortant = ifelse(number_parents > 1, yes = 1, no = 0))
cross_stats <- controls_df_number_parents %>%
group_by(cross) %>%
summarise(
clones = length(number_parents),
reassortants = sum(reassortant)
#constellation = paste(majority_strain_locus, sep = ","),
#max_majority_strain = max(table(majority_strain))
)
#Add proportion reassortant to data frame
cross_stats <- mutate(cross_stats, prop_reassortant = reassortants / clones)
#Now add cross data
cross_stats <- left_join(cross_stats, cross_data_runA, by = "cross")
#Plot reassortment proportions
#First, replicate of standard (using crosses to subset because it's a bit complicated)
ggplot(subset(cross_stats, cross == "cross2" | cross == "cross4" | cross == "cross3" | cross == "cross5" | cross == "cross11" | cross == "cross13" | cross == "cross15"), aes(x = strainAB, y = prop_reassortant)) + geom_point()
#Second by MOI
ggplot(subset(cross_stats, cross == "cross2" | cross == "cross10" | cross == "cross6"), aes(x = as.factor(moi), y = prop_reassortant)) + geom_point()
######### No PB1 ##############
#### 4. Downsampling's effect on unique genotypes, reassortants ####
#We have reassortment data, now let's generate unique genotype data
#Making a table for the GenAlEx format for population genetics analyses
genotypes_table <- group_by(controls_df, cross_sample, strainAB, locus) %>%
summarise(
majority_strain = majority_strain,
)
#Looking good!
genotypes_table <- spread(genotypes_table, locus, majority_strain)
#Merge all genotypes into one and make a code for each genotype
genotypes_table <- unite(genotypes_table, "genotype", c("HA", "Mg", "NA", "NPd", "NS1d", "PAc", "PB1c", "PB2f"), remove = F)
genotypes_table <- separate(genotypes_table, cross_sample, into = c("cross", "sample"), sep = "_", remove = F)
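#A hedged first step towards the GenAlEx export mentioned above (an assumption, not the
# original export code): recode each segment's strain call as an integer "allele";
# GenAlEx's own header rows are not constructed here
seg_cols <- intersect(c("HA", "Mg", "NA", "NPd", "NS1d", "PAc", "PB1c", "PB2f"), names(genotypes_table))
genotypes_numeric <- genotypes_table %>%
  ungroup() %>%
  mutate(across(all_of(seg_cols), ~ as.integer(factor(.x))))
#write.csv(genotypes_numeric, "outputs/control_infections/genotypes_numeric.csv", row.names = FALSE)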
#Graph by cross first
ggplot(genotypes_table, aes(x = strainAB, fill = genotype)) + geom_bar() + theme(legend.position = "none")
genotype_counts <- group_by(genotypes_table, cross, strainAB, genotype) %>%
summarise(
number_plaques = n()
)
#Graph Counts per cross
ggplot(genotype_counts, aes(x = genotype, y = number_plaques, fill = genotype)) + geom_col() + theme(legend.position = "none", axis.text.x = element_blank()) + facet_wrap(~ strainAB, ncol = 1, scales = "free")
#Now need to count the number of unique genotypes per cross
genotype_counts_cross <- group_by(genotype_counts, cross, strainAB) %>%
summarise(
total_genotypes = length(unique(genotype))
)
genotypes_df <- right_join(genotype_counts, genotype_counts_cross, by = c("cross", "strainAB"))
#Same graph as above
ggplot(genotypes_df, aes(x = genotype, y = number_plaques, fill = genotype)) + geom_col() + theme(legend.position = "none", axis.text.x = element_blank()) + facet_wrap(~ strainAB, ncol = 1, scales = "free")
#Now genotypes per cross
ggplot(genotype_counts_cross, aes(x = reorder(strainAB, total_genotypes), y = total_genotypes)) + geom_col()
#Old reassortment graph from above; the red dashed line at 0.9921875 = 254/256 is presumably the expected reassortant proportion under free reassortment of 8 segments
ggplot(cross_stats, aes(x = reorder(cross, prop_reassortant), y = prop_reassortant)) + geom_col() + geom_hline(yintercept = 0.40, linetype = 2, color = "grey", alpha = 0.75) + geom_hline(yintercept = 0.9921875, linetype = 2, color = "red", alpha = 0.75)
#Now let's merge these two into one data frame to compare
cross_stats <- right_join(cross_stats, genotype_counts_cross, by = c("cross", "strainAB"))
#Side by side
ggplot(cross_stats, aes(x = reorder(strainAB, prop_reassortant), y = prop_reassortant)) + geom_col() + geom_hline(yintercept = 0.40, linetype = 2, color = "red", alpha = 0.75)
#Genotypes per cross
ggplot(genotype_counts_cross, aes(x = reorder(strainAB, total_genotypes), y = total_genotypes)) + geom_col()
#Clearly correlated
ggplot(cross_stats, aes(x = prop_reassortant, y = total_genotypes)) + geom_point(aes(colour = strainAB)) + geom_smooth(method = "lm")
#Quick stats
reassortment_genotypes_model <- lm(total_genotypes ~ reassortants, cross_stats)
summary(reassortment_genotypes_model)
#NS
#Call:
# lm(formula = total_genotypes ~ reassortants, data = cross_stats)
#Residuals:
# Min 1Q Median 3Q Max
#-23.5162 -0.9902 3.1114 5.4161 13.4439
#Coefficients:
# Estimate Std. Error t value Pr(>|t|)
#(Intercept) 5.0579 6.2471 0.810 0.44479
#reassortants 0.6554 0.1435 4.566 0.00259 **
# ---
# Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#Residual standard error: 11.33 on 7 degrees of freedom
#Multiple R-squared: 0.7486, Adjusted R-squared: 0.7127
#F-statistic: 20.85 on 1 and 7 DF, p-value: 0.002586
#AFTER SENDING DRAFT Cool, even works with this but need to adjust for number of plaques as in pairwise
#AFTER SENDING DRAFT - Also did I remove incomplete genotypes? Not reporting above, though
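#A hedged sketch of the adjustment flagged above (an assumption, not the reported model):
# include the number of plaques per cross as a covariate so the genotype count is not
# driven purely by sampling depth
summary(lm(total_genotypes ~ reassortants + clones, data = cross_stats))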
#Now let's do rarefaction analysis, but on real genotypes
#So I am looking to focus on the genotypes column
nrow(genotypes_table)
#864
genotypes_table_completes <- na.omit(genotypes_table)
nrow(genotypes_table_completes)
#259
#Let's start with HK68_PAN99 and treat both crosses as one (there should be two of these)
hk_pan_genotypes <- as.vector(subset(genotypes_table_completes, strainAB == "HK68_PAN99", select = genotype))
#Number of total genotypes in HK/PAN crosses
nrow(hk_pan_genotypes)
#96
nrow(unique(hk_pan_genotypes))
#40
#a little under half are unique
#So now need to sample from these genotypes, like so:
sample(hk_pan_genotypes$genotype, 1)
#Need to do this 1000 times and see the effect of sampling
#This loop takes a fixed number of attempts and figures out how many unique genotypes you get
n <- 1
simulation_df <- NULL
set.seed(99)
for (n in 1:1000) {
uniques <- NULL
simul_genotypes <- NULL
tries <- NULL
current_try <- NULL
i <- 1
while(i < 97) {
simul_genotypes <- rbind(simul_genotypes, sample(hk_pan_genotypes$genotype, 1))
nrow(simul_genotypes)
uniques <- nrow(unique(simul_genotypes))
#print(uniques)
current_try <- data.frame(attempt = i, unique_genotypes = as.numeric(uniques), trial_number = n)
tries <- rbind(tries, current_try)
#trial_number <- rep(n, times = nrow(tries))
#tries <- cbind(tries, trial_number)
i <- i+1
}
n <- n + 1
simulation_df <- rbind(simulation_df, tries)
}
#Ok, that worked surprisingly well
#Let's add the observed crosses as points manually, to provide context
#Cross 3 and cross 5 are the ones in question here for HK68_PAN99
genotypes_table_completes %>% filter(strainAB == "HK68_PAN99") %>% group_by(cross) %>%
summarise(
total_plaques = length(genotype),
genotypes = length(unique(genotype)),
)
## A tibble: 2 x 3
#cross total_plaques genotypes
#<chr> <int> <int>
#cross3 66 31
#cross5 30 19
#So I got 31 unique genotypes / 66 plaques in one cross and 19/30 in the other. Not bad!
#Let's plot
ggplot(simulation_df, aes(x = attempt, y = unique_genotypes, colour = trial_number)) + xlab("Number of Plaque Isolates with Complete Genotypes") + ylab("Number of Total Unique Genotypes") + geom_point(aes(alpha=0.3)) + geom_point(aes(x=30, y=19), colour="red", size = 3.5) + geom_point(aes(x=66, y=31), colour="red", size = 3.5) + theme(legend.position = "none")
simulation_df %>% filter(attempt == 96) %>% group_by(attempt) %>%
summarise(
min_unique = min(unique_genotypes),
max_unique = max(unique_genotypes),
mean_unique = mean(unique_genotypes),
sd_unique = sd(unique_genotypes),
n_trials = n(),
)
# attempt min_unique max_unique mean_unique sd_unique n_trials
# <dbl> <dbl> <dbl> <dbl> <dbl> <int>
# 96 22 38 29.4 2.36 1000
#So this means I never hit the full 40 genotypes in 96 draws, and only 1/1000 trials comes close (38)
#One of the crosses actually does pretty well, reaching roughly the simulated average for 96 draws while sampling only 66 plaques
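#A possibly faster equivalent of the rarefaction loop above (a sketch under the same
# draw-one-at-a-time-with-replacement assumption; not used for the numbers reported
# here): each trial draws n_draws genotypes and tracks the running count of distinct ones
rarefy_once <- function(genotypes, n_draws = 96) {
  draws <- sample(genotypes, n_draws, replace = TRUE)
  cumsum(!duplicated(draws))
}
#e.g. mean rarefaction curve over 1000 trials:
#rowMeans(replicate(1000, rarefy_once(hk_pan_genotypes$genotype)))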
#How does this look different with another strain combo:
#Let's move on to CA09_PAN99 and treat all three crosses as one (there should be three of these)
ca_pan_genotypes <- as.vector(subset(genotypes_table_completes, strainAB == "CA09_PAN99", select = genotype))
#Number of total genotypes in CA/PAN crosses
nrow(ca_pan_genotypes)
#132
nrow(unique(ca_pan_genotypes))
#21
#surprisingly low!
#So now need to sample from these genotypes, like so:
sample(ca_pan_genotypes$genotype, 1)
#Need to do this 1000 times and see the effect of sampling
#This loop takes a fixed number of attempts and figures out how many unique genotypes you get
n <- 1
simulation_df <- NULL
set.seed(09)
for (n in 1:1000) {
uniques <- NULL
simul_genotypes <- NULL
tries <- NULL
current_try <- NULL
i <- 1
while(i < 97) {
simul_genotypes <- rbind(simul_genotypes, sample(ca_pan_genotypes$genotype, 1))
nrow(simul_genotypes)
uniques <- nrow(unique(simul_genotypes))
#print(uniques)
current_try <- data.frame(attempt = i, unique_genotypes = as.numeric(uniques), trial_number = n)
tries <- rbind(tries, current_try)
#trial_number <- rep(n, times = nrow(tries))
#tries <- cbind(tries, trial_number)
i <- i+1
}
n <- n + 1
simulation_df <- rbind(simulation_df, tries)
}
#Ok, that worked surprisingly well
#Let's add the observed crosses as points manually, to provide context
#Cross 11, 13, and 17 are the ones in question here for CA09_PAN99
genotypes_table_completes %>% filter(strainAB == "CA09_PAN99") %>% group_by(cross) %>%
summarise(
total_plaques = length(genotype),
genotypes = length(unique(genotype)),
)
## A tibble: 3 x 3
#cross total_plaques genotypes
#<chr> <int> <int>
#cross11 11 5
#cross13 47 6
#cross17 74 11