-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain_gag_sca.m
124 lines (89 loc) · 3.78 KB
/
main_gag_sca.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
%%
% *CODE FOR IDENTIFYING CO-EVOLUTIONARY SECTORS IN HIV GAG USING SCA*
% Adpated from [Halabi2009]
% Halabi, N., Rivoire, O., Leibler, S. & Ranganathan, R.
% Protein sectors: evolutionary units of three-dimensional structure.
% Cell 138, 774?86 (2009).
%% Setting up paths (of functions and data files required) and necessary parameters
clear all;close all;clc
% Adding paths of required functions and datafiles
addpath functions
addpath datafiles
% Setting font type and size
set(0,'DefaultAxesFontName','Arial')
set(0,'DefaultTextFontName','Arial')
set(0,'DefaultAxesFontSize',20)
set(0,'DefaultTextFontSize',20)
% Specifying the name of protein, and whether to run the shuffling code
protein = 'Gag';
%% Loading biochemical domains and immunological information
biodomain = biochemical_domains(protein);
%% Preprocessing the data
load msa_gag
[B,Bcap,lambda,site_freq_no_mutation,true_indices,freq_bin,prev_aa,N,M,ls] = preprocessing_gag(msa);
%% SCA
B = double(~B);
% definition of binary MSA is opposite in SCA with 0 for mutant and
% 1 for most prevalent amino acid at a site
freq_bg=[.073 .025 .050 .061 .042 .072 .023 .053 .064 .089...
.023 .043 .052 .040 .052 .073 .056 .063 .013 .033];
% Background probabilities for the prevalent amino acids:
freq_bg_bin=freq_bg(prev_aa);
%SCA weighting matrix
W=log(freq_bin.*(1-freq_bg_bin)./(freq_bg_bin.*(1-freq_bin)));
%SCA matrix:
freq_pairs_bin=B'*B/N;
C_bin=freq_pairs_bin-freq_bin'*freq_bin;
C_sca=(W'*W).*abs(C_bin);
[eigvec_sca,lambda_sca] = eig_sort(C_sca);
%Shuffling the MSA to obtain number of significant eigenvectors
N_samples=100; %No. of randomized samples
lambda_rnd=zeros(N_samples,1);
for s=1:N_samples
B_rnd=zeros(N,M);
for pos=1:M
perm_seq=randperm(N);
B_rnd(:,pos)=B(perm_seq(:),pos);
end
freq_pairs_bin_rnd=B_rnd'*B_rnd/N;
C_bin_rnd=freq_pairs_bin_rnd-freq_bin'*freq_bin;
C_rnd = (W'*W).*abs(C_bin_rnd);
[eigvect_unsorted,lambda_unsorted]=eig(C_rnd);
lambda_rnd(s)=max(diag(lambda_unsorted));
end
alpha = sum(lambda_sca>max(lambda_rnd))
%The above procedure always results in alpha = 1 for SCA. However, we set
%the value of alpha equal to the one obtained using the Pearson
%correlation matrix based method (PCA and SPCA). Note that this value was
%chosen by inspection in Halabi et al. 2009.
alpha = 6
%% Forming SCA sectors
epsilon = 2/sqrt(M);
%sector with indices according to mutating sites (M = 451)
sec_eig_sca = cell(1,alpha);
%sector with indices according to Gag (ls = 500)
sec_eig_sca_true = cell(1,alpha);
%sector with indices according to Gag and 100% conserved sites incorporated
sec_eig_sca_incl_cs = cell(1,alpha);
for kk = 2:alpha+1
sec_eig_sca{kk-1} = find(abs(eigvec_sca(:,kk))>epsilon);
sec_eig_sca_true{kk-1} = true_indices(sec_eig_sca{kk-1});
sec_eig_sca_incl_cs{kk-1} = combining_sectors_with_conserved_sites(sec_eig_sca_true{kk-1},site_freq_no_mutation,1,ls);
end
%% Statistical signficance of biochemical association of SCA sectors
%Table 2
sec_asso_sca = zeros(1,length(biodomain));
%sector associated to a particular biochemical domain
pvalue_asso_sca = zeros(1,length(biodomain));
%statistical significance of the sector-biochemical domain association
for kk = 1:length(biodomain)
[sec_asso_sca(kk),pvalue_asso_sca(kk)] = ...
compute_association(biodomain(kk).sites,sec_eig_sca_incl_cs,sec_eig_sca_true,ls,M);
end
fprintf('\n-----------------------------------------------------------------------------------\n')
fprintf('Significance of inferred %s sectors using SCA\n',protein)
fprintf('-----------------------------------------------------------------------------------\n')
for kk = 1:length(biodomain)
fprintf('Sector %d is associated with %s (P = %.2e).\n',...
sec_asso_sca(kk),biodomain(kk).name,pvalue_asso_sca(kk));
end