#################################################################
# TNER: Tri-Nucleotide Error Reducer
# TNER is a novel background error suppression tool that provides a robust
# estimation of background noise to reduce sequencing errors using tri-nucleotide
# context data for better identification of low frequency somatic mutations in
# ctDNA (Circulating tumor DNA).
# Last updated: 08/20/2018
# Contact: shibing.deng {at} pfizer.com & tao.xie {at} pfizer.com or xietao2000 {at} gmail.com
# Pfizer Early Clinical Development Biostatistics
# Pfizer Early Oncology Development and Clinical Research
#################################################################
#################################################################
# Terms of Use for the TNER (Tri-Nucleotide Error Reducer) package
# Pfizer has attempted to provide accurate and current information on TNER. However, Pfizer makes no representations or warranties that the information contained or published on TNER will be suitable for your specific purposes or for any other purposes. You agree to indemnify, defend, and hold Pfizer harmless from all claims, damages, liabilities and expenses, including without limitation reasonable attorney's fees and costs, whether or not a lawsuit or other proceeding is filed, that in any way arise out of or relate to your use of TNER.
# ALL INFORMATION AND DATA PROVIDED ON TNER IS PROVIDED "AS-IS" WITHOUT WARRANTY OF ANY KIND. PFIZER MAKES NO REPRESENTATIONS OR WARRANTIES CONCERNING TNER OR ANY OTHER MATTER WHATSOEVER, INCLUDING WITHOUT LIMITATION ANY EXPRESS, IMPLIED OR STATUTORY WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NON-INFRINGEMENT OF THIRD PARTY RIGHTS, TITLE, ACCURACY, COMPLETENESS OR ARISING OUT OF COURSE OF CONDUCT OR TRADE CUSTOM OR USAGE, AND DISCLAIMS ALL SUCH EXPRESS, IMPLIED OR STATUTORY WARRANTIES. PFIZER MAKES NO WARRANTY OR REPRESENTATION THAT YOUR USE OF TNER WILL NOT INFRINGE UPON THE INTELLECTUAL PROPERTY OR OTHER RIGHTS OF ANY THIRD PARTY. FURTHER, PFIZER SHALL NOT BE LIABLE IN ANY MANNER WHATSOEVER FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, CONSEQUENTIAL OR EXEMPLARY DAMAGES ARISING OUT OF OR IN ANY WAY RELATED TO TNER, THE USE OF, OR INABILITY TO USE, ANY OF THE INFORMATION OR DATA CONTAINED OR REFERENCED IN TNER, OR ANY OTHER MATTER. THE FOREGOING EXCLUSIONS AND LIMITATIONS SHALL APPLY TO ALL CLAIMS AND ACTIONS OF ANY KIND AND ON ANY THEORY OF LIABILITY, WHETHER BASED ON CONTRACT, TORT OR ANY OTHER GROUNDS, AND REGARDLESS OF WHETHER A PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES, AND NOTWITHSTANDING ANY FAILURE OF ESSENTIAL PURPOSE OF ANY LIMITED REMEDY. BY USING TNER, YOU FURTHER AGREE THAT EACH WARRANTY DISCLAIMER, EXCLUSION OF DAMAGES OR OTHER LIMITATION OF LIABILITY HEREIN IS INTENDED TO BE SEVERABLE AND INDEPENDENT OF THE OTHER CLAUSES OR SENTENCES BECAUSE THEY EACH REPRESENT SEPARATE ELEMENTS OF RISK ALLOCATION BETWEEN THE PARTIES.
# TNER may contain information from third party websites, which may or may not be marked with the name of the source. Such information does not necessarily represent the views or opinions of Pfizer, and Pfizer shall have no responsibility whatsoever for such information. All information from third party websites are the sole responsibility of the person or entity that provides and/or maintains such website. As a TNER user, you are solely responsible for any information that you display, generate, transmit or transfer while using TNER, and for the consequences of such actions.
# Should any TNER user provide general, scientific or other feedback information, whether in the form of questions, comments, or suggestions to Pfizer, regarding the content of Pfizer's website or otherwise, such information shall not be deemed to be confidential or proprietary to you or to any other party. Pfizer shall have no obligation of any kind with respect to such information and Pfizer shall have the right, without limitation, to reproduce, use, disclose, merge, display, make derivatives of and distribute such information to others. Pfizer shall also have the right, without limitation, to use and exploit such information, including ideas, concepts, know-how, inventions, techniques or other materials contained in such information for any purpose whatsoever, including but not limited to, making changes or improvements to TNER and/or developing, manufacturing, marketing, selling or distributing products and technologies incorporating such information. However, you agree that Pfizer has no obligation whatsoever to respond to your comments or to change or correct any information on TNER based on your comments.
# Pfizer reserves the right to alter the content of TNER in any way, at any time and for any reason, with or without prior notice to you, and Pfizer will not be liable in any way for possible consequences of such changes or for inaccuracies, typographical errors or omissions in the contents hereof. Nothing contained herein shall be construed as conferring by implication, estoppel or otherwise any license or right under any patent, trademark or other intellectual property of Pfizer or any third party. Except as expressly provided above, nothing contained herein shall be construed as conferring any right or license under any Pfizer copyrights.
# Pfizer also reserves the right to modify these Terms of Use at any time and for any reason, with or without prior notice to you. You should always review the most current Terms of Use herein before using this website. By using TNER, you agree to the current Terms of Use posted on this site. You also agree that these Terms of Use constitute the entire agreement between you and Pfizer regarding the subject matter hereof and supersede all prior and contemporaneous understandings, oral or written, regarding such subject matter. In addition, if any provision of these Terms of Use is found by a court of competent jurisdiction to be invalid, void or unenforceable, the remaining provisions shall remain in full force and effect, and the affected provision shall be revised so as to reflect the original intent of the parties hereunder to the maximum extent permitted by applicable law.
# BY USING TNER, YOU ACKNOWLEDGE AND AGREE THAT YOU HAVE READ, UNDERSTOOD AND AGREE TO ALL OF THESE TERMS OF USE.
#################################################################
#################################################################
# Command line usage:
# Rscript TNER_main.R TNER_example_input_file.txt hs_ave_bg_error.csv hs_depth.csv
#################################################################
# Input:
# There can be up to three input parameters. The first (required) is a nucleotide frequency file in txt format from the sample to be polished (it can be generated using pileup2actg.pl);
# the 2nd is the average background error rate file calculated from healthy subjects (default name: "hs_ave_bg_error.csv");
# and the 3rd is the coverage depth file (default name: "hs_depth.csv"). Both hs_ave_bg_error.csv and hs_depth.csv can be generated with "Create_background_error_rate.R" using the user's normal control data.
########
# Output:
# TNER will create an error-polished nucleotide frequency file ("*.clean.txt")
# and a txt file listing the background errors identified by TNER ("*.noise.txt").
#
#################################################################
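#################################################################
# For orientation, a minimal sketch of the expected pileup input columns, inferred from the
# code below (column order and all values are illustrative assumptions, not prescriptive):
# CHR   POSITION  REF  DEPTH  R     r     A+  C+  T+  G+  A-  C-  T-  G-
# chr7  55241707  G    3998   2000  1990  0   1   2   0   0   3   2   0
# where R/r are forward/reverse reference read counts and A+..G- are forward/reverse
# non-reference base counts.
#################################################################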
args = commandArgs(trailingOnly=TRUE) # 1-3 arguments from input
work.dir="." # working directory where the input and output files are stored
# Some optional parameters to tune
input.alpha = 0.01 #Polish strength parameter
#input.filter = FALSE #turn on the filter to filter out those bases with abnormal data
# processing the input
#####################
tnt.rate.file=tnt.depth.file=NULL
if (length(args)==0) {
stop("Usage: Rscript TNER_main.R TNER_example_input_file.txt hs_ave_bg_error.csv hs_depth.csv.", call.=FALSE)
} else if (length(args)>1) {
# optional input files from the command line
tnt.rate.file=args[2] # average background error rate file
if (length(args)>2) tnt.depth.file=args[3] # coverage depth file (or a single numeric average depth)
}
input.file=args[1] # first argument is input file
#the input file can be a single file (.txt, .freq, etc) or a list of files saved in a csv file
if (length(grep('csv',input.file))>0) input.pileup.file=unlist(read.csv(input.file,header=F,as.is=T)) else
input.pileup.file=input.file
if (is.null(tnt.rate.file)) tnt.rate.file="hs_ave_bg_error.csv" # provide the default average background error rate file name if no input is provided
if (is.null(tnt.depth.file)) tnt.depth.file="hs_depth.csv" else # provide the default depth file name if it is not provided by input
if (grepl('^[0-9]+$',tolower(tnt.depth.file))) { tnt.depth=as.numeric(tnt.depth.file) # if depth file is a number
tnt.depth.file=NULL
if (tnt.depth<1) stop("coverage depth input value invalid: must be a positive integer")}
# end of processing input
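# Example invocation using a fixed average coverage depth (a hypothetical 10000x) in place of a depth file:
# Rscript TNER_main.R TNER_example_input_file.txt hs_ave_bg_error.csv 10000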
s=c("A","C","T","G") # four nucleotide letter
#read in the average background error and depth file - it must be a csv file
######################################################
# check if there is a rowname column based on first row and 1st column value
h1=scan(tnt.rate.file,what="character",nlines=1,sep=",")
if (h1[1]=="") hs.bkg.ave = read.csv(tnt.rate.file,row.names = 1) else
{
hs.bkg.ave = read.csv(tnt.rate.file)
rownames(hs.bkg.ave)=paste(hs.bkg.ave$CHR,hs.bkg.ave$POSITION,sep=":")
}
if (!is.null(tnt.depth.file)) hs.depth=read.csv(tnt.depth.file,row.names = 1)
#create a Tri-nucleotide (tnt) code for each position
ref=toupper(hs.bkg.ave$REF)
n=length(ref)
tnt=c("",paste0(ref[-((n-1):n)],ref[-c(1,n)],ref[-(1:2)]),"")
# remove the tnt code for the position at sequence gaps
x=hs.bkg.ave$POSITION
x2=diff(x)
tnt[c(FALSE,x2!=1)]="" #remove tri-nucleotide after gap
tnt[c(x2!=1,FALSE)]="" #remove tri-nucleotide before gap
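# Illustration (hypothetical reference sequence): if REF at five consecutive positions is
# A C G T T, the code above gives tnt = c("", "ACG", "CGT", "GTT", ""), i.e. each interior
# position is labeled with the reference bases at (position-1, position, position+1), while
# the first/last positions and positions flanking a coverage gap get no tri-nucleotide code.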
## Empirical Bayesian - a shrinkage approach
##################################################
#calculate the tri-nucleotide average and SD
tnt.bg.ave=aggregate(hs.bkg.ave[,s],list(TNT=tnt),function(x) mean(x[x<.1],na.rm=T)) # SNPs, defined as positions with error rate > 10%, are excluded here
tnt.bg.sd=aggregate(hs.bkg.ave[,s],list(TNT=tnt),function(x) sd(x[x<0.1],na.rm=T))
# match the tnt average to the input data
loc.tnt.ave=tnt.bg.ave[match(tnt,tnt.bg.ave$TNT),-1]
loc.tnt.sd=tnt.bg.sd[match(tnt,tnt.bg.sd$TNT),-1]
w=loc.tnt.ave/(loc.tnt.ave+hs.bkg.ave[,s]) #shrinkage weight parameter
w[is.na(w)]=1
# average mutation error rate
shr.bg.ave=w*loc.tnt.ave+(1-w)*hs.bkg.ave[,s]
rownames(shr.bg.ave)=rownames(hs.bkg.ave)
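# Worked example with made-up rates: if the tri-nucleotide context average for a given base change
# is 0.001 and the position-specific healthy-subject average is 0.003, then w = 0.001/(0.001+0.003) = 0.25
# and the shrunken rate is 0.25*0.001 + 0.75*0.003 = 0.0025, i.e. the position-specific estimate is
# pulled part of the way toward its tri-nucleotide context mean.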
#average depth matrix
if (exists("hs.depth")) depth.mat=matrix(rep(apply(hs.depth,1,mean),4),ncol=4) else
depth.mat=matrix(tnt.depth,nrow=nrow(pileup1.data),ncol=4)
# function to define BMER and call mutation for input file
polish.pred.3nt=function(bgrd.ave=shr.bg.ave, #average mutation rate data matrix used to define the background
polish.method="Binomial", # Background polish method: Poisson or Binomial
bgrd.depth=depth.mat, #average seq depth for background
pred.sample=file.path(work.dir,input.pileup.file[1]), #sample file name to be polished, should be a tab delimited pileup text file
alpha=input.alpha, #alpha = significance level, control polishing strength
filter=FALSE, #filter out those bases based on filtering criteria
polyclone=TRUE, # output polyclonal results; if FALSE, only the base with the highest mutation rate is reported
out.dir=NULL) # output directory; if NULL, polished data are written to the same directory as the input data
{ cat(paste("Polishing",pred.sample,"\n"))
pileup.data=read.table(file=pred.sample,header=T,check.names = F,as.is=T) #read in the data
rownames(pileup.data)=paste(pileup.data$CHR,pileup.data$POSITION,sep=":")
#missing base in the data if any
miss.pos=which(is.na(match(rownames(bgrd.ave),rownames(pileup.data))))
if (length(miss.pos)>0) {
miss.nt=pileup.data[rownames(bgrd.ave)[miss.pos],]
rownames(miss.nt)=rownames(bgrd.ave)[miss.pos]
miss.nt$CHR=substr(rownames(miss.nt),1,regexpr(":",rownames(miss.nt))-1)
miss.nt$POSITION=as.numeric(substr(rownames(miss.nt),regexpr(":",rownames(miss.nt))+1,nchar(rownames(miss.nt))))
miss.nt[is.na(miss.nt)]=0
miss.nt$DEPTH=1
miss.nt$REF=""
pileup.data=rbind(pileup.data,miss.nt)
pileup.data=pileup.data[match(rownames(bgrd.ave),rownames(pileup.data)),]
}
out.pileup=pileup.data #save a copy for output
#below is not a critical step: it renames the forward-strand columns to capital letters (A+ to A)
#and the reverse-strand columns to lower-case letters (A- to a)
if (length(grep("\\+",names(pileup.data)))>0) names(pileup.data)=gsub("\\+","",names(pileup.data))
if (length(grep("-",names(pileup.data)))>0) {
rev.base.col=grep("-",names(pileup.data)) # data columns holding the reverse-strand pileup
names(pileup.data)[rev.base.col]=gsub("-","",tolower(names(pileup.data)[rev.base.col]))
}
#change all reference letter acgt to upper case
pileup.data$REF=toupper(pileup.data$REF)
fs=match(s,names(pileup.data)) # columns with forward strand
bs=match(tolower(s),names(pileup.data)) # columns with reverse strand
pileup.data$fs.sum=apply(pileup.data[,fs],1,sum) #sum read count of FS
pileup.data$bs.sum=apply(pileup.data[,bs],1,sum) #sum read count of RS
pileup.data$mut.rate1 = pileup.data$fs.sum/(pileup.data$R+pileup.data$fs.sum)*100 #FS mutation rate in %
pileup.data$mut.rate2 = pileup.data$bs.sum/(pileup.data$r+pileup.data$bs.sum)*100 #RS mutation rate in %
pileup.data$mut.rate = (pileup.data$fs.sum+pileup.data$bs.sum)/pileup.data$DEPTH*100 #Total mutation rate in %
pileup.data$mut.F = apply(pileup.data[,fs],1,function(x) names(which.max(x))) #FS mutation letter
pileup.data$mut.R = apply(pileup.data[,bs],1,function(x) toupper(names(which.max(x)))) #RS mutation letter
pileup.data$mut.B = apply(pileup.data[,fs]+pileup.data[,bs],1,function(x) names(which.max(x))) #overall (FS+RS) mutation letter
pileup.data$mut.rateF = apply(pileup.data[,fs],1,max)/(pileup.data$R+pileup.data$fs.sum)*100 #FS specific mutation rate
pileup.data$mut.rateR = apply(pileup.data[,bs],1,max)/(pileup.data$r+pileup.data$bs.sum)*100 #RS specific mutation rate
pileup.data$mut.rate.adj=pileup.data$mut.rate
#set mutation rate to 0 if it is >10% (SNP) - removed in 7/10/17
#pileup.data$mut.rate.adj[pileup.data$mut.rate.adj>=10]=0
#sum FS and RS count for each base
d1= pileup.data[,c(rbind(s,tolower(s)))]
d1.2 = sapply(s,function(x) rowSums(d1[,toupper(names(d1))==x])) #sum mutation count
if (polish.method=="Binomial") d2=d1.2/pileup.data$DEPTH else d2=d1.2 # calculate mutation rate for the testing sample
#calculate the average count using input rate and depth
ave.read.count=bgrd.ave*bgrd.depth
#define background for poisson or binomial using upper CI limit
if (polish.method=="Poisson") {
d3=qchisq(1-alpha/2,2*ave.read.count+2)/2 } else {
d3=qbeta(1-alpha/2,as.matrix(ave.read.count)+1,as.matrix(bgrd.depth-ave.read.count))
}
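# Rough numeric sketch (assumed values, not from real data): with alpha = 0.01, an average background
# count of 2 and a background depth of 10000, the Binomial threshold is qbeta(0.995, 3, 9998) ~ 9e-4,
# so an observed non-reference allele fraction above roughly 0.09% at that position/base survives
# polishing; the corresponding Poisson threshold is qchisq(0.995, 6)/2 ~ 9.3 read counts.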
#call mutation if observed data > background
pred=apply(d2>d3,1,sum)
if (polyclone) {pred.nts=apply(d2>d3,1,function(x) which(x>0)) # if polyclone results are desired, locate them for each base
clone.n=unlist(lapply(pred.nts,length))} else # number of clones
{ d2.2=d2
d2.2[d2<d3]=0
pred.nts=apply(d2.2,1,function(x) ifelse(max(x)>0,which.max(x),0))
clone.n=(pred.nts>0)+0
}
pred[pileup.data$mut.rate.adj==0]=0 #for those bases to be filtered out, we do not call.
#pred[apply(d1.2,1,max)<5]=0 #for those bases whose total counts<5, we do not call. removed 7/10/17
#polished noise data output
polish.noise=cbind(out.pileup,noise.num=apply(d1.2>0,1,sum),freq=apply(d1.2,1,max)/out.pileup$DEPTH,threshold=d3[cbind(1:nrow(d3),apply(d1.2,1,which.max))])[apply(d1.2,1,sum)>0 & pred==0,]
#output the polished data
nt.column.loc=match(c(rbind(s,tolower(s))),colnames(pileup.data)) #NT columns, used to be hard coded 7:14
out.pileup[pred==0,nt.column.loc]=0 # polish all positions without detected mutation
for (i in which(pred>0)) out.pileup[i,which(toupper(colnames(pileup.data)) %in% setdiff(s,names(unlist(pred.nts[[i]]))))]=0 # at position with mutation detection, polish the bases not having mutations
out.pileup$DEPTH=apply(out.pileup[,c(5:6,nt.column.loc)],1,sum) #re-calculate the depth after polish so they add up
#handling output file name and directory
out.file.name=pred.sample # name output files after the sample being polished, so multiple samples from a csv list do not overwrite each other
out.file.name1=paste(out.file.name,".clean.txt",sep="")
out.file.name2=paste(out.file.name,".noise.txt",sep="")
write.table(out.pileup,file=out.file.name1,sep="\t",quote=F,row.names=F)
write.table(polish.noise,file=out.file.name2,sep="\t",quote=F,row.names=F)
return(pred)
}
# call the function to run each input sample
##############################################
n.sample=length(input.pileup.file)
pred3.all=matrix(0,ncol=n.sample,nrow=nrow(shr.bg.ave))
for (i in 1:n.sample) pred3.all[,i]=polish.pred.3nt(bgrd.ave = shr.bg.ave,
bgrd.depth=depth.mat,
pred.sample = input.pileup.file[i],
alpha=input.alpha,
polyclone=TRUE,
out.dir=work.dir)
# The resulting matrix pred3.all has one row per position and one column per sample:
# 0 = no mutation; a value > 0 indicates a mutation call (with polyclone=TRUE the value is the number of mutated bases at that position)
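# A quick sanity check one might run afterwards (not part of the original pipeline):
# colSums(pred3.all > 0)            # number of positions with a mutation call per sample
# table(pred3.all[pred3.all > 0])   # distribution of the number of mutated bases per called position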