-
Notifications
You must be signed in to change notification settings - Fork 76
/
Copy pathannotatePeak.R
318 lines (284 loc) · 12.1 KB
/
annotatePeak.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
##' Annotate peaks
##'
##'
##' @title annotatePeak
##' @param peak peak file or GRanges object
##' @param tssRegion Region Range of TSS
##' @param TxDb TxDb or EnsDb annotation object
##' @param level one of transcript and gene
##' @param assignGenomicAnnotation logical, assign peak genomic annotation or not
##' @param genomicAnnotationPriority genomic annotation priority
##' @param annoDb annotation package
##' @param addFlankGeneInfo logical, add flanking gene information from the peaks
##' @param flankDistance distance of flanking sequence
##' @param sameStrand logical, whether find nearest/overlap gene in the same strand
##' @param ignoreOverlap logical, whether ignore overlap of TSS with peak
##' @param ignoreUpstream logical, if True only annotate gene at the 3' of the peak.
##' @param ignoreDownstream logical, if True only annotate gene at the 5' of the peak.
##' @param overlap one of 'TSS' or 'all', if overlap="all", then gene overlap with peak will be reported as nearest gene, no matter the overlap is at TSS region or not.
##' @param verbose print message or not
##' @param columns names of columns to be obtained from database
##' @return data.frame or GRanges object with columns of:
##'
##' all columns provided by input.
##'
##' annotation: genomic feature of the peak, for instance if the peak is
##' located in 5'UTR, it will annotated by 5'UTR. Possible annotation is
##' Promoter-TSS, Exon, 5' UTR, 3' UTR, Intron, and Intergenic.
##'
##' geneChr: Chromosome of the nearest gene
##'
##' geneStart: gene start
##'
##' geneEnd: gene end
##'
##' geneLength: gene length
##'
##' geneStrand: gene strand
##'
##' geneId: entrezgene ID
##'
##' distanceToTSS: distance from peak to gene TSS
##'
##' if annoDb is provided, extra column will be included:
##'
##' ENSEMBL: ensembl ID of the nearest gene
##'
##' SYMBOL: gene symbol
##'
##' GENENAME: full gene name
##' @import BiocGenerics S4Vectors GenomeInfoDb
##' @import yulab.utils
##' @examples
##' \dontrun{
##' require(TxDb.Hsapiens.UCSC.hg19.knownGene)
##' txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
##' peakfile <- system.file("extdata", "sample_peaks.txt", package="ChIPseeker")
##' peakAnno <- annotatePeak(peakfile, tssRegion=c(-3000, 3000), TxDb=txdb)
##' peakAnno
##' }
##' @seealso \code{\link{plotAnnoBar}} \code{\link{plotAnnoPie}} \code{\link{plotDistToTSS}}
##' @export
##' @author G Yu
annotatePeak <- function(peak,
tssRegion=c(-3000, 3000),
TxDb=NULL,
level = "transcript",
assignGenomicAnnotation=TRUE,
genomicAnnotationPriority = c("Promoter", "5UTR", "3UTR", "Exon", "Intron", "Downstream", "Intergenic"),
annoDb=NULL,
addFlankGeneInfo=FALSE,
flankDistance=5000,
sameStrand = FALSE,
ignoreOverlap=FALSE,
ignoreUpstream=FALSE,
ignoreDownstream=FALSE,
overlap = "TSS",
verbose=TRUE,
columns=c("ENTREZID", "ENSEMBL", "SYMBOL", "GENENAME")) {
is_GRanges_of_TxDb <- FALSE
if (is(TxDb, "GRanges")) {
is_GRanges_of_TxDb <- TRUE
assignGenomicAnnotation <- FALSE
annoDb <- NULL
addFlankGeneInfo <- FALSE
message("#\n#.. 'TxDb' is a self-defined 'GRanges' object...\n#")
message("#.. Some parameters of 'annotatePeak' will be disable,")
message("#.. including:")
message("#..\tlevel, assignGenomicAnnotation, genomicAnnotationPriority,")
message("#..\tannoDb, addFlankGeneInfo and flankDistance.")
message("#\n#.. Some plotting functions are designed for visualizing genomic annotation")
message("#.. and will not be available for the output object.\n#")
}
if (is_GRanges_of_TxDb) {
level <- "USER_DEFINED"
} else {
level <- match.arg(level, c("transcript", "gene"))
}
if (assignGenomicAnnotation && all(genomicAnnotationPriority %in% c("Promoter", "5UTR", "3UTR", "Exon", "Intron", "Downstream", "Intergenic")) == FALSE) {
stop('genomicAnnotationPriority should be any order of c("Promoter", "5UTR", "3UTR", "Exon", "Intron", "Downstream", "Intergenic")')
}
if ( is(peak, "GRanges") ){
## this test will be TRUE
## when peak is an instance of class/subclass of "GRanges"
input <- "gr"
peak.gr <- peak
} else {
input <- "file"
peak.gr <- loadPeak(peak, verbose)
}
peakNum <- length(peak.gr)
if (verbose)
cat(">> preparing features information...\t\t",
format(Sys.time(), "%Y-%m-%d %X"), "\n")
if (is_GRanges_of_TxDb) {
features <- TxDb
} else {
TxDb <- loadTxDb(TxDb)
if (level=="transcript") {
features <- getGene(TxDb, by="transcript")
} else {
features <- getGene(TxDb, by="gene")
}
}
if (verbose)
cat(">> identifying nearest features...\t\t",
format(Sys.time(), "%Y-%m-%d %X"), "\n")
## nearest features
idx.dist <- getNearestFeatureIndicesAndDistances(peak.gr, features,
sameStrand, ignoreOverlap,
ignoreUpstream,ignoreDownstream,
overlap=overlap)
if (verbose)
cat(">> calculating distance from peak to TSS...\t",
format(Sys.time(), "%Y-%m-%d %X"), "\n")
## distance
distance <- idx.dist$distance
## update peak, remove un-map peak if exists.
peak.gr <- idx.dist$peak
## annotation
if (assignGenomicAnnotation == TRUE) {
if (verbose)
cat(">> assigning genomic annotation...\t\t",
format(Sys.time(), "%Y-%m-%d %X"), "\n")
anno <- getGenomicAnnotation(peak.gr, distance, tssRegion, TxDb, level, genomicAnnotationPriority, sameStrand=sameStrand)
annotation <- anno[["annotation"]]
detailGenomicAnnotation <- anno[["detailGenomicAnnotation"]]
} else {
annotation <- NULL
detailGenomicAnnotation <- NULL
}
## append annotation to peak.gr
if (!is.null(annotation))
mcols(peak.gr)[["annotation"]] <- annotation
has_nearest_idx <- which(idx.dist$index <= length(features))
nearestFeatures <- features[idx.dist$index[has_nearest_idx]]
## duplicated names since more than 1 peak may annotated by only 1 gene
names(nearestFeatures) <- NULL
nearestFeatures.df <- as.data.frame(nearestFeatures)
if (is_GRanges_of_TxDb) {
colnames(nearestFeatures.df)[1:5] <- c("geneChr", "geneStart", "geneEnd",
"geneLength", "geneStrand")
} else if (level == "transcript") {
if (is(TxDb, "EnsDb")) {
nearestFeatures.df <- nearestFeatures.df[, c("seqnames", "start",
"end", "width",
"strand", "gene_id",
"tx_id", "tx_biotype"),
drop = FALSE]
colnames(nearestFeatures.df) <- c(
"geneChr", "geneStart", "geneEnd", "geneLength", "geneStrand",
"geneId", "transcriptId", "transcriptBiotype")
} else {
colnames(nearestFeatures.df) <- c("geneChr", "geneStart", "geneEnd",
"geneLength", "geneStrand",
"geneId", "transcriptId")
nearestFeatures.df$geneId <- TXID2EG(
as.character(nearestFeatures.df$geneId), geneIdOnly=TRUE)
}
} else {
if (is(TxDb, "EnsDb")) {
nearestFeatures.df <- nearestFeatures.df[, c("seqnames", "start",
"end", "width",
"strand", "gene_id",
"gene_biotype"),
drop = FALSE]
colnames(nearestFeatures.df) <- c("geneChr", "geneStart", "geneEnd",
"geneLength", "geneStrand",
"geneId", "geneBiotype")
} else
colnames(nearestFeatures.df) <- c("geneChr", "geneStart", "geneEnd",
"geneLength", "geneStrand",
"geneId")
}
for(cn in colnames(nearestFeatures.df)) {
mcols(peak.gr)[[cn]][has_nearest_idx] <- unlist(nearestFeatures.df[, cn])
}
mcols(peak.gr)[["distanceToTSS"]] <- distance
if (!is.null(annoDb)) {
if (verbose)
cat(">> adding gene annotation...\t\t\t",
format(Sys.time(), "%Y-%m-%d %X"), "\n")
.idtype <- IDType(TxDb)
if (length(.idtype) == 0 || is.na(.idtype) || is.null(.idtype)) {
n <- length(peak.gr)
if (n > 100)
n <- 100
sampleID <- peak.gr$geneId[1:n]
if (all(grepl('^ENS', sampleID))) {
.idtype <- "Ensembl Gene ID"
} else if (all(grepl('^\\d+$', sampleID))) {
.idtype <- "Entrez Gene ID"
} else {
warning("Unknown ID type, gene annotation will not be added...")
.idtype <- NA
}
}
if (!is.na(.idtype)) {
peak.gr %<>% addGeneAnno(annoDb, .idtype, columns)
}
}
if (addFlankGeneInfo == TRUE) {
if (verbose)
cat(">> adding flank feature information from peaks...\t",
format(Sys.time(), "%Y-%m-%d %X"), "\n")
flankInfo <- getAllFlankingGene(peak.gr, features, level, flankDistance)
if (level == "transcript") {
mcols(peak.gr)[["flank_txIds"]] <- NA
mcols(peak.gr)[["flank_txIds"]][flankInfo$peakIdx] <- flankInfo$flank_txIds
}
mcols(peak.gr)[["flank_geneIds"]] <- NA
mcols(peak.gr)[["flank_gene_distances"]] <- NA
mcols(peak.gr)[["flank_geneIds"]][flankInfo$peakIdx] <- flankInfo$flank_geneIds
mcols(peak.gr)[["flank_gene_distances"]][flankInfo$peakIdx] <- flankInfo$flank_gene_distances
}
if (!is_GRanges_of_TxDb) {
if(verbose)
cat(">> assigning chromosome lengths\t\t\t",
format(Sys.time(), "%Y-%m-%d %X"), "\n")
peak.gr@seqinfo <- seqinfo(TxDb)[names(seqlengths(peak.gr))]
}
if(verbose)
cat(">> done...\t\t\t\t\t",
format(Sys.time(), "%Y-%m-%d %X"), "\n")
if (assignGenomicAnnotation) {
res <- new("csAnno",
anno = peak.gr,
tssRegion = tssRegion,
level=level,
hasGenomicAnnotation = TRUE,
detailGenomicAnnotation=detailGenomicAnnotation,
annoStat=getGenomicAnnoStat(peak.gr),
peakNum=peakNum
)
} else {
res <- new("csAnno",
anno = peak.gr,
tssRegion = tssRegion,
level=level,
hasGenomicAnnotation = FALSE,
peakNum=peakNum
)
}
if (yulab.utils:::.hi("virusPlot")) return("hi")
return(res)
}
##' dropAnno
##'
##' drop annotation exceeding distanceToTSS_cutoff
##' @title dropAnno
##' @param csAnno output of annotatePeak
##' @param distanceToTSS_cutoff distance to TSS cutoff
##' @return csAnno object
##' @export
##' @author Guangchuang Yu
dropAnno <- function(csAnno, distanceToTSS_cutoff=10000) {
idx <- which(abs(mcols(csAnno@anno)[["distanceToTSS"]]) < distanceToTSS_cutoff)
csAnno@anno <- csAnno@anno[idx]
csAnno@peakNum <- length(idx)
if (csAnno@hasGenomicAnnotation) {
csAnno@annoStat <- getGenomicAnnoStat(csAnno@anno)
csAnno@detailGenomicAnnotation = csAnno@detailGenomicAnnotation[idx,]
}
csAnno
}