-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpeakMerge.py
383 lines (352 loc) · 18 KB
/
peakMerge.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
# %%
import numpy as np
from scipy.signal import argrelextrema, oaconvolve
from scipy.signal.windows import gaussian
from scipy.io import mmwrite
from scipy.sparse import csr_matrix
import seaborn as sns
import pandas as pd
import argparse
import os
import sys
class peakMerger:
"""
peakMerger
Find consensus peaks from multiple peak calling files, and has methods
to perform data exploration analyses.
Parameters
----------
genomeFile: string
Path to tab separated (chromosome, length) annotation file.
scoreMethod: "binary" or integer (optional, default "binary")
If set to binary, will use a binary matrix (peak is present or absent).
Otherwise if set to an integer this will be the column index (0-based)
used as score. Usually not recommended but the code has fallbacks
for the non-binary case.
outputPath: string (optional, default None)
Folder to export results (matrix, plots...). If not specified will not
write any results.
Attributes
-----------------
matrix: ndarray of shape (num consensuses, num experiments)
The experiment-consensus peak presence matrix. It is
a binary matrix unless the score method has been changed.
consensuses: pandas DataFrame in bed-like format
The genomic locations matching the rows (consensuses) of
the matrix attribute.
labels: list of strings of size (num experiments)
The file names matching the columns (experiments) of
the matrix attribute.
"""
def __init__(self, genomeFile, outputPath=None, scoreMethod="binary"):
self.score = scoreMethod
self.outputPath = outputPath
try:
os.mkdir(self.outputPath)
except:
pass
if not scoreMethod == "binary":
self.score = int(scoreMethod)
with open(genomeFile, 'r') as f:
self.chrLen = dict()
self.genomeSize = 0
for l in f:
l = l.rstrip("\n").split("\t")
self.chrLen[l[0]] = int(l[1])
self.genomeSize += int(l[1])
self.matrix = None
self.consensuses = None
self.labels = None
def mergePeaks(self, folderPath, fileFormat, inferCenter=False, forceUnstranded=False,
sigma="auto_1", perPeakDensity=False, perPeakMultiplier=0.5,
minOverlap=2):
"""
Read peak called files, generate consensuses and the matrix.
Parameters
----------
folderPath: string
Path to either folder with ONLY the peak files or comma separated
list of files. Accepts compressed files.
fileFormat: "bed" or "narrowPeak"
Format of the files being read.
Bed file format assumes the max signal position to be at the 6th column
(0-based) in absolute coordinates.
The narrowPeak format assumes the max signal position to be the 9th column
with this position being relative to the start position.
inferCenter: boolean (optional, default False)
If set to true will use the position halfway between start and end
positions. Enable this only if the summit position is missing.
forceUnstranded: Boolean (optional, default False)
If set to true, assumes all peaks are not strand-specific.
sigma: float or "auto_1" or "auto_2" (optional, default "auto_1")
Size of the gaussian filter (lower values = more separation).
Only effective if perPeakDensity is set to False.
"auto_1" automatically selects the filter width at 1/8th of the average peak size.
"auto_2" automatically selects the filter width based on the peak
count and genome size.
perPeakDensity: Boolean (optional, default False)
If set to false will perform a gaussian filter along the genome (faster),
assuming all peaks have roughly the same size.
If set to true will create the density curve per peak based on each peak
individual size. This is much more slower than the filter method.
May be useful if peaks are expected to have very different sizes. Can
also be faster when the number of peaks is small.
perPeakMultiplier: float (optional, default 0.25)
Only effective if perPeakDensity is set to True. Adjusts the width of
the gaussian fitted to each peak (lower values = more separation).
minOverlap: integer (optional, default 2)
Minimum number of peaks required at a consensus.
"""
alltabs = []
if os.path.isdir(folderPath):
files = os.listdir(folderPath)
else:
files = folderPath.split(",")
for f in files:
fmt = fileFormat
if not fmt in ["bed", "narrowPeak"]:
raise TypeError(f"Unknown file format : {fmt}")
# Read bed format
if fmt == "bed":
if inferCenter:
usedCols = [0,1,2,5]
else:
usedCols = [0,1,2,5,6]
if not self.score == "binary":
usedCols.append(self.score)
tab = pd.read_csv(folderPath + "/" + f, sep="\t", header=None,
usecols=usedCols)
tab = tab[usedCols]
if self.score == "binary":
tab[5000] = 1
tab.columns = np.arange(len(tab.columns))
tab[0] = tab[0].astype("str", copy=False)
tab[3].fillna(value=".", inplace=True)
if inferCenter:
tab[5] = tab[4]
tab[4] = ((tab[1]+tab[2])*0.5).astype(int)
tab[6] = f
if self.score == "binary":
tab[5] = [1]*len(tab)
alltabs.append(tab)
elif fmt == "narrowPeak":
if inferCenter:
usedCols = [0,1,2,5]
else:
usedCols = [0,1,2,5,9]
if not self.score == "binary":
usedCols.append(self.score)
tab = pd.read_csv(folderPath + "/" + f, sep="\t", header=None,
usecols=usedCols)
tab = tab[usedCols]
if self.score == "binary":
tab[5000] = 1
tab.columns = np.arange(len(tab.columns))
tab[0] = tab[0].astype("str", copy=False)
tab[3].fillna(value=".", inplace=True)
if inferCenter:
tab[5] = tab[4]
tab[4] = ((tab[1]+tab[2])*0.5).astype(int, copy=False)
else:
tab[4] = (tab[1] + tab[4]).astype(int, copy=False)
tab[6] = f
alltabs.append(tab)
# Concatenate files
self.df = pd.concat(alltabs)
# self.df[[0,1,2]].to_csv("h3k27ac_output/allPeaks.bed", sep="\t", header=False, index=False)
self.numElements = len(self.df)
self.avgPeakSize = np.mean(self.df[2] - self.df[1])
# Check strandedness
if forceUnstranded == True:
self.df[3] = "."
self.strandCount = 1
else:
# Check if there is only stranded or non-stranded elements
strandValues = np.unique(self.df[3])
self.strandCount = len(strandValues)
if self.strandCount > 2:
raise ValueError("More than two strand directions !")
elif self.strandCount == 2 and "." in strandValues:
raise ValueError("Unstranded and stranded values !")
# Factorize experiments
self.df[6], self.labels = pd.factorize(self.df[6])
# Split per strand
self.df = dict([(k, x) for k, x in self.df.groupby(3)])
########### Peak separation step ###########
# Compute sigma if automatic setting
if sigma == "auto_1":
sigma = self.avgPeakSize/8.0
elif sigma == "auto_2":
l = self.genomeSize/self.numElements*self.strandCount
sigma = np.log(10.0) * l / np.sqrt(2.0)
else:
sigma = float(sigma)
if perPeakDensity:
sigma = perPeakMultiplier
windowSize = int(8*sigma)+1
sepPerStrand = {}
sepIdxPerStrand = {}
# Iterate for each strand
for s in self.df.keys():
# Split peaks per chromosome
posPerChr = dict([(k, x.values[:, [1,2,4]].astype(int)) for k, x in self.df[s].groupby(0)])
# Iterate over all chromosomes
sepPerStrand[s] = {}
sepIdxPerStrand[s] = {}
for chrName in posPerChr.keys():
# Place peak on the genomic array
try:
currentLen = self.chrLen[str(chrName)]
except KeyError:
print(f"Warning: chromosome {str(chrName)} is not in genome annotation and will be removed")
continue
array = np.zeros(currentLen, dtype="float32")
peakIdx = posPerChr[chrName]
np.add.at(array, peakIdx[:, 2],1)
if not perPeakDensity:
# Smooth peak density
smoothed = oaconvolve(array, gaussian(windowSize, sigma), "same")
# Split consensuses
separators = argrelextrema(smoothed, np.less_equal)[0] # Get local minimas
else:
smoothed = np.zeros(currentLen, dtype="float32")
for i in range(len(peakIdx)):
peakSigma = (peakIdx[i, 1] - peakIdx[i, 0])*sigma
windowSize = int(8*peakSigma)+1
start = max(peakIdx[i, 2] - int(windowSize/2), 0)
end = min(peakIdx[i, 2] + int(windowSize/2) + 1, currentLen)
diffStart = max(-peakIdx[i, 2] + int(windowSize/2), 0)
diffEnd = windowSize + min(currentLen - peakIdx[i, 2] - int(windowSize/2) - 1, 0)
smoothed[start:end] += gaussian(windowSize, peakSigma)[diffStart:diffEnd]
separators = argrelextrema(smoothed, np.less_equal)[0] # Get local minimas
separators = separators[np.where(np.ediff1d(separators) != 1)[0]+1] # Removes consecutive separators (because less-equal comparison)
separators = np.insert(separators, [0,len(separators)], [0, currentLen]) # Add start and end points
# Genome position separators
sepPerStrand[s][chrName] = separators
# Peak index separator
array = array.astype("int16", copy=False)
sepIdxPerStrand[s][chrName] = np.cumsum([np.sum(array[separators[i]: separators[i+1]]) for i in range(len(separators)-1)], dtype="int64")
del array
########### Create matrix and consensus genomic locations ###########
self.matrix = []
self.consensuses = []
j = 0
# Iterate over each strand
for s in self.df.keys():
self.df[s].sort_values(by=[0, 4], inplace=True)
posPerChr = dict([(k, x.values) for k, x in self.df[s].groupby(0)])
# Iterate over each chromosome
for chrName in posPerChr.keys():
try:
separators = sepPerStrand[s][chrName]
except:
continue
splits = np.split(posPerChr[chrName], sepIdxPerStrand[s][chrName])
for i in range(len(splits)):
currentConsensus = splits[i]
# Exclude consensuses that are too small
if len(currentConsensus) < minOverlap:
continue
currentSep = separators[i:i+2]
# Setup consensuses coordinates
consensusStart = max(np.min(currentConsensus[:,1]), currentSep[0])
consensusEnd = min(np.max(currentConsensus[:,2]), currentSep[1])
consensusCenter = int(np.mean(currentConsensus[:,4]))
# Mean value of present features
meanScore = len(currentConsensus)
# Assign scores for each experiment to the current consensus
features = np.zeros(len(self.labels), dtype="float32")
features[currentConsensus[:, 6].astype(int)] = currentConsensus[:, 5].astype(float)
# Add consensus to the score matrix and to the genomic locations
self.matrix.append(features)
data = [chrName, consensusStart, consensusEnd, j,
meanScore, s, consensusCenter, consensusCenter + 1]
self.consensuses.append(data)
j += 1
self.matrix = np.array(self.matrix)
if self.score == "binary":
self.matrix = self.matrix.astype(bool)
self.consensuses = pd.DataFrame(self.consensuses)
def writePeaks(self):
"""
Write matrix, datasets names and consensuses genomic locations.
The matrix (consensuses, datasets) uses a sparse matrix market format
and is saved into "matrix.mtx".
The dataset names corresponding to the rows are saved in "datasets.txt".
The genomic locations associated to each consensus are located in "consensuses.bed".
Parameters
----------
"""
self.consensuses.to_csv(self.outputPath + "consensuses.bed",
sep="\t", header=False, index=False)
mmwrite(self.outputPath + "matrix.mtx", csr_matrix(self.matrix.astype(float)))
pd.DataFrame(self.labels).to_csv(self.outputPath + "datasets.txt",
sep="\t", header=False, index=False)
with open(self.outputPath + "dataset_stats.txt", "w") as f:
f.write("Average peak size\t" + str(self.avgPeakSize) + "\n")
f.write("Number of Peaks\t" + str(self.numElements) + "\n")
f.write("Number of consensus peaks\t" + str(self.matrix.shape[0]) + "\n")
f.write("Number of experiments\t" + str(self.matrix.shape[1]) + "\n")
f.write("Genome size\t" + str(self.genomeSize) + "\n")
if __name__ == "__main__":
# Parse command line arguments
parser = argparse.ArgumentParser(description="A simple tool to integrate and analyse peak called experiments. Use the python API for more options and a more flexible usage.")
parser.add_argument("genomeFile", help="Path to tab separated (chromosome, length) annotation file.", type=str)
parser.add_argument("folderPath", help="Path to either folder with ONLY the peak files or comma separated list of files. Accepts compressed files.", type=str)
parser.add_argument("fileFormat", help=
"""
"bed" or "narrowPeak"
Format of the files being read.
Bed file format assumes the max signal column to be at the 6th column
(0-based) in absolute coordinates.
The narrowPeak format assumes this to be the 9th column with this number
being relative to the start position.""", type=str)
parser.add_argument("outputFolder", help="""
Path of a folder (preferably empty) to write into. Will attempt to create the folder
if it does not exists.
""", type=str)
parser.add_argument("--forceUnstranded", help=
"""
If enabled, assumes all peaks are not strand-specific.
""", action="store_true")
parser.add_argument("--inferCenter", action="store_true",
help="If enabled will use the position halfway between start and end positions as summit. Enable this only if the summit position is missing.")
parser.add_argument("--sigma", help=
"""
float or "auto_1" or "auto_2" (optional, default "auto_1")
Size of the gaussian filter for the peak separation
(lower values = more separation).
Only effective if perPeakDensity is set to False.
"auto_1" automatically selects the filter width based on the average
peak size.
"auto_2" automatically selects the filter width based on the peak
count and genome size.
""", default="auto_1")
parser.add_argument("--scoreMethod",
help="""
"binary" or integer (default "binary")
If set to binary, will use a binary matrix (peak is present or absent).
Otherwise if set to an integer this will be the column index (0-based)
used as score.""", default="binary")
parser.add_argument("--minOverlap", help=
"""
integer (optional, default 2)
Minimum number of peaks required at a consensus.
2 is the default but 1 can be used on smaller datasets.
""", type=int, default=2)
args = parser.parse_args()
try:
os.mkdir(args.outputFolder)
except:
pass
with open(args.outputFolder + "command.sh", "w") as f:
f.write(" ".join(sys.argv))
# Run analyses
merger = peakMerger(args.genomeFile, outputPath=args.outputFolder,
scoreMethod=args.scoreMethod)
merger.mergePeaks(args.folderPath, forceUnstranded=args.forceUnstranded,
sigma=args.sigma, inferCenter=args.inferCenter,
fileFormat=args.fileFormat, minOverlap=args.minOverlap)
merger.writePeaks()
print(f"Got a matrix of {merger.matrix.shape[0]} consensuses and {merger.matrix.shape[1]} experiments")
sys.exit()