#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Run MATCH MOI Reporter on a VCF file and generate a CSV of results that can
# be imported into Excel for a collated report.
#
# 2/11/2014
################################################################################
"""
Collect MOI reports for a set of VCF files and output a raw CSV formatted output
that can be easily imported into Microsoft Excel for reporting. This script
relies on `match_moi_report.pl` in order to generate the MOI reports for each.
"""
import sys
import os
import re
import subprocess
import argparse
import multiprocessing
from termcolor import colored
from natsort import natsorted
from collections import defaultdict
from pprint import pprint as pp # noqa
from multiprocessing.pool import ThreadPool # noqa

version = '4.2.061019'
debug = False

def get_args():
# Default thresholds. Put them here rather than fishing below.
num_procs = multiprocessing.cpu_count() - 1
cn = 7
cu = None
cl = None
reads = 1000
parser = argparse.ArgumentParser(description = __doc__)
parser.add_argument(
'vcf_files',
metavar="<vcf_files>",
nargs="+",
help="List of VCF files to process."
)
parser.add_argument(
'--cn',
metavar='INT',
default = cn,
type = int,
help='Use copy number (CN) value for CNV reporting to be more compatible '
'with MATCH rules. This will disable CU and CL thresholds and is on '
'by default. %s' % colored("DEFAULT: CN=%(default)s", 'green')
)
parser.add_argument(
'--cu',
default=cu,
metavar='INT',
type = int,
help='Copy number threshold ({} CI lower bound) to pass to '
'match_moi_report for reporting amplifications. {}'.format(
'5%%', colored('DEFAULT: 5%% CI=%(default)s', 'green'))
)
parser.add_argument(
'--cl',
default=cl,
metavar='INT',
type = int,
help='Copy number threshold ({} CI upper bound) to pass to '
'match_moi_report for reporting copy loss. {}'.format(
'95%%', colored('DEFAULT: 95%% CI=%(default)s', 'green'))
)
parser.add_argument(
'--reads',
default=reads,
metavar='INT',
type = int,
help='Threshold for number of fusion reads to report. %s' %
colored('DEFAULT: Reads=%(default)s', 'green')
)
parser.add_argument(
'-p', '--pedmatch',
action='store_true',
help='Data comes from Pediatric MATCH rather than Adult MATCH.'
)
parser.add_argument(
'-b', '--blood',
action='store_true',
help='Data comes from blood specimens, and therefore we only have DNA '
'data.'
)
parser.add_argument(
'-n', '--num_procs',
metavar="INT <num_procs>",
type=int,
default=num_procs,
help='Number of thread pools to use. {}'.format(
colored('DEFAULT: %(default)s procs', 'green'))
)
parser.add_argument(
'-o','--output',
metavar="<output file>",
help='Output to file rather than STDOUT.'
)
parser.add_argument(
'-q','--quiet',
action='store_false',
default=True,
        help='Do not suppress warnings and extra output (suppressed by default).'
)
parser.add_argument(
'-v', '--version',
action='version',
version = '%(prog)s - ' + version
)
args = parser.parse_args()
if any(x for x in (args.cu, args.cl)):
args.cn = None
if not all(x for x in (args.cu, args.cl)):
sys.stderr.write("ERROR: you must indicate both a 5% *and* 95% CI "
"value when using the --cu and --cl option.\n")
sys.exit(1)
global quiet
quiet = args.quiet
return args

def get_names(string):
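    # Derive the DNA and RNA sample names from a VCF filename assumed to
    # follow a '<dna_sample>DNA_<rna_sample>.vcf' convention; fall back to the
    # bare filename when the pattern does not match.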
string = os.path.basename(string)
try:
        (dna_samp, rna_samp) = re.search(
            r'(.*?DNA)_(.*)\.vcf$', string).group(1, 2)
    except AttributeError:
        if not quiet:
            sys.stderr.write("WARN: Cannot get DNA or RNA sample name for "
                "'%s'! Using full VCF filename instead\n" % string)
        # str.rstrip() strips a set of characters rather than a suffix, so use
        # os.path.splitext() to drop the '.vcf' extension instead.
        dna_samp = rna_samp = os.path.splitext(string)[0]
return dna_samp, rna_samp

def parse_cnv_params(cu, cl, cn):
'''
Since CNV params are a bit difficult to work with, create a better,
standardized list to pass into functions below
'''
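    # For example, parse_cnv_params(None, None, 7) returns ['--cn', '7'], and
    # parse_cnv_params(5, 95, None) returns ['--cu', '5', '--cl', '95'].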
params = {
'--cu' : cu,
'--cl' : cl,
'--cn' : cn
}
params_list = [[k, str(v)] for k, v in params.items() if v]
return sum(params_list, [])

def populate_list(var_type, var_data):
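    # wanted_fields maps each variant class to the match_moi_report.pl CSV
    # column indices, in the order needed by the collated report (see the
    # `header` list in main()); the source column meanings are assumed from
    # that script's output.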
wanted_fields = {
'snv' : [0,9,1,2,3,10,11,12,8,4,5,6,7,13],
'cnv' : [0,1,2,5],
'fusions' : [0,4,2,1,3]
}
return [var_data[x] for x in wanted_fields[var_type]]

def pad_list(data_list, data_type):
'''
    Pad out the list with hyphens where there is no relevant data. Maybe kludgy,
    but I don't know a better way.
'''
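    # For example (values illustrative):
    #   pad_list(['GENE', 'chr3:123', '4', '8'], 'cnv')
    # returns a 15-slot row with those values at indices 0, 1, 2, and 9 and
    # '-' everywhere else.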
tmp_list = ['-'] * 15
data_list.reverse()
if data_type == 'cnv':
for i in [0,1,2,9]:
tmp_list[i] = data_list.pop()
elif data_type == 'fusions':
        # Map the fusion fields into their slots in the padded row.
for i in [0,1,8,4,10]:
tmp_list[i] = data_list.pop()
return tmp_list

def get_location(pos, vcf):
"""
For the cases where we need location information, like Protein Painter,
re-read the VCF in vcfExtractor, and get the location for the output.
"""
cmd = ['vcfExtractor.pl', '-N', '-n', '-a', '-p', pos, vcf]
p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
encoding='utf8')
result, error = p.communicate()
if p.returncode != 0:
sys.stderr.write("ERROR: Can not process file: {}!\n".format(vcf))
lines = result.split('\n')
for l in lines:
if l.startswith('chr'):
return l.split()[12] # Field 12 is "Location" field in vcfExtractor

def parse_data(report_data, dna, rna, vcf):
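    # Parse the CSV text emitted by match_moi_report.pl into a dict keyed by
    # variant class ('snv_data', 'cnv_data', 'fusion_data'), each mapping a
    # variant ID to a pre-formatted output row.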
data = defaultdict(dict)
raw_data = report_data.split('\n')
for line in raw_data:
fields = line.split(',')
if fields[0] == 'SNV':
varid = fields[9] +':'+ fields[1]
data['snv_data'][varid] = [dna] + populate_list('snv', fields)
# For protein painter kind of output, need the location
data['snv_data'][varid].append(get_location(fields[1], vcf))
elif fields[0] == 'CNV':
varid = fields[1] +':'+ fields[2]
cnv_data = populate_list('cnv', fields)
padded_list = pad_list(cnv_data,'cnv')
data['cnv_data'][varid] = [dna] + padded_list
elif fields[0] == 'Fusion':
varid = fields[1] +':'+ fields[2]
fusion_data = populate_list('fusions', fields)
padded_list = pad_list(fusion_data,'fusions')
data['fusion_data'][varid] = [rna] + padded_list
# Let's still output something even if no MOIs were detected
    if not data:
        # Pad to match the 16-column header written in main().
        data['null']['no_result'] = [dna] + ['-'] * 15
return dict(data)

def print_data(var_type, data, outfile):
# Split the key by a colon and sort based on chr and then pos using the
# natsort library
if var_type == 'null':
outfile.write(','.join(data['no_result']) + "\n")
elif var_type == 'snv_data':
for variant in natsorted(
data.keys(), key=lambda k: (k.split(':')[1], k.split(':')[2])):
outfile.write(','.join(data[variant]) + "\n")
else:
for variant in natsorted(data.keys(), key=lambda k: k.split(':')[1]):
outfile.write(','.join(data[variant]) + "\n")
return

def print_title(fh, cu, cl, cn, reads, pedmatch):
'''Print out a header to remind me just what params I used this time!'''
cnv_params = parse_cnv_params(cu, cl, cn)
string_params = '='.join([x.lstrip('--') for x in cnv_params])
if 'cl' in string_params:
string_params = string_params.replace('=cl','; cl')
if pedmatch:
study_name = 'Pediatric MATCH'
else:
study_name = 'Adult MATCH'
fh.write('-'*95)
fh.write('\nCollated {} MOI Reports Using Params CNV: {}, Fusion Reads: '
'reads={}\n'.format(study_name, string_params, reads))
fh.write('-'*95)
fh.write('\n')
return

def gen_moi_report(vcf, params, proc_type):
    '''
    Use MATCH MOI Reporter (match_moi_report.pl) to generate a variant table
    that we can parse later. The CLI options determine which params
    match_moi_report is run with.
    '''
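    # For example, the assembled command might look like (the VCF name here is
    # hypothetical; the flags are those built up in main() below):
    #   match_moi_report.pl --cn 7 -r 1000 -R Sample1_DNA_Sample1_RNA.vcf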
(dna, rna) = get_names(vcf)
moi_report_cmd = ['match_moi_report.pl'] + params + [vcf]
p = subprocess.Popen(moi_report_cmd, stdout=subprocess.PIPE,
stderr=subprocess.PIPE, encoding='utf8')
result, error = p.communicate()
if p.returncode != 0:
sys.stderr.write("ERROR: Can not process file: {}!\n".format(vcf))
raise Exception(error)
else:
        # Return a tuple so we can track which VCF produced which result and
        # not clobber dict entries when doing multithreaded processing.
if proc_type == 'single':
return parse_data(result, dna, rna, vcf)
elif proc_type == 'threaded':
return vcf, parse_data(result, dna, rna, vcf)

def arg_star(args):
return gen_moi_report(*args)

def proc_vcfs(vcf_files, params, num_procs):
    '''
    Process the input VCF files using the thresholds set in `params`. Either
    dispatches the work to a thread pool (if num_procs > 1) or processes the
    files serially in a single loop.
    '''
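    # Returns {vcf_path: parsed_data}, where parsed_data is the per-variant
    # dict produced by parse_data() above.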
moi_data = defaultdict(dict)
if num_procs < 2:
sys.stderr.write("Non-parallel processing files (total: %s VCF(s))\n" %
str(len(vcf_files)))
for v in vcf_files:
moi_data[v] = gen_moi_report(v, params, 'single')
else:
sys.stderr.write("Parallel processing files using %s processes (total: "
"%s VCF(s))\n" % (num_procs, str(len(vcf_files))))
task_list = [(v, params, 'threaded') for v in vcf_files]
pool = ThreadPool(num_procs)
try:
moi_data = {vcf : data for vcf, data in pool.imap_unordered(
arg_star, task_list)}
except Exception:
pool.close()
pool.join()
raise
except KeyboardInterrupt:
pool.close()
pool.join()
sys.exit(9)
# TODO: Not sure if this is needed or not....seem to have a memory leak!
pool.close()
pool.join()
return moi_data

def main(vcfs, cn, cu, cl, reads, pedmatch, blood, output, num_procs, quiet):
# Setup an output file if we want one
outfile = ''
if output:
print("Writing output to '%s'" % output)
outfile = open(output, 'w')
else:
outfile = sys.stdout
# Setup MOI Reporter args; start with CNV pipeline args
moi_reporter_args = parse_cnv_params(cu, cl, cn)
# Determine if blood or tumor and add appropriate thresholds
if not blood:
moi_reporter_args += ['-r', str(reads), '-R']
else:
moi_reporter_args.extend(['-R', '-b'])
# If Pediatric MATCH need to pass different MOI rules.
if pedmatch:
moi_reporter_args.append('-p')
moi_data = proc_vcfs(vcfs, moi_reporter_args, num_procs)
# Print data
print_title(outfile, cu, cl, cn, reads, pedmatch)
header = ['Sample', 'Type', 'Gene', 'Position', 'Ref', 'Alt',
'Transcript', 'CDS', 'AA', 'VARID', 'VAF/CN', 'Coverage/Counts',
'RefCov', 'AltCov', 'Function', 'Location']
outfile.write(','.join(header) + "\n")
# Print out sample data by VCF
var_types = ['snv_data', 'cnv_data', 'fusion_data', 'null']
for sample in sorted(moi_data):
for var_type in var_types:
try:
print_data(var_type, moi_data[sample][var_type], outfile)
except KeyError:
continue

if __name__ == '__main__':
args = get_args()
if debug:
print('CLI args as passed:')
pp(vars(args))
print('')
main(args.vcf_files, args.cn, args.cu, args.cl, args.reads, args.pedmatch,
args.blood, args.output, args.num_procs, args.quiet)