-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathbsnp.py
executable file
·166 lines (128 loc) · 6.62 KB
/
bsnp.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
#!/usr/bin/env python
#######################################
# bsnp.py
# v1.1
# 3 nov 2015
#
# volker hovestadt
# german cancer research center
# v.hovestadt@dkfz.de
#
#
# extract allele frequencies from alignment file for in-silico genotyping (fingerprinting).
# returns base counts for ref and alt and betavalue-like allele frequency.
#
# automatically detects if contigs are named "chr1" or "1" in alignment file.
# in strand-specific WGBS sequencing data only the strand not affected by conversion is considered.
def mod_bsnp(sysargv):
import sys
import pysam
import argparse
import datetime
def nicetime(): return datetime.datetime.now().strftime("[bsnp %Y-%m-%d %H:%M:%S]")
#######################################
# arguments, filehandles
parser = argparse.ArgumentParser(prog="methylCtools bsnp", version="1.0", description="extract allele frequencies from alignment file for in-silico genotyping (fingerprinting).")
parser.add_argument("-s", "--silent", dest="qf", action="store_false", help="do not show status messages")
parser.add_argument("-q", "--mapQ", metavar="INT", dest="mapQ", type=int, default=1, action="store", help="skip alignments with mapQ smaller than INT [1]")
parser.add_argument("-Q", "--baseQ", metavar="INT", dest="baseQ", type=int, default=20, action="store", help="skip bases with baseQ smaller than INT (Phred+33) [20]")
parser.add_argument("-c", "--cov", metavar="INT", dest="minCOV", type=int, default=0, action="store", help="skip positions with coverage smaller INT [0]")
parser.add_argument("-r", "--splitRG", dest="allRG", action="store_true", help="split by read group")
parser.add_argument("-t", "--trimPE", dest="trimPE", action="store_true", help="only consider first read in overlapping PE reads")
parser.add_argument("-b", "--bs", dest="isBS", action="store_true", help="input is WGBS sequencing data")
parser.add_argument("-n", "--name", metavar="STR", dest="sname", default="input.bam", action="store", help="sample name to be used in output file [input.bam]")
groupinput = parser.add_argument_group("input files, required")
groupinput.add_argument("inPOS", metavar="positions.bed", action="store", default=False, help="list of SNP positions to be assessed")
groupinput.add_argument("inBAM", metavar="alignments.bam", action="store", default=False, help="alignment file (sorted, indexed)")
args = parser.parse_args(sysargv)
try:
posIN = open(args.inPOS, "r")
samfileIN = pysam.Samfile(args.inBAM, "rb")
out = sys.stdout
except IOError as strerror:
sys.exit("methylCtools bsnp: error: %s" % strerror)
if args.qf: sys.stderr.write("%s command: %s\n" % (nicetime(), " ".join(sys.argv)))
######################################
# prepare
if args.sname == "input.bam": sname = args.inBAM.split("/")[-1]
else: sname = args.sname
h = samfileIN.header # remove "chr" to match bam file, undo later
#print h
flagCHR = h["SQ"][1]["SN"][:3] == "chr"
rgs = ["combined"] # check read groups
if args.allRG:
if "RG" in h:
for rg in h["RG"]: rgs += [rg["ID"]]
if args.qf: sys.stderr.write("%s status: %i read groups in input file\n" % (nicetime(), len(rgs)-1))
complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'}
######################################
# main
snps = dict() # dict of all snp positions
snpsv = [] # vector of all snp names
c = dict() # c[readgroup][snp] = [red, green]
for rg in rgs: c[rg] = dict()
n = 0
for l in posIN:
if l[0] == "#": continue # process SNPs
ls = l.rstrip().split("\t")
snp = [ls[0], int(ls[2]), ls[5], ls[6], ls[7], ls[3]]
if snp[2] == "-": # convert to + strand (as in pileup)
snp[2] = "+"
snp[3] = complement[snp[3]]
snp[4] = complement[snp[4]]
if not flagCHR: snp[0] = snp[0][3:]
snps[ls[3]] = snp
snpsv += [ls[3]]
for rg in rgs: c[rg][snp[-1]] = [0, 0]
for pileupcolumn in samfileIN.pileup(snp[0], start=snp[1]-5, end=snp[1]+3):
if pileupcolumn.pos == snp[1]-1:
if args.trimPE: # trim overlapping PE reads: check if qname occurs more than once
once, twice = [], []
for pileupread in pileupcolumn.pileups:
if pileupread.alignment.qname in once: twice += [pileupread.alignment.qname]
else: once += [pileupread.alignment.qname]
for pileupread in pileupcolumn.pileups:
if not pileupread.is_del: # check if deletion at pos
if args.allRG: # get read group
patags = dict(pileupread.alignment.tags)
if "RG" in patags: rg = patags["RG"]
else: rg = False
else: rg = False
if args.trimPE: # trim overlapping PE reads
if pileupread.alignment.qname in twice and pileupread.alignment.is_read2: continue
if args.isBS: # only use informative strand in BS mode (C could be converted to T)
if snp[3] == "C" or snp[4] == "C":
if pileupread.alignment.is_read1 != pileupread.alignment.is_reverse: continue
elif snp[3] == "G" or snp[4] == "G":
if pileupread.alignment.is_read1 == pileupread.alignment.is_reverse: continue
if pileupread.alignment.qual[pileupread.qpos] < (args.baseQ+33): continue
if pileupread.alignment.mapq < args.mapQ: continue # check for minimum mapQ and baseQ
if pileupread.alignment.seq[pileupread.qpos] == snp[3]:
if rg: c[rg][snp[-1]][0] += 1 # count
c["combined"][snp[-1]][0] += 1
elif pileupread.alignment.seq[pileupread.qpos] == snp[4]:
if rg: c[rg][snp[-1]][1] += 1
c["combined"][snp[-1]][1] += 1
n += 1
if n%100 == 0:
if args.qf: sys.stderr.write("%s status: %i positions processed\n" % (nicetime(), n))
out.write("# bsnp v1.1 output\n")
out.write("# call: %s\n" % " ".join(sys.argv))
for s in snpsv:
if not flagCHR: snps[s][0] = "chr" + snps[s][0]
for rg in rgs:
n = 0
if rg == "combined": snamerg = sname
else: snamerg = sname + ":" + rg
for s in snpsv:
snp = snps[s]
if sum(c[rg][s]) >= args.minCOV:
if c[rg][s][0]+c[rg][s][1] > 0: out.write("%s\t%s\t%i\t%s\t%s\t%i\t%i\t%.3f\n" % (snamerg, snp[0], snp[1], "*", snp[-1], c[rg][s][0], c[rg][s][1], c[rg][s][0]/float(c[rg][s][0]+c[rg][s][1])))
else: out.write("%s\t%s\t%i\t%s\t%s\t%i\t%i\tNA\n" % (snamerg, snp[0], snp[1], "*", snp[-1], c[rg][s][0], c[rg][s][1]))
n += 1
if args.qf:
if rg == "combined": sys.stderr.write("%s status: %i SNPs covered in all read groups (coverage >= %i)\n" % (nicetime(), n, args.minCOV))
else: sys.stderr.write("%s status: %i SNPs covered in read group %s (coverage >= %i)\n" % (nicetime(), n, rg, args.minCOV))
if __name__ == "__main__":
import sys
mod_bsnp(sys.argv[1:])