forked from phbradley/tcr-dist
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathlogo_tools.py
94 lines (73 loc) · 2.82 KB
/
logo_tools.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
import amino_acids
import sys
def get_alphabet( pwm ):
alphabet = pwm[0].keys()[:]
alphabet.sort()
return alphabet
def check_pwm( pwm, tol= 0.001 ):
L = len(pwm)
alphabet = get_alphabet( pwm )
for pos in range(L):
for aa,val in pwm[pos].iteritems(): assert val > -1e-6
total = sum( ( pwm[pos][aa] for aa in alphabet ) )
assert abs( total - 1.0 ) - tol
def create_protein_pwm_from_sequences( seqs, pseudocounts = 0.0 ):
return create_pwm_from_sequences( seqs, amino_acids.amino_acids, pseudocounts=pseudocounts )
def create_dna_pwm_from_sequences( seqs, pseudocounts=0.0 ):
return create_pwm_from_sequences( seqs, list('acgt'), pseudocounts=pseudocounts )
def create_pwm_from_sequences( seqs, alphabet, pseudocounts=0.0 ):
pwm = {}
if len(seqs) == 0: return pwm
L = len( seqs[0] )
for pos in range(L):
pwm[ pos ] = dict( zip( alphabet, [pseudocounts]*len(alphabet) ) )
for s in seqs:
assert len(s) == L
for pos in range(L):
if s[pos] not in alphabet:
sys.stderr.write('logo_tools.create_pwm_from_sequences: skipping bad character %s\n'%(s[pos]))
continue
pwm[ pos ][ s[pos] ] += 1
for pos in range(L):
norm = 1.0 / sum( pwm[pos].values() )
for a in alphabet: pwm[ pos ][ a ] *= norm
check_pwm( pwm )
return pwm
base_partner = {'a': 't', 'c': 'g', 'g': 'c', 't': 'a', 'n': 'n',
'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A', 'N': 'N',
'R': 'Y', 'Y': 'R',
'S': 'S', 'W': 'W',
'K': 'M', 'M': 'K',
'.': '.' }
nucleotide_classes_lower_case = { 'a':'a',
'c':'c',
'g':'g',
't':'t',
'w':'at',
's':'cg',
'k':'gt',
'm':'ac',
'y':'ct',
'r':'ag',
'n':'acgt' }
nuc_match_lower_case = {}
for x in nucleotide_classes_lower_case:
for y in nucleotide_classes_lower_case:
nuc_match_lower_case[(x,y)] = False
for x1 in nucleotide_classes_lower_case[x]:
for y1 in nucleotide_classes_lower_case[y]:
if x1==y1:
nuc_match_lower_case[(x,y)] = True
break
def nucleotide_symbols_match( a_in, b_in ):
a = a_in.lower()
b = b_in.lower()
if a==b: return True
return nuc_match_lower_case.get( (a,b), False )
def reverse_complement( seq ):
newseq = ''
L = len(seq)
for pos in range( L-1, -1, -1 ):
newseq += base_partner[ seq[ pos ] ]
assert len( newseq ) == L
return newseq