-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmiRNA_targets_v0.3.py
196 lines (177 loc) · 7.93 KB
/
miRNA_targets_v0.3.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
# -*- coding: utf-8 -*-
import numpy as np
import sys
import math
import scipy.stats
from collections import OrderedDict
args = sys.argv
# input arguments
inFile = open(args[1])
miR_file= open(args[2])
k =int(args[3])
#miR_file= open("../top50_ginger.txt")
out = open (str(k) +"mers.txt","w")
hits_file = open (str(k) +"mers_hits.txt","w")
# a function to read a text file in FASTA format and store it into a dictionary
def fastaToDict(inFile):
dict = {}
for line in inFile:
if line[0] == '>':
splitted = line.split(' ')
seqID = splitted[0].strip()[1:] # unique transcript ID
dict[seqID] = ""
else:
dict[seqID] += line.strip().upper()
return dict
# a function to calculate the probability of having a specific sequence "seq" given and first order markov chain (transition_matrix)
def calculate_prob_MM(seq , transition_matrix):
transition_dict ={ "AA":transition_matrix[0],"AC":transition_matrix[1],"AG":transition_matrix[2],"AT":transition_matrix[3],
"CA":transition_matrix[4],"CC":transition_matrix[5],"CG":transition_matrix[6],"CT":transition_matrix[7],
"GA":transition_matrix[8],"GC":transition_matrix[9],"GG":transition_matrix[10],"GT":transition_matrix[11],
"TA":transition_matrix[12],"TC":transition_matrix[13],"TG":transition_matrix[14],"TT":transition_matrix[15]}
p = 0.25
for i in range(1,len(seq)):
p *= transition_dict[seq[i-1] + seq[i]]
return p
# a function to calculate the first order markov chain (transition_matrix) from a sequence
def firstOrderMMConstruction(sequence):
seq_len=len(sequence)
transition_counts ={ "AA":0,"AC":0,"AG":0,"AT":0,
"CA":0,"CC":0,"CG":0,"CT":0,
"GA":0,"GC":0,"GG":0,"GT":0,
"TA":0,"TC":0,"TG":0,"TT":0}
for i in range(1,seq_len):
if sequence[i] in [ 'A', 'C' ,'G', 'T'] and sequence[i-1] in [ 'A' ,'C' ,'G' ,'T']:
transition_counts[sequence[i-1]+sequence[i]] += 1.0
matrix= OrderedDict(sorted(transition_counts.items(), key=lambda t: t[0])) # to restore dictionary original order
values = np.fromiter(matrix.values(), dtype=int)
values = values + 1.0 # adding pseudo_count
new_values = values
new_values[0:4] = values[0:4]/sum(values[0:4])
new_values[4:8] = values[4:8]/sum(values[4:8])
new_values[8:12] = values[8:12]/sum(values[8:12])
new_values[12:16] = values[12:16]/sum(values[12:16])
print(matrix)
return new_values
# a function to generate a sequence of length "seq_len" given first order markov chain (transition_matrix)
def firstOrderMMGeneration(transitionMatrix,seq_len):
alphabet = ['A','C','G','T']
s= ''.join(np.random.choice(alphabet,1,[.25,.25,.25,.25])) # generate random first letter
for i in range(1,seq_len):
if s[i-1] == 'A':
s += ''.join(np.random.choice(alphabet,1,p = transitionMatrix[0:4]))
elif s[i-1] == 'C':
s += ''.join(np.random.choice(alphabet,1,p = transitionMatrix[4:8]))
elif s[i-1] == 'G':
s += ''.join(np.random.choice(alphabet,1,p = transitionMatrix[8:12]))
else:
s += ''.join(np.random.choice(alphabet,1,p = transitionMatrix[12:16]))
return s
# a hash function that takes a sequence and return a hash value
def encoding(seq):
k=len(seq)
str=''
c=''
for i in range(0,k):
if(seq[i] == 'A' or seq[i] == 'a'):
str='00'
elif (seq[i] == 'C' or seq[i] == 'c'):
str='01'
elif (seq[i] == 'G' or seq[i] == 'g'):
str='10'
elif (seq[i] == 'T' or seq[i] == 't'):
str='11'
elif (seq[i] == 'N' or seq[i] == 'n'):
return -1
c=c+str
return int(c,2)
def reverseComplement (s):
complement = {'A':'T','C':'G','G':'C','T':'A'}
return ''.join([ complement[s[i]] for i in range(len(s)-1,-1,-1)])
def indexing(subject,k): # building hash table for subject database (sequences to be searched)
num_seqs=len(subject)
hash_tbl= [[] for i in range(4**k)]
keys = list(subject.keys())
for i in range(0,num_seqs):
seq=subject[keys[i]]
for j in range(0,len(seq) -k + 1): # enumerating overlapping k-mers
k_mer= seq[j:j+k]
hash_tbl[encoding(k_mer)].append([i+1,j+1]) # 1_based indexing
return hash_tbl,list(subject.keys())
def main(miR_file,k):
utr_regions = fastaToDict(inFile)
len_dict = {}
firstOrderModels = {}
utr_regions_keys = list(utr_regions.keys())
for p in range(0,len(utr_regions)):
key = utr_regions_keys[p]
firstOrderModels[key] = firstOrderMMConstruction(utr_regions[key])
len_dict [key] = float(len(utr_regions[key]))
[hash_tbl, transcriptIDs] = indexing(utr_regions,k)
dict = {}
miR_seed = {}
# reading miRNAs sequences files
for line in miR_file:
splitted = line.split(" ")
if line[0] == '>':
if len(splitted[0]) > 1:
id = splitted[0][1:].strip()
else:
id = splitted[1].strip()
else :
if k == 8:
seq = splitted[0][0:8]
elif k ==7:
seq = splitted[0][1:8] # seed region only
elif k == 6:
seq = splitted[0][1:7]
else:
seq = splitted[0][3:8]
seq = seq.replace("U","T")
seq = seq.replace("u","t")
miR_seed [id] = seq.strip()
dict[id] = {}
for n in range(0,len(transcriptIDs)):
dict[id] [transcriptIDs[n]] = 0.0
for j in range(0,len(seq) - k + 1): # enumerating overlapping kmers
k_mer= seq[j:j+k]
kmer_rev = reverseComplement(k_mer)
hits = hash_tbl[encoding(kmer_rev)]
for i in range(0,len(hits)):
hits_file.write(k_mer +"\t"+ id +"\t"+ transcriptIDs[hits[i][0]-1] +"\t" + str(hits[i][1]) +"\t" + str(len_dict[transcriptIDs[hits[i][0]-1]]) + "\n" )
dict[id] [transcriptIDs[hits[i][0]-1]] += 1
return dict,len_dict,firstOrderModels ,miR_seed
#########################################################
# rwriting output files
hits_file.write("miRNA_seed" + "\t" +"miRNA_ID" + '\t' +"targetSeqID" + "\t" + "hit_start_position" + "\t" +
"targetSeqLength"+ "\n")
[dict, len_dict,models,miR_seed] = main(miR_file,k)
out.write('%-14s \t %20s \t %20s \t %25s \t %30s \t %5s\n' %("miRNA_seed_" + str(k)+ "nt","miRNA_ID", "targetSeqID",
"#_of_occurrences_of_"+str(k)+"mers","#_of_all_possible_" + str(k)+"mers_in_3UTR", "pValue"))
pValue = {}
dKeys = list(dict.keys())
for i in range(0,len(dict)):
mir = dKeys[i]
keys = list(dict[mir].keys()) # transcripts IDs
pValue[mir] ={}
num_ocur = 0
num_possible_kmers = 0
for j in range(0,len(dict[mir])):
pValue[mir][keys[j]] = 0.0
for j in range(0,len(dict[mir])):
num_ocur = dict[mir][keys[j]]
num_possible_kmers = float(len_dict[keys[j]]-k+1)
p = calculate_prob_MM(miR_seed[mir], models[keys[j]])
transc_p = scipy.stats.binom.sf(num_ocur-1,num_possible_kmers,p )
if transc_p >0:
pValue[mir][keys[j]] += -2.0 * math.log(transc_p) # gene-based p-values !!!!!!!!!!!!
else:
transc_p = 1
pValue[mir][keys[j]] += -2.0 * math.log(transc_p)
out.write('%-14s \t %20s \t %20s \t %25s \t %30s \t %4f \n' %(miR_seed[mir],mir,keys[j], str(num_ocur), str(num_possible_kmers)
, transc_p ))
# close files
inFile.close()
miR_file.close()
out.close()
hits_file.close()