-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathtrimandpoolfastq.py
169 lines (157 loc) · 5.56 KB
/
trimandpoolfastq.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
##################################
# #
# Last modified 07/10/2011 #
# #
# Georgi Marinov #
# #
##################################
import sys
try:
import psyco
psyco.full()
except:
pass
def main(argv):
if len(argv) < 3:
print 'usage: python %s <list of input files> outfileprefix <bpToKeep | max> [-trim5 bp] [-flowcellID flowcell] [-addEnd 1 | 2] [-replace string newstring | blank]' % argv[0]
sys.exit(1)
inputfilename = argv[1]
doMax=False
if argv[3] == 'max':
doMax=True
trim='max'
else:
trim = int(argv[3])
outputfilename = argv[2] + '.' + str(trim) + 'mers.fastq'
doFlowcellID=False
if '-flowcellID' in argv:
doFlowcellID=True
flowcellID=argv[argv.index('-flowcellID')+1]
print 'will include flowcell ID', flowcellID, 'in reads headers'
dotrim5=False
if '-trim5' in argv:
dotrim5=True
trim=int(argv[argv.index('-trim5')+1])
print 'will trim ', trim, 'bp from the 5-end'
outputfilename = inputfilename.split('.fastq')[0] + '.' +str(trim)+'bp-5prim-trim.fastq'
doAddEnd=False
if '-addEnd' in argv:
doAddEnd=True
END=argv[argv.index('-addEnd')+1]
print 'will add', '/'+END, 'to read IDs'
doReplace=False
if '-replace' in argv:
doReplace=True
oldstring=argv[argv.index('-replace')+1]
newstring=argv[argv.index('-replace')+2]
if newstring == 'blank':
newstring=''
print 'will replace', oldstring, 'with', newstring, 'in read IDs'
linelist=open(inputfilename)
outfile = open(outputfilename, 'w')
for line1 in linelist:
file=line1.strip().split('\t')[0]
input_stream = open(file)
i=0
shorter=0
if dotrim5:
i=1
j=0
for line in input_stream:
if i==3 and line[0]=='+':
plus='+\n'
i=4
continue
if i==1 and line[0]=='@':
if doFlowcellID and flowcellID not in line:
ID='@'+flowcellID+'_'+line.replace(' ','_')[1:-1]+'\n'
else:
ID=line.replace(' ','_')
if doAddEnd:
ID=ID.strip()+'/'+END+'\n'
if doReplace:
ID=ID.replace(oldstring,newstring)
i=2
continue
if i==4:
scores=line
i=1
if doMax:
scores=line
else:
scores=line[trim:len(line)]+'\n'
continue
if i==2:
i=3
plus=''
plus=ID
j=j+1
if j % 1000000 == 0:
print file, j, 'reads processed'
if doMax:
outfile.write(ID)
outfile.write(line.replace('.','N'))
outfile.write(plus)
outfile.write(scores)
else:
sequence=line[0:trim].replace('.','N')+'\n'
outfile.write(ID)
outfile.write(sequence)
outfile.write(plus)
outfile.write(scores)
continue
else:
i=1
j=0
for line in input_stream:
if i==1 and line[0]=='@':
if doFlowcellID and flowcellID not in line:
ID='@'+flowcellID+'_'+line.replace(' ','_')[1:-1]+'\n'
else:
ID=line.replace(' ','_')
if doAddEnd:
ID=ID.strip()+'/'+END+'\n'
if doReplace:
ID=ID.replace(oldstring,newstring)
i=2
continue
if i==2:
i=3
j=j+1
if j % 1000000 == 0:
print file, j, 'reads processed'
if doMax:
sequence=line
else:
if len(line.strip())<trim:
shorter+=1
sequence=line.strip().replace('.','N')+'\n'
else:
sequence=line[0:trim].replace('.','N')+'\n'
continue
if i==3 and line[0]=='+':
plus='+\n'
i=4
continue
if i==4:
i=1
if doMax:
scores=line
outfile.write(ID)
outfile.write(sequence)
outfile.write(plus)
outfile.write(line)
else:
if len(line.strip())<trim:
continue
scores=line[0:trim]+'\n'
outfile.write(ID)
outfile.write(sequence)
outfile.write(plus)
outfile.write(scores)
continue
outfile.close()
if shorter>0:
print shorter, 'sequences shorter than desired length'
if __name__ == '__main__':
main(sys.argv)