-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathtaxon_regroup.py
64 lines (61 loc) · 2.02 KB
/
taxon_regroup.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
import sys
import glob
import os
from Bio import SeqIO
from Bio import AlignIO
from Bio import Phylo
# Command-line help, printed verbatim whenever the arguments are unusable.
USAGE_LINES = (
    "FORMAT: python taxon_regroup.py [option: -tree (regroup based on tree topology), -seqlen (regroup based on seqlen)] [folder] ([tree file])",
    "EXAMPLE: python taxon_regroup.py -tree ./fasta tree.tre",
    "EXAMPLE: python taxon_regroup.py -seqlen ./fasta",
)

# Folder (relative to the working directory) that receives the reordered files.
OUTPUT_DIR = "./regrouped"

# Options that need a third positional argument (tree file / sequence format).
MODES_WITH_EXTRA = ("-tree", "-seqname")
MODES = ("-tree", "-seqlen", "-seqname")


def parse_args(argv):
    """Validate the command line.

    argv is the full argument vector including the program name.

    Returns a ``(mode, folder, extra)`` tuple, where ``extra`` is the tree
    file for ``-tree``, the sequence format for ``-seqname`` and ``None`` for
    ``-seqlen``.  Returns ``None`` when there are too few arguments, the
    option is not recognised, or the third argument required by the option
    is missing.  Surplus arguments are ignored.
    """
    if len(argv) < 3:
        return None
    mode, folder = argv[1], argv[2]
    if mode not in MODES:
        return None
    extra = None
    if mode in MODES_WITH_EXTRA:
        # argv[3] is mandatory here; checking first avoids an IndexError.
        if len(argv) < 4:
            return None
        extra = argv[3]
    return mode, folder, extra


def ungapped_length(sequence):
    """Return the length of *sequence* ignoring '-', '?', 'N' and 'X'.

    The sequence is converted with ``str()`` and upper-cased first, so
    lower-case 'n' and 'x' are ignored as well.
    """
    text = str(sequence).replace("-", "").upper()
    for placeholder in "N?X":
        text = text.replace(placeholder, "")
    return len(text)


def regroup_order(records, mode, tree_rank=None):
    """Return the sequence IDs of *records* in the order required by *mode*.

    records   -- dict mapping sequence ID to sequence
    mode      -- "-seqlen", "-tree" or "-seqname"
    tree_rank -- dict mapping leaf name to its position in the tree
                 (only used by "-tree")

    "-seqlen" orders by ungapped length, longest first.  "-tree" orders by
    tree position and raises KeyError for an ID that is not in *tree_rank*.
    "-seqname" orders by the full sequence ID, so IDs sharing a first
    character also come out in a reproducible order.  Any other mode yields
    an empty list.
    """
    if mode == "-seqlen":
        return sorted(records,
                      key=lambda name: ungapped_length(records[name]),
                      reverse=True)
    if mode == "-tree":
        return sorted(records, key=lambda name: tree_rank[name])
    if mode == "-seqname":
        return sorted(records)
    return []


def output_path(input_path):
    """Map an input file to its output file inside OUTPUT_DIR.

    The output name is the input's base name up to its first '.', with the
    extension '.fas'.
    """
    # os.path.basename copes with the platform's separator, unlike split("/").
    stem = os.path.basename(input_path).split(".")[0]
    return OUTPUT_DIR + "/" + stem + ".fas"


def main(argv=None):
    """Reorder the sequences of every alignment file in a folder.

    argv defaults to ``sys.argv``.  Prints the usage text and exits when the
    arguments are unusable.  Otherwise each input file is parsed with
    ``SeqIO`` and rewritten to OUTPUT_DIR as FASTA-style text with one
    sequence per line.
    """
    if argv is None:
        argv = sys.argv
    parsed = parse_args(argv)
    if parsed is None:
        for line in USAGE_LINES:
            print(line)
        sys.exit()
    mode, inputfolder, extra = parsed

    tree_rank = None
    form = "fasta"
    if mode == "-tree":
        print("tree mode")
        tree = Phylo.read(extra, "newick")
        tree.ladderize()
        # Rank every leaf by its position in the ladderized tree.
        tree_rank = {}
        for position, leaf in enumerate(tree.get_terminals()):
            print(leaf.name)
            tree_rank[leaf.name] = position
    elif mode == "-seqlen":
        print("seqlen mode")
    else:
        form = extra
        print("seqname mode, format: " + form)

    # -seqname accepts any file in the folder; the other modes only *.fas.
    pattern = "/*" if mode == "-seqname" else "/*.fas"
    files = glob.glob(inputfolder + pattern)

    if not os.path.exists(OUTPUT_DIR):
        os.makedirs(OUTPUT_DIR)

    for path in files:
        # "/*" can also match sub-directories, which cannot be opened.
        if not os.path.isfile(path):
            continue
        records = {}
        with open(path, "r") as handle:
            for seq in SeqIO.parse(handle, form):
                # A repeated ID replaces the earlier record.
                records[seq.id] = seq.seq
        # Work out the order before creating the output file, so a failure
        # here does not leave an empty file behind.
        ordered = regroup_order(records, mode, tree_rank)
        with open(output_path(path), "w") as out:
            for name in ordered:
                out.write(">" + str(name) + "\n")
                out.write(str(records[name]) + "\n")


if __name__ == "__main__":
    main()