-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy pathgenbank_nodes_and_names_to_taxonomy.py
78 lines (57 loc) · 3.66 KB
/
genbank_nodes_and_names_to_taxonomy.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
#!/usr/bin/python3
import sys
names_file = sys.argv[1] #'/mnt/lustre/hcgs/joseph7e/databases/ncbi_taxonomy_db/names.dmp'#sys.argv[1]
nodes_file = sys.argv[2] #'/mnt/lustre/hcgs/joseph7e/databases/ncbi_taxonomy_db/nodes.dmp' #sys.argv[2]
names_dictionary = {} # tax_id: name_text
nodes_dictionary = {} # tax_id: rank, parent_tax_id
#master_dict[id] = [rank,name_text,parent_tax_id]
print ('Parsing names.dmp file for tax_id and scientific_name...')
for l in open(names_file):
tax_id, name_text, unique_name, name_class, tmp = [x.lstrip().rstrip() for x in l.rstrip().split('|')]
if tax_id not in names_dictionary.keys() and name_class == 'scientific name' : # only get one item added for each tax_id using valid id (scientific name)
names_dictionary[tax_id] = name_text
#print (tax_id, name_text, unique_name, name_class)
print ('A total of {} unique entries in the names database'.format(len(names_dictionary.keys())))
print (" (this number should equal --> awk -F'\t|' '{print $1}' names.dmp | uniq | wc )")
print ('Parsing nodes.dmp for tax_id, parent_tax_id and taxonomy rank...')
print (' (this file also has genetic code, mito_gen_cod and parent codes as well)')
for n in open(nodes_file):
node_elements = [x.lstrip().rstrip() for x in n.rstrip().split('|')]
#tax_id, parent_tax_id, rank, embl_code, division_id, tmp, gen_code, tmp, mito_code, tmp, tmp, tmp, comments, tmp = [x.lstrip().rstrip() for x in n.rstrip().split('|')]
tax_id, parent_tax_id, rank = node_elements[0:3]
if tax_id not in nodes_dictionary.keys():
nodes_dictionary[tax_id] = [rank, parent_tax_id]
print ('A total of {} unique entries in the names database'.format(len(nodes_dictionary.keys())))
def Expand_taxonomy(tax_id, names_dictionary, nodes_dictionary):
tax_levels = ['superkingdom', 'kingdom', 'phylum', 'subphylum', 'superclass', 'class', 'subclass', 'superorder', 'order', 'superfamily', 'family', 'subfamily', 'genus', 'species']
growing_taxonomy = [names_dictionary[tax_id]]
growing_taxonomy_levels = [nodes_dictionary[tax_id][0]]
master_key = tax_id
while True:
new_tax_id = nodes_dictionary[tax_id][1] # parent_tax_id
new_name = names_dictionary[new_tax_id]
new_rank = nodes_dictionary[new_tax_id][0]
if new_name == 'root':
break
growing_taxonomy.append(new_name)
growing_taxonomy_levels.append(new_rank)
tax_id = new_tax_id
reversed_taxonomy = list(reversed(growing_taxonomy))
reversed_taxonomy_levels = list(reversed(growing_taxonomy_levels))
#final_tax = ['unknown_superkingdom', 'unknown_kingdom', 'unknown_phylum', 'unknown_subphylum', 'unknown_superclass', 'unknown_class', 'unknown_subclass', 'unknown_superorder', 'unknown_order', 'unknown_superfamily', 'unknown_family', 'unknown_subfamily', 'unknown_family', 'unknown_genus', 'unknown_species']
final_tax = ['unknown_' + x for x in tax_levels]
for t in range(len(tax_levels)):
for i in range(len(reversed_taxonomy_levels)):
if reversed_taxonomy_levels[i] == tax_levels[t]:
final_tax[t] = reversed_taxonomy[i]
# format_tax = []
# for i in range(len(final_tax)):
# format_tax.append('D_'+str(i)+'__'+final_tax[i])
#combined_taxonomy = ';'.join(format_tax)
combined_taxonomy = ';'.join(final_tax)
#print (master_key+'\t'+combined_taxonomy)
return master_key, combined_taxonomy
output_file = open('expanded_ncbi_taxonomy.tsv', 'w')
for tax_id in list(names_dictionary.keys()):#.sort(key=int):
t, expanded_taxonomy = Expand_taxonomy(tax_id, names_dictionary, nodes_dictionary)
output_file.writelines(t + '\t' + expanded_taxonomy+'\n')