-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathaccToFasta.py
86 lines (63 loc) · 2.5 KB
/
accToFasta.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
#!/usr/bin/env python
import argparse
from Bio import SeqIO, Entrez
import matplotlib.pyplot as plt
import os
def arg_parse():
parser = argparse.ArgumentParser(description='HIV sequence analysis')
parser.add_argument('-p', type=str, default=None,
help='Path of ascension files')
return parser.parse_args()
# d: Counter(), p_name:
def plotCounter(d, filename, xName="genotype"):
with plt.style.context("seaborn"):
fig = plt.figure(1, [16, 9])
# plt.rcParams.update({'font.size': 25})
tmp = sorted(zip(d.keys(), d.values()), key=lambda x: -x[1])
keys = [x[0] for x in tmp]
values = [y[1] for y in tmp]
plt.bar(keys, values)
plt.xticks(
rotation=45,
horizontalalignment='right',
# fontweight='heavy',
# fontsize='small'
)
plt.xlabel(xName, fontsize=20)
plt.ylabel('Count', fontsize=20)
plt.title(f'Frequency of each {xName} based on genbank notes', fontsize=20)
# Print image
name = f'{filename}-distribution.png'
plt.savefig(name, bbox_inches='tight', dpi=300)
plt.clf()
### -------------------------------------- Main method start --------------------------------------- ##
def main(args):
''' -------------- Parse list of records ----------------'''
Entrez.email = 'andrewclchan211@vt.edu'
Entrez.apikey = ""
maxUnitsInt = int(25 * (10 ** 3))
outFile = open(os.path.splitext(args.p)[0] + '.fasta', "w")
# If no accession path was passed in
if not args.p:
return
else:
print(f"Reading from file: {args.p}")
with open(args.p, "r") as file:
recordsIds = file.read().splitlines()
recordsIds = recordsIds[0:min(len(recordsIds), maxUnitsInt)]
print(recordsIds)
recordsIds = set(recordsIds)
with Entrez.efetch(db="Protein", id=",".join(list(recordsIds)), rettype="gb", retmode="text") as handle:
for seq_record in SeqIO.parse(handle, "gb"):
seq_record.id = seq_record.annotations['accessions'][0]
if seq_record.id not in recordsIds:
print(f"WARN invalid seq acc: {seq_record.name}")
print(seq_record)
seq_record.description = ""
SeqIO.write(seq_record, outFile, 'fasta')
outFile.close()
### -------------------------------------- Main method end --------------------------------------- ##
if __name__ == "__main__":
args = arg_parse()
print("Starting program")
main(args)