-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathuniqify-genomes.py
executable file
·152 lines (123 loc) · 5.29 KB
/
uniqify-genomes.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
#! /usr/bin/env python
"""
Perform an iterative, greedy clustering of a pile of (genome) files, using
sourmash signatures in memory to reduce memory consumption.
This is a practical alternative to a more principled clustering; see
/~https://github.com/ctb/2017-sourmash-cluster for that.
Author: C. Titus Brown, github.com/ctb/, titus@idyll.org.
This code is under CC0.
"""
import sys
import argparse
import random
import csv
import screed
import os
import shutil
import gzip
import sourmash
from sourmash import load_file_as_signatures
from sourmash.logging import notify
def _append_fasta(outfp, filename):
    """Append every record from 'filename' to the open text handle 'outfp'.

    Records are re-emitted as two-line FASTA (">name" then the sequence).
    """
    print("writing", filename)
    for record in screed.open(filename):
        outfp.write(f">{record.name}\n{record.sequence}\n")


def _copy_into_dir(dirname, filenames):
    """Copy each file in 'filenames' into directory 'dirname', creating it.

    An already-existing directory is reported but not treated as an error,
    matching the original behavior.
    """
    try:
        os.mkdir(dirname)
    except FileExistsError:
        print(f"note - '{dirname}' already exists.")
    for ff in filenames:
        shutil.copy(ff, dirname)


def main():
    """Greedily cluster genome FASTA files by sourmash sketch similarity.

    Each input file is sketched into a MinHash; files are shuffled
    (deterministically, per --seed), then repeatedly: pop a founder,
    gather all remaining files whose similarity (or max containment,
    with --max-containment) to the founder meets --threshold, and emit
    the cluster either as one merged .fa.gz (--merge-files) or as a
    directory of copied files.
    """
    p = argparse.ArgumentParser()
    p.add_argument('signature_sources', nargs='+',
                   help='signature files, directories, and sourmash databases')
    p.add_argument('-k', '--ksize', type=int, default=31)
    p.add_argument('--moltype', default='DNA')  # NOTE(review): parsed but never used below
    p.add_argument('--seed', type=int, default=1)
    p.add_argument('--threshold', type=float, default=0.2)
    p.add_argument('--prefix', default='clust.',
                   help='output filename prefix (can be a path)')
    p.add_argument('--max-containment', action='store_true',
                   help="Use max containment instead of similarity to cluster.")
    p.add_argument('--merge-files', action='store_true',
                   help="Merge fasta files, starting with founder.")
    args = p.parse_args()

    # Template sketch; bugfix: honor -k/--ksize instead of hard-coding 31
    # (the original carried a "@CTB" TODO marking exactly this).
    base_mh = sourmash.MinHash(n=0, ksize=args.ksize, scaled=1000)

    # Sketch every input file up front; siglist holds (filename, MinHash).
    siglist = []
    for filename in args.signature_sources:
        # bugfix: the message previously had no placeholder and printed a
        # literal "(unknown)" instead of the file being loaded.
        notify(f'loading sequences from {filename}')
        with screed.open(filename) as screed_iter:
            mh = base_mh.copy_and_clear()
            for record in screed_iter:
                mh.add_sequence(record.sequence, force=True)
            siglist.append((filename, mh))

    notify(f'loaded {len(siglist)} files total.')
    notify(f'setting random number seed to {args.seed} and shuffling seqs')
    random.seed(args.seed)
    random.shuffle(siglist)

    cluster_summary = []
    pass_n = 0
    while len(siglist):
        notify(f'starting pass {pass_n+1}; {len(siglist)} files remaining.')
        (founder_from, founder) = siglist.pop()
        cluster_summary.append((founder_from, founder, pass_n, 'founder'))

        # Partition the remaining sketches against this founder.
        cluster = []
        leftover = []
        for (sig_from, sketch) in siglist:
            if args.max_containment:
                score = sketch.max_containment(founder)
            else:
                score = sketch.similarity(founder)

            if score >= args.threshold:
                cluster.append((sig_from, sketch))
                cluster_summary.append((sig_from, sketch, pass_n, 'member'))
            else:
                leftover.append((sig_from, sketch))

        member_files = [ff for (ff, _) in cluster]
        if cluster:
            notify(f'clustered {len(cluster)} genomes with founder {founder_from}')
            # output individual founder/cluster
            if args.merge_files:
                with gzip.open(f"{args.prefix}cluster.{pass_n}.fa.gz", "wt") as fp:
                    for ff in [founder_from] + member_files:
                        _append_fasta(fp, ff)
            else:
                prefix = f'{args.prefix}cluster.{pass_n}'
                # bugfix: copy the founder's own file. The original copied
                # 'sig_from' — the stale loop variable from the partition
                # loop — which copied the wrong genome and raised NameError
                # whenever no candidates remained on a pass.
                _copy_into_dir(prefix, [founder_from] + member_files)
                print(f'saved founder and {len(cluster)} files to {prefix}')
        else:
            notify(f'founder {founder_from} is a singleton.')
            if args.merge_files:
                with gzip.open(f"{args.prefix}cluster.{pass_n}.fa.gz", "wt") as fp:
                    _append_fasta(fp, founder_from)
            else:
                prefix = f'{args.prefix}cluster.{pass_n}'
                _copy_into_dir(prefix, [founder_from])
                print(f'copied singleton file to {prefix}')

        # Next pass works only on the files that did not join this cluster.
        siglist = leftover
        pass_n += 1

    # NOTE(review): summary output deliberately disabled upstream ('if 0').
    # If re-enabled it would crash as written: cluster_summary holds raw
    # MinHash objects, which have no .filename / .md5sum() — confirm the
    # intended record type before turning this on.
    if 0:
        # output summary spreadsheet
        headers = ['origin_path', 'name', 'filename', 'md5sum', 'cluster', 'member_type']
        csv_name = f'{args.prefix}summary.csv'
        with open(csv_name, 'wt') as fp:
            w = csv.writer(fp)
            w.writerow(headers)
            for (origin_path, sig, cluster_n, member_type) in cluster_summary:
                name = str(sig)
                filename = sig.filename
                md5sum = sig.md5sum()
                w.writerow([origin_path, name, filename, md5sum, cluster_n, member_type])
        notify(f"wrote {len(cluster_summary)} entries to clustering summary at '{csv_name}'")


if __name__ == '__main__':
    sys.exit(main())