-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathmain_test.py
667 lines (525 loc) · 23 KB
/
main_test.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
import sys, os, csv, re
from block_node import Node
from fosmid_edge import Edge
import argparse
from contig_loc import ContigLocation
import pysam
import pdb
############# TO RUN ###############
# from graph directory:
# python main.py full_node_list.txt full_unique_mapped_ends.txt
parser = argparse.ArgumentParser(description='Create initial graph from list of alignments and fosmid paired ends, then move nodes with edges in different scaffolds')
parser.add_argument('alignments', metavar='*.tsv', help='a tab delimited reference to assembly alignment file from nucmer')
parser.add_argument('fosmids', metavar='*.tsv', help = 'a fosmid paired end file')
args = parser.parse_args()
alignment_file = args.alignments
fosmid_pairs = args.fosmids
scaffolds = [] #contains head node for every scaffold, each of which is a linked list
line_indexed_nodes = [] #to retrieve node at line n of alignment_file, call line_indexed_nodes[n-1]
bad_edges = [] #used for keeping track of "bad" edges (those with endpoints in different scaffolds) so that they can be handled accordingly
with open(alignment_file) as a_f:
alignment_data = csv.reader(a_f, delimiter="\t")
#TODO refactor to use Scaffold (upcoming) data structure
#initialize some dummy data so my script can dynamically begin construction from the
#beginning of the file, instead of needing to hard code the initial case
prev_scaf = " "
dummy_CL = ContigLocation("dummy", 0, 0)
dummy_node = Node(-1, dummy_CL, dummy_CL)
curr_node = dummy_node
for line_id, block in enumerate(alignment_data):
print(line_id)
#load in data from the current line
asm_scaf = block[0]
asm_start = int(block[1])
asm_stop = int(block[2])
ref_chr = block[3]
ref_start = int(block[4])
ref_stop = int(block[5])
line_num = int(block[8])
if(prev_scaf != asm_scaf): #end of an assembly scaffold
curr_node.next = None #TODO may not be necessary
curr_ref_CL = ContigLocation(ref_chr, ref_start, ref_stop)
curr_asm_CL = ContigLocation(asm_scaf, asm_start, asm_stop)
curr_node = Node(line_num, curr_ref_CL, curr_asm_CL)
#TODO again, hardcoded while testing; make dynamic parameter in later version
region = asm_scaf + ":" + str(asm_start) + "-" + str(asm_stop)
#faidx returns a string; it begins with the scaffold name, followed by a \n, after which one or more lines
#of DNA are present, also separated by \n; only care about raw string sequence, hence split and [1:]
node_seq = "".join(pysam.faidx("assembly.fasta", region).split()[1:])
curr_node.seq = node_seq
assert len(node_seq) == len(curr_node.asm)
#within this case, curr_node is the first node in a scaffold, so add it to the list of head nodes
scaffolds.append(curr_node)
line_indexed_nodes.append(curr_node)
prev_scaf = asm_scaf
#TODO isn't this just an else case ~~~ 6/7/18- why not?
continue #work is done for this cycle, and thanks to python's scope (or lack thereof)
#we can just skip to the next iteration of the loop
new_ref_CL = ContigLocation(ref_chr, ref_start, ref_stop)
new_asm_CL = ContigLocation(asm_scaf, asm_start, asm_stop)
new_node = Node(line_num, new_ref_CL, new_asm_CL, p_node = curr_node)
#TODO again, hardcoded while testing; make dynamic parameter in later version
region = asm_scaf + ":" + str(asm_start) + "-" + str(asm_stop)
#this returns scaffold name and sequence string separated by \n; only care about seq, hence split and [1]
node_seq = "".join(pysam.faidx("assembly.fasta", region).split()[1:])
new_node.seq = node_seq
assert len(node_seq) == len(new_node.asm)
curr_node.next = new_node
curr_node = new_node
line_indexed_nodes.append(curr_node)
prev_scaf = asm_scaf
with open(fosmid_pairs) as f_p:
pair_data = csv.reader(f_p, delimiter="\t")
for pair in pair_data:
left_ref_start = int(pair[1])
left_ref_stop = int(pair[2])
left_asm_start = int(pair[4])
left_asm_stop = int(pair[5])
left_block_line = int(pair[7])
right_ref_start = int(pair[9])
right_ref_stop = int(pair[10])
right_asm_start = int(pair[12])
right_asm_stop = int(pair[13])
right_block_line = int(pair[15])
node1 = line_indexed_nodes[left_block_line - 1]
node2 = line_indexed_nodes[right_block_line - 1]
edge = Edge(node1, node2, left_ref_start, left_ref_stop, left_asm_start, left_asm_stop, right_ref_start, right_ref_stop, right_asm_start, right_asm_stop)
if node1 is node2: #prevent duplicate edges in the same node
node1.add_edge(edge)
else:
node1.add_edge(edge)
node2.add_edge(edge)
if edge.weight == -10:
bad_edges.append(edge)
#don't need this anymore- clear memory and unnecessary references that may keep nodes removed from main assembly alive and "orphaned"
line_indexed_nodes = []
#TODO hardcoded during testing; make this an input parameter so it's dynamic TODO
samfile = pysam.AlignmentFile("../novoalign/imp3.merged.sorted.bam", "rb")
def find_chunk_region(s_e, b_n, o_n):
seed_edge = s_e
bad_node = b_n
other_node = o_n
assert bad_node.asm.name != other_node.asm.name
#initialize variables
searched_nodes = [bad_node]
region_lo = seed_edge.edge_low(bad_node)
region_hi = seed_edge.edge_high(bad_node)
other_name = other_node.asm.name
#hack to make the search loops handle all the work, even the initial case, despite possible edge case (if bad_node has sorted edges at either end
#that match to another scaffold's node, aside from other_node, search would fail to extend boundaries from seed_edge)
initial_edges = bad_node.get_sorted_edges()
#TODO in this case it should be more efficient to write a single loop with an if/elif/else block placing into left/right/edge list respectively
# initial_left_list = [ e for e in initial_edges if e.edge_low(bad_node) < seed_edge.edge_low(bad_node) ]
# initial_right_list = [ e for e in initial_edges if e.edge_high(bad_node) > seed_edge.edge_high(bad_node) ]
# edge_cases = [ e for e in initial_edges if e not in initial_left_list and e not in initial_right_list ]
initial_left_list = []
initial_right_list = []
edge_cases = []
for edge in initial_edges:
if edge.edge_low(bad_node) < seed_edge.edge_low(bad_node):
initial_left_list.append(edge)
elif edge.edge_high(bad_node) > seed_edge.edge_high(bad_node):
initial_right_list.append(edge)
else:
edge_cases.append(edge)
# print(len(edge_cases))
assert seed_edge in edge_cases
#edges that will be removed from bad_edges once the chunk region is fully defined
edge_list = set() #TODO construction of this set in this method is not necessarily safe; remove this portion and instead develop & rely on edge_to_remove
edge_list.add(seed_edge)
edge_list.update(edge_cases) #TODO in the future, should actually consider these edges; for now, they are being ignored-
#they are removed from bad_edges, but still exist; may want to explicitly remove them from their containing nodes as well
#however, make sure not to accidentally delete seed_edge, as it will be in edge_cases
#attempt to extend region_hi by searching to the right
curr_node = bad_node #loop iteration variable
continue_search = True #flag to allow breaking out of the outer while loop from the inner for loop
first_iteration = True #flag to set initial conditions in the first iteration of the while loop
while curr_node is not None and continue_search: #if curr_node is None, it's the end of a scaffold
node_searched = False #flag that tracks whether or not region_hi has been extended into this node
if first_iteration:
search_space = initial_right_list
node_searched = True #by definition, since curr_node is bad_node and region_hi is within bad_node
first_iteration = False
else:
search_space = curr_node.get_sorted_edges()
for edge in search_space:
#changed to the existing implementation because of a small but possibly significant edge case:
#edges are sorted in ascending order of .edge_low(), which makes no guarantees as to the ordering of .edge_high()
#for example, the following list is a valid example (with edges displayed as a tuple of (edge_low, edge_high)
# [(1,10), (2,8), (3, 20)]
# region_hi is intended to capture 20, but the line below would only capture up to 10 then halt
# if (edge.weight != -10 or (edge.opposite_node(bad_node) is other_node) ) and (edge.edge_high() > region_hi):
#an edge weight of -10 indicates an edge with endpoints in 2 different scaffolds; thus region_hi can only be
#extended by edges that either have both endpoints in the current scaffold, or have their second endpoint in
#the same scaffold as other_node. Conversely, this means the search halts when either an edge with an endpoint in
#a third scaffold is encountered, or when the end of the scaffold is reached (when curr_node is None)
if edge.weight != -10 or (edge.opposite_node(curr_node).asm.name == other_name):
if edge.edge_high(curr_node) > region_hi:
region_hi = edge.edge_high(curr_node)
node_searched = True
edge_list.add(edge)
else:
continue_search = False #region_hi cannot be extended any further, so stop the search (while) loop
break
#make sure nodes with no edges don't prematurely halt the search
if len(search_space) == 0:
node_searched = True
region_hi = curr_node.asm.high()
if node_searched: #indicates that region_hi was extended into this node during the search
#searched_nodes[0] will always be the .prev of the node containing region_lo, and searched_nodes[-1] will always be the .next
#of the node containing region_hi; one or both could potentially be NoneType, allowing for easy identification of scaffold-spanning
#chunk regions later
curr_node = curr_node.next
searched_nodes.append(curr_node)
else:
#in this case, region_hi was not extended into this node, so this should be searched_nodes[-1]; it would have already
#been appended in the previous iteration of this while loop, so there is no work to be done and the search is complete
break
#attempt to extend region_lo by searching to the left
curr_node = bad_node
continue_search = True
#want to be able to efficiently add to the beginning of this list; still want efficient random access, so no deque (TODO- is random access necessary?)
searched_nodes.reverse()
first_iteration = True
while curr_node is not None and continue_search:
node_searched = False
if first_iteration:
search_space = initial_left_list
node_searched = True
first_iteration = False
else:
search_space = curr_node.get_sorted_edges()
#otherwise this would always choose search_space[0] due to ordering
search_space.reverse()
for edge in search_space:
if (edge.weight != -10 or (edge.opposite_node(curr_node).asm.name == other_name) ) and (edge.edge_low(curr_node) < region_lo):
region_lo = edge.edge_low(curr_node)
node_searched = True
edge_list.add(edge)
else:
continue_search = False
break
if len(search_space) == 0:
node_searched = True
region_lo = curr_node.asm.low()
if node_searched:
curr_node = curr_node.prev
searched_nodes.append(curr_node)
else:
break
searched_nodes.reverse() #back to normal
for node in searched_nodes:
if node is not None:
assert node.next is not node
assert node.prev is not node
return (searched_nodes, region_lo, region_hi, edge_list)
'''
###################### BEGIN MAIN PORTION OF SCRIPT ##############################
'''
tracker1 = 0
tracker2 = 0
placement = False
while bad_edges: #run as long as bad_edges is not empty
print(len(bad_edges))
#technically, seed_edge and bad_node are not needed in the script's current form, but for legacy and ease of
#use during development, they are defined here. May remove in cleanup down the line.
seed_edge = bad_edges[0]
region1 = find_chunk_region(seed_edge, seed_edge.node1, seed_edge.node2)
region2 = find_chunk_region(seed_edge, seed_edge.node2, seed_edge.node1)
if (region1[2] - region1[1]) > (region2[2] - region2[1]):
bad_node = seed_edge.node1
other_node = seed_edge.node2
region = region1
tracker1 += 1
else:
bad_node = seed_edge.node2
other_node = seed_edge.node1
region = region2
tracker2 += 1
'''
#for some reason this code does not produce errors:
# bad_node = seed_edge.node1
# other_node = seed_edge.node2
# region = region1
#but this does?
bad_node = seed_edge.node2
other_node = seed_edge.node1
region = region2
'''
insert_left = other_node.prev
searched_nodes = region[0]
region_lo = region[1]
region_hi = region[2]
edge_list = region[3]
assert(len(searched_nodes) > 2)
print(len(searched_nodes) - 2)
left_node_exists = region_lo - searched_nodes[1].asm.low() > 1
right_node_exists = region_hi - searched_nodes[-2].asm.high() > 1
assert( searched_nodes[1].asm.low() <= region_lo <= searched_nodes[1].asm.high() )
assert( searched_nodes[-2].asm.low() <= region_hi <= searched_nodes[-2].asm.high() )
'''
################# begin construction of possible new nodes at either end of the region ###################
'''
edges_to_remove = []
chunk_len = (region_hi - region_lo) + 1
if left_node_exists:
left_edges = searched_nodes[1].get_sorted_edges()
for edge in left_edges:
assert edge.edge_low(searched_nodes[1]) >= searched_nodes[1].asm.low()
assert edge.edge_high(searched_nodes[1]) <= searched_nodes[1].asm.high()
left_node_edges = []
border_edges = []
left_chunk_edges = []
for edge in left_edges:
if edge.edge_low(searched_nodes[1]) < region_lo:
if edge.edge_high(searched_nodes[1]) < region_lo:
left_node_edges.append(edge)
else:
border_edges.append(edge)
else:
left_chunk_edges.append(edge)
edges_to_remove.extend(border_edges)
# edges_to_remove.extend(left_chunk_edges) if right_node exists, this will accidentally remove bad edges from right_node, but they should remain there
for edge in border_edges:
edge.destroy()
#TODO may want to refactor this to using internal node methods instead of explicitly setting values like this
searched_nodes[1]._edges = left_chunk_edges
searched_nodes[1]._edges_sorted = False
debug_temp = searched_nodes[1].asm.low()
left_trim_dist = region_lo - searched_nodes[1].asm.low()
left_ref_CL = searched_nodes[1].ref.trim_lo(left_trim_dist)
left_asm_CL = searched_nodes[1].asm.trim_lo(left_trim_dist)
assert( left_asm_CL.low() == debug_temp )
assert( left_asm_CL.high() == region_lo - 1 )
assert( searched_nodes[1].asm.low() == region_lo )
left_asm_og_CL = searched_nodes[1].asm_original.trim_lo(left_trim_dist)
left_node = Node(-1, left_ref_CL, left_asm_CL, left_asm_og_CL, left_node_edges)
for edge in left_node_edges:
#searched_nodes[1] because ownership of these edges has not been transferred from this to left_node yet
assert( edge.edge_low(searched_nodes[1]) >= left_node.asm.low())
assert( edge.edge_high(searched_nodes[1]) <= left_node.asm.high())
left_seq = searched_nodes[1].seq[:left_trim_dist]
left_node.seq = left_seq
assert len(left_seq) == len(left_asm_CL)
searched_nodes[1].seq = searched_nodes[1].seq[left_trim_dist:]
assert len(searched_nodes[1].seq) == len(searched_nodes[1].asm)
checker = searched_nodes[1]
for edge in checker.get_edges():
assert checker.asm.low() <= edge.edge_low(checker) < edge.edge_high(checker) <= checker.asm.high()
if right_node_exists:
right_edges = searched_nodes[-2].get_sorted_edges()
right_node_edges = []
border_edges2 = []
right_chunk_edges = []
for edge in right_edges:
if edge.edge_low(searched_nodes[-2]) <= region_hi:
if edge.edge_high(searched_nodes[-2]) <= region_hi:
right_chunk_edges.append(edge)
else:
border_edges2.append(edge)
else:
right_node_edges.append(edge)
edges_to_remove.extend(border_edges2)
# edges_to_remove.extend(right_chunk_edges) safer than the call that extends by left_chunk_edges (see comment there), but removing anyway to rewrite
#all of the chunk region edge removal logic
for edge in border_edges2:
edge.destroy()
'''
right_chunk_edges = set(searched_nodes[-2].get_edges())
right_chunk_edges.difference_update(right_edges)
'''
searched_nodes[-2]._edges = right_chunk_edges
searched_nodes[-2]._edges_sorted = False
debug_temp2 = searched_nodes[-2].asm.high()
right_trim_dist = searched_nodes[-2].asm.high() - region_hi
right_ref_CL = searched_nodes[-2].ref.trim_hi(right_trim_dist)
right_asm_CL = searched_nodes[-2].asm.trim_hi(right_trim_dist)
assert( right_asm_CL.low() == region_hi )
assert( right_asm_CL.high() == debug_temp2 )
debug_temp = len(right_asm_CL)
right_asm_CL.left = region_lo
right_asm_CL.right = searched_nodes[-2].asm.high() - chunk_len
assert( debug_temp == len(right_asm_CL) )
right_asm_og_CL = searched_nodes[-2].asm_original.trim_hi(right_trim_dist)
right_node = Node(-1, right_ref_CL, right_asm_CL, right_asm_og_CL, right_node_edges)
for edge in right_node_edges:
assert( edge.edge_low(searched_nodes[-2]) >= right_node.asm.low() )
assert( edge.edge_high(searched_nodes[-2]) <= right_node.asm.high() )
right_seq = searched_nodes[-2].seq[(right_trim_dist + 1):]
right_node.seq = right_seq
assert len(right_seq) == len(right_asm_CL)
searched_nodes[-2].seq = searched_nodes[-2].seq[:(right_trim_dist + 1)]
assert len(searched_nodes[-2].seq) == len(searched_nodes[-2].asm)
checker = searched_nodes[-2]
for edge in checker.get_edges():
assert checker.asm.low() <= edge.edge_low(checker) < edge.edge_high(checker) <= checker.asm.high()
############### setting/updating prev and next for all affected nodes ################
#logic explained in pic from 7/5/18
if left_node_exists:
leftmost = left_node
elif right_node_exists:
leftmost = right_node
else:
leftmost = searched_nodes[-1]
if right_node_exists:
rightmost = right_node
elif left_node_exists:
rightmost = left_node
else:
rightmost = searched_nodes[0]
for node in searched_nodes:
if node is not None:
assert node.next is not node
assert node.prev is not node
if leftmost is not None:
leftmost.prev = searched_nodes[0]
if rightmost is not None:
rightmost.next = searched_nodes[-1]
if searched_nodes[0] is not None:
searched_nodes[0].next = leftmost
if searched_nodes[-1] is not None:
searched_nodes[-1].prev = rightmost
if leftmost is not None and rightmost is not None and leftmost.asm.left < rightmost.asm.left:
leftmost.next = rightmost
rightmost.prev = leftmost
searched_nodes[1].prev = insert_left
if insert_left is not None:
insert_left.next = searched_nodes[1]
searched_nodes[-2].next = other_node
other_node.prev = searched_nodes[-2]
for node in searched_nodes:
if node is not None:
assert node.next is not node
assert node.prev is not node
chunk_node = searched_nodes[1]
while chunk_node is not other_node:
edges = chunk_node.get_edges()
for edge in edges:
if edge.weight == -10:
edge_list.add(edge)
chunk_node = chunk_node.next
edge_list.update(edges_to_remove)
b_e_temp_set = set(bad_edges)
b_e_temp_set.difference_update(edge_list) #anything in edge_list that's also in b_e will be removed from b_e
bad_edges = list(b_e_temp_set)
#leaving this line in so the script will give consistent results; may allow user to toggle this in later builds
# bad_edges.sort( key = lambda e : e.edge_low(e.node1) ) #adding to make this stable/make the overall algorithm non-deterministic
#update $scaffolds
if searched_nodes[0] is None:
if searched_nodes[-1] is None and not left_node_exists and not right_node_exists:
#entire scaffold was placed within another
scaffolds.remove(searched_nodes[1])
placement = True
else:
#remove old head/add new one
for index, head in enumerate(scaffolds):
if head is searched_nodes[1]:
scaffolds[index] = leftmost
break
else:
assert(1==2)
if insert_left is None:
#remove old head/add new one
for index, head in enumerate(scaffolds):
if head is other_node:
scaffolds[index] = searched_nodes[1]
break
else:
assert(1==2)
#update edge endpoints (just nodes, not coordinates)
if left_node_exists:
left_node.new_edge_endpoints(searched_nodes[1])
if right_node_exists:
right_node.new_edge_endpoints(searched_nodes[-2])
chunk_node = searched_nodes[1]
shift_dist = other_node.asm.low() - chunk_node.asm.low()
new_name = other_node.asm.name
while chunk_node is not other_node:
chunk_node.shift(shift_dist)
chunk_node.asm.name = new_name
for edge in chunk_node.get_edges():
if edge.node1 is chunk_node:
edge.asm1.name = new_name
elif edge.node2 is chunk_node:
edge.asm2.name = new_name
else:
assert 10 == 20
assert edge.edge_low(chunk_node) >= chunk_node.asm.low()
assert edge.edge_high(chunk_node) <= chunk_node.asm.high()
chunk_node = chunk_node.next
shift_distance = (searched_nodes[-2].asm.high() - other_node.asm.low()) + 1
new_right_node = other_node
while new_right_node is not None:
new_right_node.shift(shift_distance)
for edge in new_right_node.get_edges():
assert edge.edge_low(new_right_node) >= new_right_node.asm.low()
assert edge.edge_high(new_right_node) <= new_right_node.asm.high()
new_right_node = new_right_node.next
if right_node_exists:
if right_node.prev is not None:
shift_dist = (right_node.prev.asm.high() + 1) - right_node.asm.low()
else:
shift_dist = -(right_node.asm.low() - 1) #want right node to start at 1 since its a scaffold head in this case
right_node.shift_edges(shift_dist)
for edge in right_node.get_edges():
assert edge.edge_low(right_node) >= right_node.asm.low()
assert edge.edge_high(right_node) <= right_node.asm.high()
if rightmost is not None:
pre_iter = rightmost.next
elif searched_nodes[-1] is not None:
pre_iter = searched_nodes[-1]
else:
pre_iter = None
if pre_iter is not None:
if pre_iter.prev is not None:
shift_dist = (pre_iter.prev.asm.high() + 1) - pre_iter.asm.low()
else:
shift_dist = -(pre_iter.asm.low() - 1) #want right node to start at 1 since its a scaffold head in this case
while pre_iter is not None:
pre_iter.shift(shift_dist)
for edge in pre_iter.get_edges():
assert edge.edge_low(pre_iter) >= pre_iter.asm.low()
assert edge.edge_high(pre_iter) <= pre_iter.asm.high()
pre_iter = pre_iter.next
for head in scaffolds:
while head is not None:
for edge in head.get_edges():
assert edge.edge_low(head) >= head.asm.low()
assert edge.edge_high(head) <= head.asm.high()
head = head.next
print tracker1, tracker2
if placement:
print("WE DID IT")
with open("placed2.txt", "w") as o_f:
for head in scaffolds:
while head is not None:
o_f.write(str(head))
o_f.write("\n")
head = head.next
o_f.write("\n")
with open("placed2.fasta", "w") as fasta:
total = str(len(scaffolds))
for num, head in enumerate(scaffolds):
print(str(num) + "/" + total)
curr_seq = ""
seq_name = ""
while head is not None:
seq_name = head.asm.name
curr_seq += head.seq
head = head.next
fasta.write(">" + seq_name)
fasta.write("\n")
print("writing fasta seq for this scaffold")
print(len(curr_seq))
loop_num = 0
denom = len(curr_seq)/80.0
while len(curr_seq) > 80:
fasta.write(curr_seq[:80])
fasta.write("\n")
curr_seq = curr_seq[80:]
loop_num += 1
if loop_num % 1500 == 0:
print( str( (loop_num / denom)*100 ) + "%")
fasta.write(curr_seq)
fasta.write("\n")
fasta.write("\n")