Skip to content

Commit

Permalink
Fix #470 by overriding MIDI subtypes based on main sample's subtype.
Browse files Browse the repository at this point in the history
  • Loading branch information
donkirkby committed Jun 18, 2019
1 parent ee8e707 commit 607acfa
Show file tree
Hide file tree
Showing 2 changed files with 44 additions and 7 deletions.
17 changes: 10 additions & 7 deletions micall/resistance/resistance.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,9 +143,9 @@ def combine_aminos(amino_csv, midi_amino_csv, failures):
except LowCoverageError as ex:
failures[(seed, region, is_midi)] = ex.args[0]
continue
midi_rows[seed] = [row
for row in rows
if 226 < int(row['refseq.aa.pos'])]
midi_rows[get_genotype(seed)] = [row
for row in rows
if 226 < int(row['refseq.aa.pos'])]
is_midi = False
for (seed, region), rows in groupby(DictReader(amino_csv),
itemgetter('seed', 'region')):
Expand All @@ -157,7 +157,8 @@ def combine_aminos(amino_csv, midi_amino_csv, failures):
main_rows = []
low_coverage_message = ex.args[0]
if region.endswith('-NS5b'):
region_midi_rows = midi_rows.pop(seed, None)
genotype = get_genotype(seed)
region_midi_rows = midi_rows.pop(genotype, None)
if region_midi_rows is None:
region_midi_rows = all_rows
if is_midi_separate:
Expand All @@ -171,14 +172,15 @@ def combine_aminos(amino_csv, midi_amino_csv, failures):
low_coverage_message = None
except LowCoverageError:
region_midi_rows = []
main_rows = combine_midi_rows(main_rows, region_midi_rows)
main_rows = combine_midi_rows(main_rows, region_midi_rows, seed)
if low_coverage_message:
failures[(seed, region, is_midi)] = low_coverage_message
yield from main_rows

# Check for MIDI regions that had no match.
for seed, rows in sorted(midi_rows.items()):
for genotype, rows in sorted(midi_rows.items()):
region = rows[0]['region']
seed = rows[0]['seed']
failures.setdefault((seed, region, False), NOTHING_MAPPED_MESSAGE)
yield from rows

Expand All @@ -193,7 +195,7 @@ def write_failure(fail_writer, seed, region, reason):
reason=reason))


def combine_midi_rows(main_rows, midi_rows):
def combine_midi_rows(main_rows, midi_rows, seed):
main_row_map = {int(row['refseq.aa.pos']): row
for row in main_rows}
midi_row_map = {int(row['refseq.aa.pos']): row
Expand All @@ -208,6 +210,7 @@ def combine_midi_rows(main_rows, midi_rows):
if pos <= 336:
yield main_row
elif main_row is None:
midi_row['seed'] = seed # Override MIDI seed with main seed.
yield midi_row
elif (pos <= 336 and
int(main_row['coverage']) > int(midi_row['coverage'])):
Expand Down
34 changes: 34 additions & 0 deletions micall/tests/test_resistance.py
Original file line number Diff line number Diff line change
Expand Up @@ -309,6 +309,40 @@ def test_combine_aminos_ns5b_multiple_seeds():
check_combination(amino_csv, midi_amino_csv, expected_csv, expected_failures)


def test_combine_aminos_ns5b_multiple_subtypes():
""" Main sample's subtype overrides MIDI subtype when they disagree. """
amino_csv = StringIO("""\
seed,region,q-cutoff,query.nuc.pos,refseq.aa.pos,\
A,C,D,E,F,G,H,I,K,L,M,N,P,Q,R,S,T,V,W,Y,*,X,partial,del,ins,clip,g2p_overlap,coverage
HCV-4c,HCV4-ED43-NS5b,15,1,1,0,0,0,0,0,0,0,0,10000,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10000
HCV-4c,HCV4-ED43-NS5b,15,4,2,0,0,0,0,10000,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10000
HCV-4c,HCV4-ED43-NS5b,15,7,228,10000,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10000
""")
amino_csv.name = 'main_amino.csv'
midi_amino_csv = StringIO("""\
seed,region,q-cutoff,query.nuc.pos,refseq.aa.pos,\
A,C,D,E,F,G,H,I,K,L,M,N,P,Q,R,S,T,V,W,Y,*,X,partial,del,ins,clip,g2p_overlap,coverage
HCV-4m,HCV4-ED43-NS5b,15,1,558,0,0,0,0,0,0,0,0,20000,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,20000
HCV-4m,HCV4-ED43-NS5b,15,1,559,0,0,0,0,0,0,0,0,20000,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,20000
HCV-4m,HCV4-ED43-NS5b,15,4,560,0,0,0,0,20000,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,20000
HCV-4m,HCV4-ED43-NS5b,15,7,561,20000,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,20000
""")
midi_amino_csv.name = 'midi_amino.csv'
expected_csv = """\
seed,region,q-cutoff,query.nuc.pos,refseq.aa.pos,\
A,C,D,E,F,G,H,I,K,L,M,N,P,Q,R,S,T,V,W,Y,*,X,partial,del,ins,clip,g2p_overlap,coverage
HCV-4c,HCV4-ED43-NS5b,15,1,1,0,0,0,0,0,0,0,0,10000,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10000
HCV-4c,HCV4-ED43-NS5b,15,4,2,0,0,0,0,10000,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10000
HCV-4c,HCV4-ED43-NS5b,15,7,228,10000,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10000
HCV-4c,HCV4-ED43-NS5b,15,1,558,0,0,0,0,0,0,0,0,20000,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,20000
HCV-4c,HCV4-ED43-NS5b,15,1,559,0,0,0,0,0,0,0,0,20000,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,20000
HCV-4c,HCV4-ED43-NS5b,15,4,560,0,0,0,0,20000,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,20000
HCV-4c,HCV4-ED43-NS5b,15,7,561,20000,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,20000
"""

check_combination(amino_csv, midi_amino_csv, expected_csv)


def test_combine_aminos_low_coverage_in_midi():
amino_csv = StringIO("""\
seed,region,q-cutoff,query.nuc.pos,refseq.aa.pos,\
Expand Down

0 comments on commit 607acfa

Please sign in to comment.