Skip to content

Commit

Permalink
Don't complain about low coverage on key positions for #412.
Browse files Browse the repository at this point in the history
Rerun resistance interpretation on multiple run folders.
  • Loading branch information
donkirkby committed Feb 1, 2018
1 parent bbc10f2 commit 736020d
Show file tree
Hide file tree
Showing 2 changed files with 169 additions and 39 deletions.
28 changes: 14 additions & 14 deletions micall/hivdb/hivdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -321,28 +321,28 @@ def write_resistance(aminos, resistance_csv, mutations_csv, algorithms=None):
amino = mutation.variant
pos = mutation.pos
pos_aminos = amino_seq[pos-1]
prevalence = pos_aminos[amino]
prevalence = pos_aminos.get(amino, 0)
mutations_writer.writerow(dict(drug_class=drug_class,
mutation=mutation,
prevalence=prevalence,
genotype=genotype))


def interpret(asi, amino_seq, region):
ref_seq = asi.stds[region]
# TODO: Make this more general instead of only applying to missing MIDI.
is_missing_midi = False
if region.endswith('-NS5b'):
ref_seq = asi.stds[region]
if len(amino_seq) < len(ref_seq):
# Missing MIDI, assume wild type and look for known resistance.
is_missing_midi = True
new_amino_seq = []
for wild_type, old_aminos in zip_longest(ref_seq, amino_seq):
if old_aminos is not None:
new_amino_seq.append(old_aminos)
else:
new_amino_seq.append({wild_type: 1.0})
amino_seq = new_amino_seq
is_missing_midi = region.endswith('-NS5b') and len(amino_seq) < len(ref_seq)

# TODO: Put back the check that all mutation positions have minimum coverage.
if any(amino_seq):
# At least some data found, assume wild type in gaps.
new_amino_seq = []
for wild_type, old_aminos in zip_longest(ref_seq, amino_seq):
if old_aminos:
new_amino_seq.append(old_aminos)
else:
new_amino_seq.append({wild_type: 1.0})
amino_seq = new_amino_seq

result = asi.interpret(amino_seq, region)
if not is_missing_midi:
Expand Down
180 changes: 155 additions & 25 deletions micall/utils/genreport_rerun.py
Original file line number Diff line number Diff line change
@@ -1,30 +1,57 @@
import csv
import os
from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
import shutil
from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter, FileType
from collections import namedtuple
from csv import DictReader, DictWriter
from glob import glob
from itertools import groupby
from operator import itemgetter

from datetime import datetime

from micall.hivdb.genreport import gen_report
from micall.hivdb.hivdb import hivdb
from micall.settings import NEEDS_PROCESSING, pipeline_version, DONE_PROCESSING
from micall.utils.sample_sheet_parser import sample_sheet_parser

SampleGroup = namedtuple('SampleGroup', 'enum names')


def parse_args():
parser = ArgumentParser(
description='Rerun resistance interpretations on a run folder.',
description='Rerun resistance interpretations.',
formatter_class=ArgumentDefaultsHelpFormatter)

parser.add_argument('--source',
'-s',
help='source results folder')
parser.add_argument('--working',
'-w',
help='working folder')
return parser.parse_args()
parser.add_argument(
'-r',
'--results',
help='source results folder in one run')
parser.add_argument(
'-w',
'--working',
help='working folder')
parser.add_argument(
'-d',
'--raw_data',
help='Main RAW_DATA folder to search for runs.')
parser.add_argument(
'-s',
'--samples_csv',
type=FileType(),
help='CSV file with sample and run names.')
parser.add_argument(
'-m',
'--min_run_name',
help='Select all later runs, (e.g., "161201").')
args = parser.parse_args()
source_names = ('results', 'samples_csv', 'min_run_name')
source_count = sum(1 for name in source_names if getattr(args, name) is not None)
if source_count != 1:
parser.error('Must provide one of --results, --samples_csv, or --min_run_name.')
if args.results is None and args.raw_data is None:
parser.error('Must provide --raw_data if not using --results.')
return args


def find_groups(working_paths, source_path):
Expand All @@ -44,7 +71,14 @@ def find_groups(working_paths, source_path):
if sample_name is None:
# Not an HCV sample.
continue
if not os.path.exists(path):
# No results
continue
midi_file = midi_files.get(sample_name + 'MIDI')
if midi_file and not os.path.exists(os.path.join(os.path.dirname(path),
midi_file)):
# No MIDI results
midi_file = None
yield SampleGroup(sample_name, (wide_file, midi_file))


Expand All @@ -61,18 +95,28 @@ def rewrite_file(filename):
writer.writerow(row)


def main():
args = parse_args()
working_paths = split_files(args)

def genreport_rerun(source, working):
run_path = os.path.dirname(os.path.dirname(source))
run_name = os.path.basename(run_path)
publish_path = os.path.join(working,
'rerun_results',
run_name)
os.makedirs(publish_path)
print('##', run_name)
working_paths = split_files(source, working)
sorted_working_paths = sorted(working_paths)
groups = list(find_groups(sorted_working_paths, args.source))
groups = list(find_groups(sorted_working_paths, source))
for group in groups:
working_path = os.path.join(args.working, group.names[0])
midi_path = working_path if group.names[1] is None else os.path.join(
args.working,
group.names[1])
print(working_path)
working_path = os.path.join(working, group.names[0])
if group.names[1] is None:
midi_name = ''
midi_path = working_path
else:
midi_name = group.names[1]
midi_path = os.path.join(
working,
group.names[1])
print(working_path, midi_name)
with open(os.path.join(working_path, 'amino.csv')) as amino_csv, \
open(os.path.join(midi_path, 'amino.csv')) as midi_amino_csv, \
open(os.path.join(working_path, 'resistance.csv'), 'w') as resistance_csv, \
Expand All @@ -91,12 +135,11 @@ def main():
mutations_csv,
resistance_report_csv,
sample_name=sample_name)

for file_name in ('resistance.csv', 'mutations.csv', 'resistance_fail.csv'):
with open(os.path.join(args.working, file_name), 'w') as dest:
with open(os.path.join(publish_path, file_name), 'w') as dest:
dest_writer = csv.writer(dest)
for i, group in enumerate(groups):
working_path = os.path.join(args.working, group.names[0])
working_path = os.path.join(working, group.names[0])
sample_name = os.path.basename(working_path)
with open(os.path.join(working_path, file_name), 'r') as source:
source_reader = csv.reader(source)
Expand All @@ -110,14 +153,14 @@ def main():
dest_writer.writerow(row)


def split_files(args):
def split_files(source, working):
working_paths = set()
file_name = 'amino.csv'
file_path = os.path.join(args.source, file_name)
file_path = os.path.join(source, file_name)
with open(file_path) as f:
reader = DictReader(f)
for sample, rows in groupby(reader, itemgetter('sample')):
working_path = os.path.join(args.working, sample)
working_path = os.path.join(working, sample)
working_paths.add(working_path)
if __name__ == '__live_coding__':
if len(working_paths) > 20:
Expand All @@ -135,4 +178,91 @@ def split_files(args):
return working_paths


def find_recent_results(min_run_name, source_folder):
    """ Find finished Results folders for all runs at or after a cutoff.

    :param min_run_name: earliest run folder name to include (runs compare
        as strings, so date-prefixed names like '161201_...' sort correctly)
    :param source_folder: RAW_DATA folder that contains 'MiSeq/runs'
    :return: sorted list of Results/version_X paths that finished processing
    """
    runs_path = os.path.join(source_folder, 'MiSeq', 'runs')
    results_version = 'version_' + pipeline_version
    candidates = []
    for match in glob(os.path.join(runs_path, '*')):
        run_name = os.path.basename(match)
        # Only numeric-prefixed folders are real run folders.
        if run_name[0].isnumeric() and run_name >= min_run_name:
            candidates.append(os.path.join(runs_path,
                                           run_name,
                                           'Results',
                                           results_version))
    candidates.sort()
    return [results_path
            for results_path in candidates
            if os.path.exists(os.path.join(results_path, DONE_PROCESSING))]


def find_sample_results(samples_csv, source_folder):
    """ Find finished Results folders for the runs named in a samples CSV.

    :param samples_csv: open CSV file with a 'run' column; run names are
        either plain run folder names or '<d-Mon-yy[yy]>[.<machine>]' names
    :param source_folder: RAW_DATA folder that contains 'MiSeq/runs'
    :return: list of Results/version_X paths that finished processing,
        in sorted run order
    :raises RuntimeError: if a date-style run name does not match exactly
        one run folder
    """
    run_names = {row['run'] for row in DictReader(samples_csv)}
    run_paths = []
    for run_name in run_names:
        run_path = os.path.join(source_folder, 'MiSeq', 'runs', run_name)
        if os.path.exists(run_path):
            run_paths.append(run_path)
        else:
            # Fall back to date-style names: rebuild the 'yymmdd_machine'
            # folder prefix and require a unique match on disk.
            run_parts = run_name.split('.')
            if len(run_parts) < 2:
                # NOTE(review): presumably the default machine id - confirm.
                run_parts.append('M01841')
            # Two-digit years only appear in 9-character dates like
            # '01-Dec-16'; everything else is parsed with a 4-digit year.
            format_string = '%d-%b-%y' if len(run_parts[0]) == 9 else '%d-%b-%Y'
            run_date = datetime.strptime(run_parts[0], format_string)
            base_run_name = run_date.strftime('%y%m%d') + '_' + run_parts[1]
            pattern = os.path.join(source_folder,
                                   'MiSeq',
                                   'runs',
                                   base_run_name+'*',
                                   NEEDS_PROCESSING)
            matches = glob(pattern)
            if len(matches) != 1:
                raise RuntimeError('Expected one match for {}, but found: {}'.format(
                    pattern,
                    matches))
            run_paths.append(os.path.dirname(matches[0]))
    run_paths.sort()
    results_folders = []
    for run_path in run_paths:
        # run_path is already a full path. The old code re-joined
        # source_folder/'MiSeq'/'runs' in front of it, which os.path.join
        # only discards when run_path is absolute - a relative source_folder
        # produced a doubled, nonexistent path. Join onto run_path directly.
        done_processing_file = os.path.join(run_path,
                                            'Results',
                                            'version_' + pipeline_version,
                                            DONE_PROCESSING)
        if os.path.exists(done_processing_file):
            results_folders.append(os.path.dirname(done_processing_file))
        else:
            print('!! Results not found:', done_processing_file)
    return results_folders


def clear_working_folder(working):
    """ Delete every entry in the working folder except 'rerun_results'.

    Files are removed directly; subfolders are removed recursively.
    """
    for entry in os.listdir(working):
        if entry == 'rerun_results':
            continue
        entry_path = os.path.join(working, entry)
        if os.path.isdir(entry_path):
            shutil.rmtree(entry_path)
        else:
            os.remove(entry_path)


def main():
    """ Entry point: pick result folders from the chosen source, rerun each.

    parse_args() guarantees exactly one of --results, --min_run_name, or
    --samples_csv was supplied (counted with 'is not None'), so exactly one
    branch below applies.
    """
    args = parse_args()
    if args.results is not None:
        results_folders = [args.results]
    elif args.min_run_name is not None:
        # Must match parse_args()'s 'is not None' counting: a truthiness
        # check here sent an empty --min_run_name string into the
        # samples_csv branch, which then crashed on a None file.
        results_folders = find_recent_results(args.min_run_name, args.raw_data)
    else:
        results_folders = find_sample_results(args.samples_csv, args.raw_data)
    # Start from a clean slate of published results for this rerun.
    rerun_results_path = os.path.join(args.working, 'rerun_results')
    shutil.rmtree(rerun_results_path, ignore_errors=True)
    for results_folder in results_folders:
        clear_working_folder(args.working)
        genreport_rerun(results_folder, args.working)
    clear_working_folder(args.working)


main()

0 comments on commit 736020d

Please sign in to comment.