Skip to content

Commit

Permalink
Don't complain about low coverage on key positions for #412.
Browse files Browse the repository at this point in the history
Rerun resistance interpretation on multiple run folders.
  • Loading branch information
donkirkby committed Feb 1, 2018
1 parent bbc10f2 commit 736020d
Show file tree
Hide file tree
Showing 2 changed files with 169 additions and 39 deletions.
28 changes: 14 additions & 14 deletions micall/hivdb/hivdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -321,28 +321,28 @@ def write_resistance(aminos, resistance_csv, mutations_csv, algorithms=None):
amino = mutation.variant
pos = mutation.pos
pos_aminos = amino_seq[pos-1]
prevalence = pos_aminos[amino]
prevalence = pos_aminos.get(amino, 0)
mutations_writer.writerow(dict(drug_class=drug_class,
mutation=mutation,
prevalence=prevalence,
genotype=genotype))


def interpret(asi, amino_seq, region):
ref_seq = asi.stds[region]
# TODO: Make this more general instead of only applying to missing MIDI.
is_missing_midi = False
if region.endswith('-NS5b'):
ref_seq = asi.stds[region]
if len(amino_seq) < len(ref_seq):
# Missing MIDI, assume wild type and look for known resistance.
is_missing_midi = True
new_amino_seq = []
for wild_type, old_aminos in zip_longest(ref_seq, amino_seq):
if old_aminos is not None:
new_amino_seq.append(old_aminos)
else:
new_amino_seq.append({wild_type: 1.0})
amino_seq = new_amino_seq
is_missing_midi = region.endswith('-NS5b') and len(amino_seq) < len(ref_seq)

# TODO: Put back the check that all mutation positions have minimum coverage.
if any(amino_seq):
# At least some data found, assume wild type in gaps.
new_amino_seq = []
for wild_type, old_aminos in zip_longest(ref_seq, amino_seq):
if old_aminos:
new_amino_seq.append(old_aminos)
else:
new_amino_seq.append({wild_type: 1.0})
amino_seq = new_amino_seq

result = asi.interpret(amino_seq, region)
if not is_missing_midi:
Expand Down
180 changes: 155 additions & 25 deletions micall/utils/genreport_rerun.py
Original file line number Diff line number Diff line change
@@ -1,30 +1,57 @@
import csv
import os
from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
import shutil
from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter, FileType
from collections import namedtuple
from csv import DictReader, DictWriter
from glob import glob
from itertools import groupby
from operator import itemgetter

from datetime import datetime

from micall.hivdb.genreport import gen_report
from micall.hivdb.hivdb import hivdb
from micall.settings import NEEDS_PROCESSING, pipeline_version, DONE_PROCESSING
from micall.utils.sample_sheet_parser import sample_sheet_parser

SampleGroup = namedtuple('SampleGroup', 'enum names')


def parse_args():
parser = ArgumentParser(
description='Rerun resistance interpretations on a run folder.',
description='Rerun resistance interpretations.',
formatter_class=ArgumentDefaultsHelpFormatter)

parser.add_argument('--source',
'-s',
help='source results folder')
parser.add_argument('--working',
'-w',
help='working folder')
return parser.parse_args()
parser.add_argument(
'-r',
'--results',
help='source results folder in one run')
parser.add_argument(
'-w',
'--working',
help='working folder')
parser.add_argument(
'-d',
'--raw_data',
help='Main RAW_DATA folder to search for runs.')
parser.add_argument(
'-s',
'--samples_csv',
type=FileType(),
help='CSV file with sample and run names.')
parser.add_argument(
'-m',
'--min_run_name',
help='Select all later runs, (e.g., "161201").')
args = parser.parse_args()
source_names = ('results', 'samples_csv', 'min_run_name')
source_count = sum(1 for name in source_names if getattr(args, name) is not None)
if source_count != 1:
parser.error('Must provide one of --results, --samples_csv, or --min_run_name.')
if args.results is None and args.raw_data is None:
parser.error('Must provide --raw_data if not using --results.')
return args


def find_groups(working_paths, source_path):
Expand All @@ -44,7 +71,14 @@ def find_groups(working_paths, source_path):
if sample_name is None:
# Not an HCV sample.
continue
if not os.path.exists(path):
# No results
continue
midi_file = midi_files.get(sample_name + 'MIDI')
if midi_file and not os.path.exists(os.path.join(os.path.dirname(path),
midi_file)):
# No MIDI results
midi_file = None
yield SampleGroup(sample_name, (wide_file, midi_file))


Expand All @@ -61,18 +95,28 @@ def rewrite_file(filename):
writer.writerow(row)


def main():
args = parse_args()
working_paths = split_files(args)

def genreport_rerun(source, working):
run_path = os.path.dirname(os.path.dirname(source))
run_name = os.path.basename(run_path)
publish_path = os.path.join(working,
'rerun_results',
run_name)
os.makedirs(publish_path)
print('##', run_name)
working_paths = split_files(source, working)
sorted_working_paths = sorted(working_paths)
groups = list(find_groups(sorted_working_paths, args.source))
groups = list(find_groups(sorted_working_paths, source))
for group in groups:
working_path = os.path.join(args.working, group.names[0])
midi_path = working_path if group.names[1] is None else os.path.join(
args.working,
group.names[1])
print(working_path)
working_path = os.path.join(working, group.names[0])
if group.names[1] is None:
midi_name = ''
midi_path = working_path
else:
midi_name = group.names[1]
midi_path = os.path.join(
working,
group.names[1])
print(working_path, midi_name)
with open(os.path.join(working_path, 'amino.csv')) as amino_csv, \
open(os.path.join(midi_path, 'amino.csv')) as midi_amino_csv, \
open(os.path.join(working_path, 'resistance.csv'), 'w') as resistance_csv, \
Expand All @@ -91,12 +135,11 @@ def main():
mutations_csv,
resistance_report_csv,
sample_name=sample_name)

for file_name in ('resistance.csv', 'mutations.csv', 'resistance_fail.csv'):
with open(os.path.join(args.working, file_name), 'w') as dest:
with open(os.path.join(publish_path, file_name), 'w') as dest:
dest_writer = csv.writer(dest)
for i, group in enumerate(groups):
working_path = os.path.join(args.working, group.names[0])
working_path = os.path.join(working, group.names[0])
sample_name = os.path.basename(working_path)
with open(os.path.join(working_path, file_name), 'r') as source:
source_reader = csv.reader(source)
Expand All @@ -110,14 +153,14 @@ def main():
dest_writer.writerow(row)


def split_files(args):
def split_files(source, working):
working_paths = set()
file_name = 'amino.csv'
file_path = os.path.join(args.source, file_name)
file_path = os.path.join(source, file_name)
with open(file_path) as f:
reader = DictReader(f)
for sample, rows in groupby(reader, itemgetter('sample')):
working_path = os.path.join(args.working, sample)
working_path = os.path.join(working, sample)
working_paths.add(working_path)
if __name__ == '__live_coding__':
if len(working_paths) > 20:
Expand All @@ -135,4 +178,91 @@ def split_files(args):
return working_paths


def find_recent_results(min_run_name, source_folder):
    """ Find finished Results folders for all runs at or after a cutoff.

    :param min_run_name: earliest run folder name to include (runs compare
        as strings, so date-prefixed names like '161201_...' sort correctly)
    :param source_folder: RAW_DATA folder that contains 'MiSeq/runs'
    :return: sorted list of Results/version_X paths that finished processing
    """
    runs_path = os.path.join(source_folder, 'MiSeq', 'runs')
    results_version = 'version_' + pipeline_version
    candidates = []
    for match in glob(os.path.join(runs_path, '*')):
        run_name = os.path.basename(match)
        # Only numeric-prefixed folders are real run folders.
        if run_name[0].isnumeric() and run_name >= min_run_name:
            candidates.append(os.path.join(runs_path,
                                           run_name,
                                           'Results',
                                           results_version))
    candidates.sort()
    return [results_path
            for results_path in candidates
            if os.path.exists(os.path.join(results_path, DONE_PROCESSING))]


def find_sample_results(samples_csv, source_folder):
    """ Find finished Results folders for the runs named in a samples CSV.

    :param samples_csv: open CSV file with a 'run' column; run names are
        either plain run folder names or '<d-Mon-yy[yy]>[.<machine>]' names
    :param source_folder: RAW_DATA folder that contains 'MiSeq/runs'
    :return: list of Results/version_X paths that finished processing,
        in sorted run order
    :raises RuntimeError: if a date-style run name does not match exactly
        one run folder
    """
    run_names = {row['run'] for row in DictReader(samples_csv)}
    run_paths = []
    for run_name in run_names:
        run_path = os.path.join(source_folder, 'MiSeq', 'runs', run_name)
        if os.path.exists(run_path):
            run_paths.append(run_path)
        else:
            # Fall back to date-style names: rebuild the 'yymmdd_machine'
            # folder prefix and require a unique match on disk.
            run_parts = run_name.split('.')
            if len(run_parts) < 2:
                # NOTE(review): presumably the default machine id - confirm.
                run_parts.append('M01841')
            # Two-digit years only appear in 9-character dates like
            # '01-Dec-16'; everything else is parsed with a 4-digit year.
            format_string = '%d-%b-%y' if len(run_parts[0]) == 9 else '%d-%b-%Y'
            run_date = datetime.strptime(run_parts[0], format_string)
            base_run_name = run_date.strftime('%y%m%d') + '_' + run_parts[1]
            pattern = os.path.join(source_folder,
                                   'MiSeq',
                                   'runs',
                                   base_run_name+'*',
                                   NEEDS_PROCESSING)
            matches = glob(pattern)
            if len(matches) != 1:
                raise RuntimeError('Expected one match for {}, but found: {}'.format(
                    pattern,
                    matches))
            run_paths.append(os.path.dirname(matches[0]))
    run_paths.sort()
    results_folders = []
    for run_path in run_paths:
        # run_path is already a full path. The old code re-joined
        # source_folder/'MiSeq'/'runs' in front of it, which os.path.join
        # only discards when run_path is absolute - a relative source_folder
        # produced a doubled, nonexistent path. Join onto run_path directly.
        done_processing_file = os.path.join(run_path,
                                            'Results',
                                            'version_' + pipeline_version,
                                            DONE_PROCESSING)
        if os.path.exists(done_processing_file):
            results_folders.append(os.path.dirname(done_processing_file))
        else:
            print('!! Results not found:', done_processing_file)
    return results_folders


def clear_working_folder(working):
    """ Delete every entry in the working folder except 'rerun_results'.

    Files are removed directly; subfolders are removed recursively.
    """
    for entry in os.listdir(working):
        if entry == 'rerun_results':
            continue
        entry_path = os.path.join(working, entry)
        if os.path.isdir(entry_path):
            shutil.rmtree(entry_path)
        else:
            os.remove(entry_path)


def main():
    """ Entry point: pick result folders from the chosen source, rerun each.

    parse_args() guarantees exactly one of --results, --min_run_name, or
    --samples_csv was supplied (counted with 'is not None'), so exactly one
    branch below applies.
    """
    args = parse_args()
    if args.results is not None:
        results_folders = [args.results]
    elif args.min_run_name is not None:
        # Must match parse_args()'s 'is not None' counting: a truthiness
        # check here sent an empty --min_run_name string into the
        # samples_csv branch, which then crashed on a None file.
        results_folders = find_recent_results(args.min_run_name, args.raw_data)
    else:
        results_folders = find_sample_results(args.samples_csv, args.raw_data)
    # Start from a clean slate of published results for this rerun.
    rerun_results_path = os.path.join(args.working, 'rerun_results')
    shutil.rmtree(rerun_results_path, ignore_errors=True)
    for results_folder in results_folders:
        clear_working_folder(args.working)
        genreport_rerun(results_folder, args.working)
    clear_working_folder(args.working)


main()

0 comments on commit 736020d

Please sign in to comment.