[MRG] add generic support for gzipped and zipfile CSVs (#2195)
* add FileInputCSV

* support zipfile taxonomies

* rename 'default_zip_name' to 'default_csvname'

* hack in support for CSVs with version info

* more tests, support for ZipFile objs being passed in

* sourmash sketch now accepts gzipped CSV files

* picklists can now be gzipped

* sig check now supports gz input file

* add test for sig check with gz picklist

* add delimiter support

* update comment

* fix comment

* update comment

* update comment

* remove SOURMASH-TAXONOMY.csv

* add explicit tests for FileOutputCSV

* add gzip output tests for compare, plot, gather

* change prefetch to use FileOutputCSV

* fix gather --save-prefetch-csv to use FileOutputCSV

* fix manifests to read/write gzip

* add test for sig describe -o csv.gz

* more tests
ctb authored Aug 15, 2022
1 parent 010718c commit 0c093c1
Showing 15 changed files with 731 additions and 41 deletions.
4 changes: 1 addition & 3 deletions src/sourmash/command_sketch.py
@@ -426,9 +426,7 @@ def fromfile(args):
     n_duplicate_name = 0
 
     for csvfile in args.csvs:
-        with open(csvfile, newline="") as fp:
-            r = csv.DictReader(fp)
-
+        with sourmash_args.FileInputCSV(csvfile) as r:
             for row in r:
                 name = row['name']
                 if not name:
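The change above shows the pattern that recurs throughout this commit: a bare open() + csv.DictReader pair is replaced by the new sourmash_args.FileInputCSV context manager, which yields a DictReader-like object whether the file is plain text or gzip-compressed. A minimal sketch of the calling convention, using a hypothetical CSV with a 'name' column ('sketches.csv.gz' is an illustrative filename, not one from this commit):

    from sourmash import sourmash_args

    # the same call handles sketches.csv and sketches.csv.gz
    with sourmash_args.FileInputCSV('sketches.csv.gz') as r:
        for row in r:
            print(row['name'])
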
4 changes: 2 additions & 2 deletions src/sourmash/commands.py
@@ -723,7 +723,7 @@ def gather(args):
     prefetch_csvout_fp = None
     prefetch_csvout_w = None
     if args.save_prefetch_csv:
-        prefetch_csvout_fp = FileOutput(args.save_prefetch_csv, 'wt').open()
+        prefetch_csvout_fp = FileOutputCSV(args.save_prefetch_csv).open()
 
     query_mh = prefetch_query.minhash
     scaled = query_mh.scaled
@@ -1229,7 +1229,7 @@ def prefetch(args):
     csvout_fp = None
     csvout_w = None
     if args.output:
-        csvout_fp = FileOutput(args.output, 'wt').open()
+        csvout_fp = FileOutputCSV(args.output).open()
 
     # track & maybe save matches progressively
     matches_out = SaveSignaturesToLocation(args.save_matches)
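On the output side, FileOutputCSV (extended further down in sourmash_args.py) now chooses gzip or plain-text mode from the filename suffix, so the gather and prefetch CSV output can be written straight to a .csv.gz path. A hedged sketch of the writer pattern these commands use; the filename and fieldnames below are illustrative, not the actual gather/prefetch column names:

    import csv
    from sourmash.sourmash_args import FileOutputCSV

    # illustrative output path and columns
    with FileOutputCSV('prefetch-results.csv.gz') as fp:
        w = csv.DictWriter(fp, fieldnames=['query', 'match', 'overlap'])
        w.writeheader()
        w.writerow({'query': 'q1', 'match': 'm1', 'overlap': 1000})
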
2 changes: 2 additions & 0 deletions src/sourmash/lca/command_index.py
@@ -27,6 +27,8 @@ def load_taxonomy_assignments(filename, *, delimiter=',', start_column=2,
     lineage tuples.
     """
     # parse spreadsheet!
+    # CTB note: can't easily switch to FileInputCSV, because of
+    # janky way we do/don't handle headers here. See issue #2198.
     fp = open(filename, newline='')
     r = csv.reader(fp, delimiter=delimiter)
     row_headers = ['identifiers']
11 changes: 9 additions & 2 deletions src/sourmash/manifest.py
@@ -3,6 +3,7 @@
 """
 import csv
 import ast
+import gzip
 import os.path
 from abc import abstractmethod
 import itertools
@@ -41,7 +42,12 @@ def load_from_filename(cls, filename):
             return db
 
         # not a SQLite db?
-        with open(filename, newline="") as fp:
+        if filename.endswith('.gz'):
+            xopen = gzip.open
+        else:
+            xopen = open
+
+        with xopen(filename, 'rt', newline="") as fp:
             return cls.load_from_csv(fp)
 
     @classmethod
@@ -92,8 +98,9 @@ def load_from_sql(cls, filename):
     def write_to_filename(self, filename, *, database_format='csv',
                           ok_if_exists=False):
         if database_format == 'csv':
+            from .sourmash_args import FileOutputCSV
             if ok_if_exists or not os.path.exists(filename):
-                with open(filename, "w", newline="") as fp:
+                with FileOutputCSV(filename) as fp:
                     return self.write_to_csv(fp, write_header=True)
             elif os.path.exists(filename) and not ok_if_exists:
                 raise Exception("output manifest already exists")
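With these two hunks, a manifest round-trips through gzip transparently: load_from_filename() picks gzip.open for a '.gz' suffix, and write_to_filename() delegates compression to FileOutputCSV. A rough sketch of the intended behavior; the filenames are hypothetical and 'orig-manifest.csv' stands in for any existing manifest CSV:

    from sourmash.manifest import CollectionManifest

    # load an existing, uncompressed manifest...
    manifest = CollectionManifest.load_from_filename('orig-manifest.csv')

    # ...write it back out gzip-compressed (the suffix selects compression)...
    manifest.write_to_filename('manifest.csv.gz', ok_if_exists=True)

    # ...and read the compressed copy the same way as before
    m2 = CollectionManifest.load_from_filename('manifest.csv.gz')
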
16 changes: 6 additions & 10 deletions src/sourmash/picklist.py
@@ -142,24 +142,20 @@ def init(self, values=[]):
 
     def load(self, pickfile, column_name, *, allow_empty=False):
         "load pickset, return num empty vals, and set of duplicate vals."
+        from . import sourmash_args
+
         pickset = self.init()
 
         if not os.path.exists(pickfile) or not os.path.isfile(pickfile):
            raise ValueError(f"pickfile '{pickfile}' must exist and be a regular file")
 
         n_empty_val = 0
         dup_vals = set()
-        with open(pickfile, newline='') as csvfile:
-            self.pickfile = pickfile
-            x = csvfile.readline()
-
-            # skip leading comment line in case there's a manifest header
-            if not x or x[0] == '#':
-                pass
-            else:
-                csvfile.seek(0)
-
-            r = csv.DictReader(csvfile)
+        # CTB: not clear to me what a good "default" name would be for a
+        # picklist CSV inside a zip (default_csv_name). Maybe manifest?
+        with sourmash_args.FileInputCSV(pickfile) as r:
+            self.pickfile = pickfile
             if not r.fieldnames:
                 if not allow_empty:
                     raise ValueError(f"empty or improperly formatted pickfile '{pickfile}'")
3 changes: 1 addition & 2 deletions src/sourmash/sig/__main__.py
@@ -1366,9 +1366,8 @@ def check(args):
     # go through the input file and pick out missing rows.
     n_input = 0
     n_output = 0
-    with open(pickfile, newline='') as csvfp:
-        r = csv.DictReader(csvfp)
-
+    with sourmash_args.FileInputCSV(pickfile) as r:
         with open(args.output_missing, "w", newline='') as outfp:
             w = csv.DictWriter(outfp, fieldnames=r.fieldnames)
             w.writeheader()
119 changes: 117 additions & 2 deletions src/sourmash/sourmash_args.py
@@ -37,11 +37,14 @@
 """
 import sys
 import os
+import csv
 from enum import Enum
 import traceback
 import gzip
-from io import StringIO
+from io import StringIO, TextIOWrapper
 import re
+import zipfile
+import contextlib
 
 import screed
 import sourmash
@@ -651,10 +654,122 @@ def __init__(self, filename):
     def open(self):
         if self.filename == '-' or self.filename is None:
             return sys.stdout
-        self.fp = open(self.filename, 'w', newline='')
+        if self.filename.endswith('.gz'):
+            self.fp = gzip.open(self.filename, 'wt', newline='')
+        else:
+            self.fp = open(self.filename, 'w', newline='')
         return self.fp
 
 
+class _DictReader_with_version:
+    """A version of csv.DictReader that allows a comment line with a version,
+    e.g.
+    # SOURMASH-MANIFEST-VERSION: 1.0
+    The version is stored as a 2-tuple in the 'version_info' attribute.
+    """
+    def __init__(self, textfp, *, delimiter=','):
+        self.version_info = []
+
+        # is there a '#' in the raw buffer pos 0?
+        ch = textfp.buffer.peek(1)
+
+        try:
+            ch = ch.decode('utf-8')
+        except UnicodeDecodeError:
+            raise csv.Error("unable to read CSV file")
+
+        # yes - read a line from the text buffer => parse
+        if ch.startswith('#'):
+            line = textfp.readline()
+            assert line.startswith('# '), line
+
+            # note, this can set version_info to lots of different things.
+            # revisit later, I guess. CTB.
+            self.version_info = line[2:].strip().split(': ', 2)
+
+        # build a DictReader from the remaining stream
+        self.reader = csv.DictReader(textfp, delimiter=delimiter)
+        self.fieldnames = self.reader.fieldnames
+
+    def __iter__(self):
+        for row in self.reader:
+            yield row
+
+
+@contextlib.contextmanager
+def FileInputCSV(filename, *, encoding='utf-8', default_csv_name=None,
+                 zipfile_obj=None, delimiter=','):
+    """A context manager for reading in CSV files in gzip, zip or text format.
+    Assumes comma delimiter, and uses csv.DictReader.
+    Note: does not support stdin.
+    Note: it seems surprisingly hard to write code that generically handles
+    any file handle being passed in; the manifest loading code, in particular,
+    uses ZipStorage.load => StringIO obj, which doesn't support peek etc.
+    So for now, this context manager is focused on situations where it owns
+    the file handle (opens/closes the file).
+    """
+    fp = None
+
+    if zipfile_obj and not default_csv_name:
+        raise ValueError("must provide default_csv_name with a zipfile_obj")
+
+    # first, try to load 'default_csv_name' from a zipfile:
+    if default_csv_name:
+        # were we given a zipfile obj?
+        if zipfile_obj:
+            try:
+                zi = zipfile_obj.getinfo(default_csv_name)
+                with zipfile_obj.open(zi) as fp:
+                    textfp = TextIOWrapper(fp,
+                                           encoding=encoding,
+                                           newline="")
+                    r = _DictReader_with_version(textfp, delimiter=delimiter)
+                    yield r
+            except (zipfile.BadZipFile, KeyError):
+                pass            # uh oh, we were given a zipfile_obj and it FAILED.
+
+            # no matter what, if given zipfile_obj don't try .gz or regular csv
+            return
+        else:
+            try:
+                with zipfile.ZipFile(filename, 'r') as zip_fp:
+                    zi = zip_fp.getinfo(default_csv_name)
+                    with zip_fp.open(zi) as fp:
+                        textfp = TextIOWrapper(fp,
+                                               encoding=encoding,
+                                               newline="")
+                        r = _DictReader_with_version(textfp, delimiter=delimiter)
+                        yield r
+
+                # if we got this far with no exceptions, we found
+                # the CSV in the zip file. exit generator!
+                return
+            except (zipfile.BadZipFile, KeyError):
+                # no zipfile_obj => it's ok to continue onwards to .gz
+                # and regular CSV.
+                pass
+
+    # ok, not a zip file - try .gz:
+    try:
+        with gzip.open(filename, "rt", newline="", encoding=encoding) as fp:
+            fp.buffer.peek(1)   # force exception if not a gzip file
+            r = _DictReader_with_version(fp, delimiter=delimiter)
+            yield r
+            return
+    except gzip.BadGzipFile:
+        pass
+
+    # neither zip nor gz; regular file!
+    with open(filename, 'rt', newline="", encoding=encoding) as fp:
+        r = _DictReader_with_version(fp, delimiter=delimiter)
+        yield r
+
+
 class SignatureLoadingProgress:
     """A wrapper for signature loading progress reporting.
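Taken together, FileInputCSV gives callers a single entry point for plain, gzipped, and zip-embedded CSVs, with any leading "# KEY: VALUE" comment line exposed via the reader's version_info attribute. A sketch of the three call styles the code above supports; the filenames and the 'SOURMASH-MANIFEST.csv' member name are illustrative, and the version_info value assumes the CSV starts with a '# SOURMASH-MANIFEST-VERSION: 1.0' line:

    import zipfile
    from sourmash.sourmash_args import FileInputCSV

    # 1. plain or gzipped CSV -- the format is detected automatically
    with FileInputCSV('results.csv.gz') as r:
        rows = list(r)

    # 2. a CSV stored inside a zip archive, looked up by member name
    with FileInputCSV('collection.zip',
                      default_csv_name='SOURMASH-MANIFEST.csv') as r:
        print(r.version_info)   # e.g. ['SOURMASH-MANIFEST-VERSION', '1.0']
        rows = list(r)

    # 3. an already-open ZipFile object (default_csv_name is required here)
    with zipfile.ZipFile('collection.zip') as zf:
        with FileInputCSV('collection.zip',
                          default_csv_name='SOURMASH-MANIFEST.csv',
                          zipfile_obj=zf) as r:
            rows = list(r)
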
16 changes: 3 additions & 13 deletions src/sourmash/tax/tax_utils.py
@@ -7,7 +7,7 @@
 from collections import abc
 import gzip
 
-from sourmash import sqlite_utils
+from sourmash import sqlite_utils, sourmash_args
 from sourmash.exceptions import IndexNotSupported
 from sourmash.distance_utils import containment_to_distance
 
@@ -642,17 +642,7 @@ def load(cls, filename, *, delimiter=',', force=False,
         if os.path.isdir(filename):
             raise ValueError(f"'{filename}' is a directory")
 
-        xopen = open
-        try:
-            with gzip.open(filename, 'r') as fp:
-                # succesful open/read? use gzip
-                fp.read(1)
-                xopen = gzip.open
-        except gzip.BadGzipFile:
-            pass
-
-        with xopen(filename, 'rt', newline='') as fp:
-            r = csv.DictReader(fp, delimiter=delimiter)
+        with sourmash_args.FileInputCSV(filename) as r:
             header = r.fieldnames
             if not header:
                 raise ValueError(f'cannot read taxonomy assignments from {filename}')
@@ -1042,7 +1032,7 @@ def load(cls, locations, **kwargs):
             try:
                 this_tax_assign = LineageDB.load(location, **kwargs)
                 loaded = True
-            except ValueError as exc:
+            except (ValueError, csv.Error) as exc:
                 # for the last loader, just pass along ValueError...
                 raise ValueError(f"cannot read taxonomy assignments from '{location}': {str(exc)}")
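LineageDB.load() now reads its spreadsheet through FileInputCSV as well, so gzipped taxonomy CSVs keep loading through the shared helper, and the multi-database loader catches csv.Error (raised by the new reader on undecodable input) alongside ValueError. A minimal sketch, assuming a hypothetical taxonomy file with an 'ident' column plus lineage rank columns:

    from sourmash.tax.tax_utils import LineageDB

    # 'gtdb-taxonomy.csv.gz' is an illustrative filename
    tax_assign = LineageDB.load('gtdb-taxonomy.csv.gz')
    print(len(tax_assign), "identifiers loaded")
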
