[MRG] add generic support for gzipped and zipfile CSVs (#2195)
* add FileInputCSV

* support zipfile taxonomies

* rename 'default_zip_name' to 'default_csvname'

* hack in support for CSVs with version info

* more tests, support for ZipFile objs being passed in

* sourmash sketch now accepts gzipped CSV files

* picklists can now be gzipped

* sig check now supports gz input file

* add test for sig check with gz picklist

* add delimiter support

* update comment

* fix comment

* update comment

* update comment

* remove SOURMASH-TAXONOMY.csv

* add explicit tests for FileOutputCSV

* add gzip output tests for compare, plot, gather

* change prefetch to use FileOutputCSV

* fix gather --save-prefetch-csv to use FileOutputCSV

* fix manifests to read/write gzip

* add test for sig describe -o csv.gz

* more tests
ctb authored Aug 15, 2022
1 parent 010718c commit 0c093c1
Showing 15 changed files with 731 additions and 41 deletions.
4 changes: 1 addition & 3 deletions src/sourmash/command_sketch.py
@@ -426,9 +426,7 @@ def fromfile(args):
     n_duplicate_name = 0
 
     for csvfile in args.csvs:
-        with open(csvfile, newline="") as fp:
-            r = csv.DictReader(fp)
-
+        with sourmash_args.FileInputCSV(csvfile) as r:
             for row in r:
                 name = row['name']
                 if not name:
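The change above shows the pattern that recurs throughout this commit: a bare open() + csv.DictReader pair is replaced by the new sourmash_args.FileInputCSV context manager, which yields a DictReader-like object whether the file is plain text or gzip-compressed. A minimal sketch of the calling convention, using a hypothetical CSV with a 'name' column ('sketches.csv.gz' is an illustrative filename, not one from this commit):

    from sourmash import sourmash_args

    # the same call handles sketches.csv and sketches.csv.gz
    with sourmash_args.FileInputCSV('sketches.csv.gz') as r:
        for row in r:
            print(row['name'])
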
4 changes: 2 additions & 2 deletions src/sourmash/commands.py
@@ -723,7 +723,7 @@ def gather(args):
     prefetch_csvout_fp = None
     prefetch_csvout_w = None
     if args.save_prefetch_csv:
-        prefetch_csvout_fp = FileOutput(args.save_prefetch_csv, 'wt').open()
+        prefetch_csvout_fp = FileOutputCSV(args.save_prefetch_csv).open()
 
     query_mh = prefetch_query.minhash
     scaled = query_mh.scaled
@@ -1229,7 +1229,7 @@ def prefetch(args):
     csvout_fp = None
     csvout_w = None
     if args.output:
-        csvout_fp = FileOutput(args.output, 'wt').open()
+        csvout_fp = FileOutputCSV(args.output).open()
 
     # track & maybe save matches progressively
     matches_out = SaveSignaturesToLocation(args.save_matches)
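On the output side, FileOutputCSV (extended further down in sourmash_args.py) now chooses gzip or plain-text mode from the filename suffix, so the gather and prefetch CSV output can be written straight to a .csv.gz path. A hedged sketch of the writer pattern these commands use; the filename and fieldnames below are illustrative, not the actual gather/prefetch column names:

    import csv
    from sourmash.sourmash_args import FileOutputCSV

    # illustrative output path and columns
    with FileOutputCSV('prefetch-results.csv.gz') as fp:
        w = csv.DictWriter(fp, fieldnames=['query', 'match', 'overlap'])
        w.writeheader()
        w.writerow({'query': 'q1', 'match': 'm1', 'overlap': 1000})
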
2 changes: 2 additions & 0 deletions src/sourmash/lca/command_index.py
@@ -27,6 +27,8 @@ def load_taxonomy_assignments(filename, *, delimiter=',', start_column=2,
     lineage tuples.
     """
     # parse spreadsheet!
+    # CTB note: can't easily switch to FileInputCSV, because of
+    # janky way we do/don't handle headers here. See issue #2198.
     fp = open(filename, newline='')
     r = csv.reader(fp, delimiter=delimiter)
     row_headers = ['identifiers']
11 changes: 9 additions & 2 deletions src/sourmash/manifest.py
@@ -3,6 +3,7 @@
 """
 import csv
 import ast
+import gzip
 import os.path
 from abc import abstractmethod
 import itertools
@@ -41,7 +42,12 @@ def load_from_filename(cls, filename):
             return db
 
         # not a SQLite db?
-        with open(filename, newline="") as fp:
+        if filename.endswith('.gz'):
+            xopen = gzip.open
+        else:
+            xopen = open
+
+        with xopen(filename, 'rt', newline="") as fp:
             return cls.load_from_csv(fp)
 
     @classmethod
@@ -92,8 +98,9 @@ def load_from_sql(cls, filename):
     def write_to_filename(self, filename, *, database_format='csv',
                           ok_if_exists=False):
         if database_format == 'csv':
+            from .sourmash_args import FileOutputCSV
             if ok_if_exists or not os.path.exists(filename):
-                with open(filename, "w", newline="") as fp:
+                with FileOutputCSV(filename) as fp:
                     return self.write_to_csv(fp, write_header=True)
             elif os.path.exists(filename) and not ok_if_exists:
                 raise Exception("output manifest already exists")
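With these two hunks, a manifest round-trips through gzip transparently: load_from_filename() picks gzip.open for a '.gz' suffix, and write_to_filename() delegates compression to FileOutputCSV. A rough sketch of the intended behavior; the filenames are hypothetical and 'orig-manifest.csv' stands in for any existing manifest CSV:

    from sourmash.manifest import CollectionManifest

    # load an existing, uncompressed manifest...
    manifest = CollectionManifest.load_from_filename('orig-manifest.csv')

    # ...write it back out gzip-compressed (the suffix selects compression)...
    manifest.write_to_filename('manifest.csv.gz', ok_if_exists=True)

    # ...and read the compressed copy the same way as before
    m2 = CollectionManifest.load_from_filename('manifest.csv.gz')
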
16 changes: 6 additions & 10 deletions src/sourmash/picklist.py
@@ -142,24 +142,20 @@ def init(self, values=[]):
 
     def load(self, pickfile, column_name, *, allow_empty=False):
         "load pickset, return num empty vals, and set of duplicate vals."
+        from . import sourmash_args
+
         pickset = self.init()
 
         if not os.path.exists(pickfile) or not os.path.isfile(pickfile):
            raise ValueError(f"pickfile '{pickfile}' must exist and be a regular file")
 
         n_empty_val = 0
         dup_vals = set()
-        with open(pickfile, newline='') as csvfile:
-            self.pickfile = pickfile
-            x = csvfile.readline()
-
-            # skip leading comment line in case there's a manifest header
-            if not x or x[0] == '#':
-                pass
-            else:
-                csvfile.seek(0)
-
-            r = csv.DictReader(csvfile)
+        # CTB: not clear to me what a good "default" name would be for a
+        # picklist CSV inside a zip (default_csv_name). Maybe manifest?
+        with sourmash_args.FileInputCSV(pickfile) as r:
+            self.pickfile = pickfile
             if not r.fieldnames:
                 if not allow_empty:
                     raise ValueError(f"empty or improperly formatted pickfile '{pickfile}'")
3 changes: 1 addition & 2 deletions src/sourmash/sig/__main__.py
@@ -1366,9 +1366,8 @@ def check(args):
     # go through the input file and pick out missing rows.
     n_input = 0
     n_output = 0
-    with open(pickfile, newline='') as csvfp:
-        r = csv.DictReader(csvfp)
-
+    with sourmash_args.FileInputCSV(pickfile) as r:
         with open(args.output_missing, "w", newline='') as outfp:
             w = csv.DictWriter(outfp, fieldnames=r.fieldnames)
             w.writeheader()
119 changes: 117 additions & 2 deletions src/sourmash/sourmash_args.py
@@ -37,11 +37,14 @@
 """
 import sys
 import os
+import csv
 from enum import Enum
 import traceback
 import gzip
-from io import StringIO
+from io import StringIO, TextIOWrapper
 import re
+import zipfile
+import contextlib
 
 import screed
 import sourmash
@@ -651,10 +654,122 @@ def __init__(self, filename):
     def open(self):
         if self.filename == '-' or self.filename is None:
             return sys.stdout
-        self.fp = open(self.filename, 'w', newline='')
+        if self.filename.endswith('.gz'):
+            self.fp = gzip.open(self.filename, 'wt', newline='')
+        else:
+            self.fp = open(self.filename, 'w', newline='')
         return self.fp
 
 
+class _DictReader_with_version:
+    """A version of csv.DictReader that allows a comment line with a version,
+    e.g.
+    # SOURMASH-MANIFEST-VERSION: 1.0
+    The version is stored as a 2-tuple in the 'version_info' attribute.
+    """
+    def __init__(self, textfp, *, delimiter=','):
+        self.version_info = []
+
+        # is there a '#' in the raw buffer pos 0?
+        ch = textfp.buffer.peek(1)
+
+        try:
+            ch = ch.decode('utf-8')
+        except UnicodeDecodeError:
+            raise csv.Error("unable to read CSV file")
+
+        # yes - read a line from the text buffer => parse
+        if ch.startswith('#'):
+            line = textfp.readline()
+            assert line.startswith('# '), line
+
+            # note, this can set version_info to lots of different things.
+            # revisit later, I guess. CTB.
+            self.version_info = line[2:].strip().split(': ', 2)
+
+        # build a DictReader from the remaining stream
+        self.reader = csv.DictReader(textfp, delimiter=delimiter)
+        self.fieldnames = self.reader.fieldnames
+
+    def __iter__(self):
+        for row in self.reader:
+            yield row
+
+
+@contextlib.contextmanager
+def FileInputCSV(filename, *, encoding='utf-8', default_csv_name=None,
+                 zipfile_obj=None, delimiter=','):
+    """A context manager for reading in CSV files in gzip, zip or text format.
+    Assumes comma delimiter, and uses csv.DictReader.
+    Note: does not support stdin.
+    Note: it seems surprisingly hard to write code that generically handles
+    any file handle being passed in; the manifest loading code, in particular,
+    uses ZipStorage.load => StringIO obj, which doesn't support peek etc.
+    So for now, this context manager is focused on situations where it owns
+    the file handle (opens/closes the file).
+    """
+    fp = None
+
+    if zipfile_obj and not default_csv_name:
+        raise ValueError("must provide default_csv_name with a zipfile_obj")
+
+    # first, try to load 'default_csv_name' from a zipfile:
+    if default_csv_name:
+        # were we given a zipfile obj?
+        if zipfile_obj:
+            try:
+                zi = zipfile_obj.getinfo(default_csv_name)
+                with zipfile_obj.open(zi) as fp:
+                    textfp = TextIOWrapper(fp,
+                                           encoding=encoding,
+                                           newline="")
+                    r = _DictReader_with_version(textfp, delimiter=delimiter)
+                    yield r
+            except (zipfile.BadZipFile, KeyError):
+                pass            # uh oh, we were given a zipfile_obj and it FAILED.
+
+            # no matter what, if given zipfile_obj don't try .gz or regular csv
+            return
+        else:
+            try:
+                with zipfile.ZipFile(filename, 'r') as zip_fp:
+                    zi = zip_fp.getinfo(default_csv_name)
+                    with zip_fp.open(zi) as fp:
+                        textfp = TextIOWrapper(fp,
+                                               encoding=encoding,
+                                               newline="")
+                        r = _DictReader_with_version(textfp, delimiter=delimiter)
+                        yield r
+
+                # if we got this far with no exceptions, we found
+                # the CSV in the zip file. exit generator!
+                return
+            except (zipfile.BadZipFile, KeyError):
+                # no zipfile_obj => it's ok to continue onwards to .gz
+                # and regular CSV.
+                pass
+
+    # ok, not a zip file - try .gz:
+    try:
+        with gzip.open(filename, "rt", newline="", encoding=encoding) as fp:
+            fp.buffer.peek(1)   # force exception if not a gzip file
+            r = _DictReader_with_version(fp, delimiter=delimiter)
+            yield r
+            return
+    except gzip.BadGzipFile:
+        pass
+
+    # neither zip nor gz; regular file!
+    with open(filename, 'rt', newline="", encoding=encoding) as fp:
+        r = _DictReader_with_version(fp, delimiter=delimiter)
+        yield r
+
+
 class SignatureLoadingProgress:
     """A wrapper for signature loading progress reporting.
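Taken together, FileInputCSV gives callers a single entry point for plain, gzipped, and zip-embedded CSVs, with any leading "# KEY: VALUE" comment line exposed via the reader's version_info attribute. A sketch of the three call styles the code above supports; the filenames and the 'SOURMASH-MANIFEST.csv' member name are illustrative, and the version_info value assumes the CSV starts with a '# SOURMASH-MANIFEST-VERSION: 1.0' line:

    import zipfile
    from sourmash.sourmash_args import FileInputCSV

    # 1. plain or gzipped CSV -- the format is detected automatically
    with FileInputCSV('results.csv.gz') as r:
        rows = list(r)

    # 2. a CSV stored inside a zip archive, looked up by member name
    with FileInputCSV('collection.zip',
                      default_csv_name='SOURMASH-MANIFEST.csv') as r:
        print(r.version_info)   # e.g. ['SOURMASH-MANIFEST-VERSION', '1.0']
        rows = list(r)

    # 3. an already-open ZipFile object (default_csv_name is required here)
    with zipfile.ZipFile('collection.zip') as zf:
        with FileInputCSV('collection.zip',
                          default_csv_name='SOURMASH-MANIFEST.csv',
                          zipfile_obj=zf) as r:
            rows = list(r)
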
16 changes: 3 additions & 13 deletions src/sourmash/tax/tax_utils.py
@@ -7,7 +7,7 @@
 from collections import abc
 import gzip
 
-from sourmash import sqlite_utils
+from sourmash import sqlite_utils, sourmash_args
 from sourmash.exceptions import IndexNotSupported
 from sourmash.distance_utils import containment_to_distance
 
@@ -642,17 +642,7 @@ def load(cls, filename, *, delimiter=',', force=False,
         if os.path.isdir(filename):
             raise ValueError(f"'{filename}' is a directory")
 
-        xopen = open
-        try:
-            with gzip.open(filename, 'r') as fp:
-                # succesful open/read? use gzip
-                fp.read(1)
-                xopen = gzip.open
-        except gzip.BadGzipFile:
-            pass
-
-        with xopen(filename, 'rt', newline='') as fp:
-            r = csv.DictReader(fp, delimiter=delimiter)
+        with sourmash_args.FileInputCSV(filename) as r:
             header = r.fieldnames
             if not header:
                 raise ValueError(f'cannot read taxonomy assignments from {filename}')
@@ -1042,7 +1032,7 @@ def load(cls, locations, **kwargs):
             try:
                 this_tax_assign = LineageDB.load(location, **kwargs)
                 loaded = True
-            except ValueError as exc:
+            except (ValueError, csv.Error) as exc:
                 # for the last loader, just pass along ValueError...
                 raise ValueError(f"cannot read taxonomy assignments from '{location}': {str(exc)}")
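LineageDB.load() now reads its spreadsheet through FileInputCSV as well, so gzipped taxonomy CSVs keep loading through the shared helper, and the multi-database loader catches csv.Error (raised by the new reader on undecodable input) alongside ValueError. A minimal sketch, assuming a hypothetical taxonomy file with an 'ident' column plus lineage rank columns:

    from sourmash.tax.tax_utils import LineageDB

    # 'gtdb-taxonomy.csv.gz' is an illustrative filename
    tax_assign = LineageDB.load('gtdb-taxonomy.csv.gz')
    print(len(tax_assign), "identifiers loaded")
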
