stats: fix --output delimiter inferencing based on file extension #2065

Merged
merged 4 commits on Aug 18, 2024
57 changes: 41 additions & 16 deletions src/cmd/stats.rs
@@ -207,7 +207,7 @@ Common options:
-n, --no-headers When set, the first row will NOT be interpreted
as column names. i.e., They will be included
in statistics.
-d, --delimiter <arg> The field delimiter for reading CSV data.
-d, --delimiter <arg> The field delimiter for READING CSV data.
Must be a single character. (default: ,)
--memcheck Check if there is enough memory to load the entire
CSV into memory using CONSERVATIVE heuristics.
@@ -255,7 +255,7 @@ use threadpool::ThreadPool;

use self::FieldType::{TDate, TDateTime, TFloat, TInteger, TNull, TString};
use crate::{
config::{Config, Delimiter},
config::{get_delim_by_extension, Config, Delimiter},
select::{SelectColumns, Selection},
util, CliResult,
};
@@ -368,6 +368,8 @@ pub enum JsonTypes {
String,
}

// we use this to serialize the StatsData data structure
// to a JSONL file using serde_json
pub static STATSDATA_TYPES_ARRAY: [JsonTypes; 34] = [
JsonTypes::String, //field
JsonTypes::String, //type
@@ -491,14 +493,37 @@ pub fn run(argv: &[&str]) -> CliResult<()> {
} else {
NamedTempFile::new()?
};
let stats_csv_tempfile_fname = stats_csv_tempfile.path().to_str().unwrap().to_owned();

// find the delimiter to use based on the extension of the output file
let mut snappy = false;
let (output_extension, output_delim) = if let Some(ref output_path) = args.flag_output {
let output_path = Path::new(&output_path);
get_delim_by_extension(output_path, &mut snappy, b',')
} else {
(String::new(), b',')
};
// construct the output temp filename suffix
let stats_csv_tempfile_extension = match output_extension.as_str() {
"tsv" => ".tsv",
"tab" => ".tab",
"ssv" => ".ssv",
_ => ".csv",
};
let stats_csv_tempfile_fname = format!(
"{stem}{prime_ext}{snappy_ext}",
stem = stats_csv_tempfile.path().to_str().unwrap(),
prime_ext = stats_csv_tempfile_extension,
snappy_ext = if snappy { ".sz" } else { "" }
);

// we will write the stats to a temp file
let mut wtr = Config::new(&Some(stats_csv_tempfile_fname.clone())).writer()?;
let mut fconfig = args.rconfig();
let wconfig = Config::new(&Some(stats_csv_tempfile_fname.clone()))
.delimiter(Some(Delimiter(output_delim)));
let mut wtr = wconfig.writer()?;
let mut rconfig = args.rconfig();
let mut stdin_tempfile_path = None;

if fconfig.is_stdin() {
if rconfig.is_stdin() {
// read from stdin and write to a temp file
log::info!("Reading from stdin");
let mut stdin_file = NamedTempFile::new()?;
@@ -511,10 +536,10 @@ pub fn run(argv: &[&str]) -> CliResult<()> {
.or(Err("Cannot keep temporary file".to_string()))?;
stdin_tempfile_path = Some(tempfile_path.clone());
args.arg_input = Some(tempfile_path.to_string_lossy().to_string());
fconfig.path = Some(tempfile_path);
rconfig.path = Some(tempfile_path);
} else {
// check if the input file exists
if let Some(path) = fconfig.path.clone() {
if let Some(path) = rconfig.path.clone() {
if !path.exists() {
return fail_clierror!("File {:?} does not exist", path.display());
}
@@ -527,7 +552,7 @@ pub fn run(argv: &[&str]) -> CliResult<()> {

let write_stats_jsonl = args.flag_stats_jsonl;

if let Some(path) = fconfig.path.clone() {
if let Some(path) = rconfig.path.clone() {
let path_file_stem = path.file_stem().unwrap().to_str().unwrap();
let stats_file = stats_path(&path, false)?;
// check if <FILESTEM>.stats.csv file already exists.
@@ -630,15 +655,15 @@ pub fn run(argv: &[&str]) -> CliResult<()> {
// check if flag_cache_threshold is a negative number,
// if so, set the autoindex_size to absolute of the number
if args.flag_cache_threshold.is_negative() {
fconfig.autoindex_size = args.flag_cache_threshold.unsigned_abs() as u64;
rconfig.autoindex_size = args.flag_cache_threshold.unsigned_abs() as u64;
autoindex_set = true;
}

// we need to count the number of records in the file to calculate sparsity
let record_count = RECORD_COUNT.get_or_init(|| util::count_rows(&fconfig).unwrap());
let record_count = RECORD_COUNT.get_or_init(|| util::count_rows(&rconfig).unwrap());
log::info!("scanning {record_count} records...");

let (headers, stats) = match fconfig.indexed()? {
let (headers, stats) = match rconfig.indexed()? {
None => args.sequential_stats(&args.flag_dates_whitelist),
Some(idx) => {
let idx_count = idx.count();
@@ -701,15 +726,15 @@ pub fn run(argv: &[&str]) -> CliResult<()> {
stats_csv_tempfile_fname
} else {
// we didn't compute the stats, re-use the existing stats file
stats_path(fconfig.path.as_ref().unwrap(), false)?
stats_path(rconfig.path.as_ref().unwrap(), false)?
.to_str()
.unwrap()
.to_owned()
};

if fconfig.is_stdin() {
if rconfig.is_stdin() {
// if we read from stdin, copy the temp stats file to "stdin.stats.csv"
let mut stats_pathbuf = stats_path(fconfig.path.as_ref().unwrap(), true)?;
let mut stats_pathbuf = stats_path(rconfig.path.as_ref().unwrap(), true)?;
fs::copy(currstats_filename.clone(), stats_pathbuf.clone())?;

// save the stats args to "stdin.stats.csv.json"
@@ -718,7 +743,7 @@ pub fn run(argv: &[&str]) -> CliResult<()> {
stats_pathbuf,
serde_json::to_string_pretty(&current_stats_args).unwrap(),
)?;
} else if let Some(path) = fconfig.path {
} else if let Some(path) = rconfig.path {
// if we read from a file, copy the temp stats file to "<FILESTEM>.stats.csv"
let mut stats_pathbuf = path.clone();
stats_pathbuf.set_extension("stats.csv");
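For illustration, a minimal standalone sketch (not part of the PR; temp_suffix is a name invented here) of the suffix logic the hunk above adds: the temporary stats file is named after the --output extension, so the delimiter later inferred from the temp file's name matches the delimiter of the final output file.

// Hypothetical helper mirroring the `match output_extension.as_str()` block above.
fn temp_suffix(output_extension: &str, snappy: bool) -> String {
    let prime_ext = match output_extension {
        "tsv" => ".tsv",
        "tab" => ".tab",
        "ssv" => ".ssv",
        // anything else (including "csv", "sz" and an empty extension) stays CSV
        _ => ".csv",
    };
    // a Snappy-compressed output gets an extra ".sz" suffix
    let snappy_ext = if snappy { ".sz" } else { "" };
    format!("{prime_ext}{snappy_ext}")
}

fn main() {
    assert_eq!(temp_suffix("tsv", false), ".tsv");  // --output stats.tsv
    assert_eq!(temp_suffix("ssv", false), ".ssv");  // --output stats.ssv
    assert_eq!(temp_suffix("sz", true), ".csv.sz"); // --output stats.csv.sz
    assert_eq!(temp_suffix("", false), ".csv");     // no --output: default comma CSV
}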
62 changes: 34 additions & 28 deletions src/config.rs
@@ -127,34 +127,9 @@ impl Config {
Some(ref s) if &**s == "-" => (None, default_delim, false),
Some(ref s) => {
let path = PathBuf::from(s);
let file_extension = path
.extension()
.unwrap_or_default()
.to_str()
.unwrap()
.to_ascii_lowercase();
let mut snappy = false;
let delim = if file_extension == "tsv" || file_extension == "tab" {
b'\t'
} else if file_extension == "ssv" {
b';'
} else if file_extension == "csv" {
b','
} else {
let filename = path.file_name().unwrap().to_str().unwrap();
if filename.ends_with(".csv.sz") {
snappy = true;
b','
} else if filename.ends_with(".tsv.sz") || filename.ends_with(".tab.sz") {
snappy = true;
b'\t'
} else if filename.ends_with(".ssv.sz") {
snappy = true;
b';'
} else {
default_delim
}
};
let mut snappy: bool = false;
let (file_extension, delim) =
get_delim_by_extension(&path, &mut snappy, default_delim);
(Some(path), delim, snappy || file_extension.ends_with("sz"))
},
};
@@ -579,3 +554,34 @@ impl Config {
.from_writer(wtr)
}
}

pub fn get_delim_by_extension(path: &Path, snappy: &mut bool, default_delim: u8) -> (String, u8) {
let file_extension = path
.extension()
.unwrap_or_default()
.to_str()
.unwrap()
.to_ascii_lowercase();
let delim = if file_extension == "tsv" || file_extension == "tab" {
b'\t'
} else if file_extension == "ssv" {
b';'
} else if file_extension == "csv" {
b','
} else {
let filename = path.file_name().unwrap().to_str().unwrap();
if filename.ends_with(".csv.sz") {
*snappy = true;
b','
} else if filename.ends_with(".tsv.sz") || filename.ends_with(".tab.sz") {
*snappy = true;
b'\t'
} else if filename.ends_with(".ssv.sz") {
*snappy = true;
b';'
} else {
default_delim
}
};
(file_extension, delim)
}
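As a quick usage illustration (not part of the PR; it assumes the function above is in scope, e.g. via crate::config::get_delim_by_extension), the assertions below restate the mappings visible in the function body:

use std::path::Path;

fn demo_get_delim_by_extension() {
    // plain extensions map directly to a delimiter; the snappy flag stays false
    let mut snappy = false;
    assert_eq!(
        get_delim_by_extension(Path::new("out.tsv"), &mut snappy, b','),
        ("tsv".to_string(), b'\t')
    );
    assert!(!snappy);

    // Snappy-compressed variants keep the inner extension's delimiter
    // and set the `snappy` out-parameter to true
    let mut snappy = false;
    assert_eq!(
        get_delim_by_extension(Path::new("out.tsv.sz"), &mut snappy, b','),
        ("sz".to_string(), b'\t')
    );
    assert!(snappy);

    // unknown extensions fall back to the caller-supplied default delimiter
    let mut snappy = false;
    assert_eq!(
        get_delim_by_extension(Path::new("out.dat"), &mut snappy, b'|'),
        ("dat".to_string(), b'|')
    );
}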
98 changes: 98 additions & 0 deletions tests/test_stats.rs
@@ -1226,6 +1226,104 @@ fn stats_zero_cv() {
assert_eq!(got, expected);
}

#[test]
fn stats_output_tab_delimited() {
let wrk = Workdir::new("stats_output_tab_delimited");

wrk.create(
"data.csv",
vec![
svec!["col1", "col2", "col3"],
svec!["1", "4321", "01"],
svec!["2", "3210", "02"],
svec!["3", "2101", "03"],
svec!["4", "1012", "04"],
svec!["5", "0", "10"],
],
);

let out_file = wrk.path("output.tab").to_string_lossy().to_string();

let mut cmd = wrk.command("stats");
cmd.arg("data.csv").args(["--output", &out_file]);

wrk.assert_success(&mut cmd);

let got = std::fs::read_to_string(out_file).unwrap();
let expected = r#"field type is_ascii sum min max range min_length max_length mean sem stddev variance cv nullcount max_precision sparsity
col1 Integer 15 1 5 4 1 1 3 0.6325 1.4142 2 47.1405 0 0
col2 Integer 10644 0 4321 4321 1 4 2128.8 685.6979 1533.267 2350907.76 72.0249 0 0
col3 String true 01 10 2 2 0 0
"#;
assert_eq!(got, expected);
}

#[test]
fn stats_output_ssv_delimited() {
let wrk = Workdir::new("stats_output_ssv_delimited");

wrk.create(
"data.csv",
vec![
svec!["col1", "col2", "col3"],
svec!["1", "4321", "01"],
svec!["2", "3210", "02"],
svec!["3", "2101", "03"],
svec!["4", "1012", "04"],
svec!["5", "0", "10"],
],
);

let out_file = wrk.path("output.ssv").to_string_lossy().to_string();

let mut cmd = wrk.command("stats");
cmd.arg("data.csv").args(["--output", &out_file]);

wrk.assert_success(&mut cmd);

let got = std::fs::read_to_string(out_file).unwrap();
let expected = r#"field;type;is_ascii;sum;min;max;range;min_length;max_length;mean;sem;stddev;variance;cv;nullcount;max_precision;sparsity
col1;Integer;;15;1;5;4;1;1;3;0.6325;1.4142;2;47.1405;0;;0
col2;Integer;;10644;0;4321;4321;1;4;2128.8;685.6979;1533.267;2350907.76;72.0249;0;;0
col3;String;true;;01;10;;2;2;;;;;;0;;0
"#;
assert_eq!(got, expected);
}

#[test]
fn stats_output_csvsz_delimited() {
let wrk = Workdir::new("stats_output_csvsz_delimited");

wrk.create(
"data.csv",
vec![
svec!["col1", "col2", "col3"],
svec!["1", "4321", "01"],
svec!["2", "3210", "02"],
svec!["3", "2101", "03"],
svec!["4", "1012", "04"],
svec!["5", "0", "10"],
],
);

let out_file = wrk.path("output.csv.sz").to_string_lossy().to_string();

let mut cmd = wrk.command("stats");
cmd.arg("data.csv").args(["--output", &out_file]);

wrk.assert_success(&mut cmd);

let mut cmd = wrk.command("snappy");
cmd.arg("decompress").arg(out_file.clone());

let got: String = wrk.stdout(&mut cmd);
let expected = "field,type,is_ascii,sum,min,max,range,min_length,max_length,mean,sem,stddev,\
variance,cv,nullcount,max_precision,sparsity\ncol1,Integer,,15,1,5,4,1,1,3,0.\
6325,1.4142,2,47.1405,0,,0\ncol2,Integer,,10644,0,4321,4321,1,4,2128.8,685.\
6979,1533.267,2350907.76,72.0249,0,,0\ncol3,String,true,,01,10,,2,2,,,,,,0,,0";
assert_eq!(got, expected);
}

mod stats_infer_nothing {
// Only test CSV data with headers.
// Empty CSV data with no headers won't produce any statistical analysis.