-
Notifications
You must be signed in to change notification settings - Fork 73
/
Copy pathslice.rs
272 lines (236 loc) · 8.95 KB
/
slice.rs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
/// Help text for the `qsv slice` subcommand; `run` passes it to
/// `util::get_args` together with the command-line arguments.
///
/// The examples follow the half-open `[start, end)` convention stated in the
/// first paragraph: `--end N` stops *before* record index `N`, so
/// `--start 0 --end 3` and `--len 3` select the same three records.
static USAGE: &str = r#"
Returns the rows in the range specified (starting at 0, half-open interval).
The range does not include headers.
If the start of the range isn't specified, then the slice starts from the first
record in the CSV data.
If the end of the range isn't specified, then the slice continues to the last
record in the CSV data.
This operation can be made much faster by creating an index with 'qsv index'
first. With an index, the command requires parsing just the rows that are
sliced. Without an index, all rows up to the first row in the slice must be
parsed.
Usage:
qsv slice [options] [<input>]
qsv slice --help
slice options:
-s, --start <arg> The index of the record to slice from.
If negative, starts from the last record.
-e, --end <arg> The index of the record to slice to.
-l, --len <arg> The length of the slice (can be used instead
of --end).
-i, --index <arg> Slice a single record (shortcut for -s N -l 1).
If negative, starts from the last record.
--json Output the result as JSON. Fields are written
as key-value pairs. The key is the column name.
The value is the field value. The output is a
JSON array. If --no-headers is set, then
the keys are the column indices (zero-based).
--invert slice all records EXCEPT those in the specified range.
Examples:
# Slice from the 3rd record to the end
qsv slice --start 2 data.csv
# Slice the first three records
qsv slice --start 0 --end 3 data.csv
qsv slice --len 3 data.csv
qsv slice -l 3 data.csv
# Slice the last record
qsv slice -s -1 data.csv
# Slice the last 10 records
qsv slice -s -10 data.csv
# Get everything except the last 10 records
qsv slice -s -10 --invert data.csv
# Slice the first three records of the last 10 records
qsv slice -s -10 -l 3 data.csv
# Slice the second record
qsv slice --index 1 data.csv
qsv slice -i 1 data.csv
# Slice from the second record, two records
qsv slice -s 1 --len 2 data.csv
# Slice records 10 to 19 as JSON
qsv slice -s 9 -e 19 --json data.csv
qsv slice -s 9 -l 10 --json data.csv
# Slice records 1 to 9 and 20 to the end as JSON
qsv slice -s 9 -l 10 --invert --json data.csv
Common options:
-h, --help Display this message
-o, --output <file> Write output to <file> instead of stdout.
-n, --no-headers When set, the first row will not be interpreted
as headers. Otherwise, the first row will always
appear in the output as the header row.
-d, --delimiter <arg> The field delimiter for reading CSV data.
Must be a single character. (default: ,)
"#;
use std::{fs, path::PathBuf};
use serde::Deserialize;
use crate::{
config::{Config, Delimiter},
index::Indexed,
util, CliResult,
};
/// Command-line arguments of `qsv slice`, deserialized in `run` by
/// `util::get_args` from `USAGE` and the raw argument list.
// NOTE(review): no unsafe code on `Args` is visible in this file; confirm the
// clippy allow below is still needed.
#[allow(clippy::unsafe_derive_deserialize)]
#[derive(Deserialize)]
struct Args {
/// Input path. `run` falls back to "-" when absent and then replaces this
/// with the canonical path of the processed input.
arg_input: Option<String>,
/// `--start`: first record of the slice; a negative value is resolved
/// against the row count in `range`.
flag_start: Option<isize>,
/// `--end`: forwarded unchanged to `util::range`.
flag_end: Option<usize>,
/// `--len`: forwarded unchanged to `util::range`.
flag_len: Option<usize>,
/// `--index`: single-record selection; a negative value is resolved against
/// the row count in `range`.
flag_index: Option<isize>,
/// `--json`: write the result through `util::write_json` instead of the CSV
/// writer.
flag_json: bool,
/// `--output`: destination used by `wconfig` and `util::write_json`.
flag_output: Option<String>,
/// `--no-headers`: forwarded to the reader config and to `util::write_json`.
flag_no_headers: bool,
/// `--delimiter`: forwarded to the reader config.
flag_delimiter: Option<Delimiter>,
/// `--invert`: keep the records outside the selected range instead of the
/// ones inside it.
flag_invert: bool,
}
/// Entry point of the `qsv slice` subcommand.
///
/// Parses `argv` against `USAGE`, resolves the input to a canonical path, and
/// then slices it: through the index when `Config::indexed` yields one, or by
/// a sequential read otherwise.
///
/// # Errors
///
/// Returns an error when argument parsing, input preparation or path
/// canonicalisation fails, when the canonical input path is not valid UTF-8,
/// or when the slicing itself fails.
pub fn run(argv: &[&str]) -> CliResult<()> {
    let mut args: Args = util::get_args(USAGE, argv)?;

    // NOTE(review): presumably `process_input` may materialise its result
    // inside `tmpdir`, which is why the handle lives until `run` returns —
    // confirm against `util::process_input`.
    let tmpdir = tempfile::tempdir()?;
    let work_input = util::process_input(
        // if no input file is specified, read from stdin "-"
        vec![PathBuf::from(args.arg_input.as_deref().unwrap_or("-"))],
        &tmpdir,
        "",
    )?;

    // safety: there's at least one valid element in work_input
    let input_filename = work_input[0]
        .canonicalize()?
        .into_os_string()
        .into_string()
        // A non-UTF-8 path is an input problem, not a bug: report it through
        // the same io::Error conversion `canonicalize()?` relies on instead of
        // panicking.
        .map_err(|raw| {
            std::io::Error::new(
                std::io::ErrorKind::InvalidData,
                format!("input path is not valid UTF-8: {}", raw.to_string_lossy()),
            )
        })?;
    args.arg_input = Some(input_filename);

    if let Some(idxed) = args.rconfig().indexed()? {
        args.with_index(idxed)
    } else {
        args.no_index()
    }
}
impl Args {
/// Slices the input with a plain sequential read (no index available).
///
/// Records are numbered from 0 in read order. A record is kept when its
/// position lies in `[start, end)`, or outside that range when `--invert` is
/// set. With `--json` the kept records go to `util::write_json`; otherwise the
/// headers and the kept records are written through the CSV writer.
///
/// A non-inverted slice stops reading once position `end` is reached, so rows
/// after the slice are never parsed. An inverted slice needs the rows after
/// `end` and therefore reads the whole input.
///
/// # Errors
///
/// Returns an error if the reader or writer cannot be created, if the range
/// cannot be resolved, or if reading/writing a kept record fails in CSV mode.
fn no_index(&self) -> CliResult<()> {
    let mut rdr = self.rconfig().reader()?;
    let (start, end) = self.range()?;
    let invert = self.flag_invert;

    if self.flag_json {
        let headers = rdr.byte_headers()?.clone();
        let records = rdr
            .byte_records()
            .enumerate()
            // Nothing at or beyond `end` can belong to a plain slice.
            .take_while(move |(i, _)| invert || *i < end)
            .filter_map(move |(i, r)| {
                let in_range = i >= start && i < end;
                // NOTE(review): a malformed record inside the selection still
                // panics here; `write_json` is handed plain records, so the
                // error cannot be propagated without buffering first.
                (in_range != invert).then(|| r.unwrap())
            });
        util::write_json(
            self.flag_output.as_ref(),
            self.flag_no_headers,
            &headers,
            records,
        )
    } else {
        let mut wtr = self.wconfig().writer()?;
        self.rconfig().write_headers(&mut rdr, &mut wtr)?;
        for (i, r) in rdr.byte_records().enumerate() {
            if !invert && i >= end {
                // Past the slice: the remaining rows are irrelevant.
                break;
            }
            let in_range = i >= start && i < end;
            if in_range != invert {
                wtr.write_byte_record(&r?)?;
            }
        }
        Ok(wtr.flush()?)
    }
}
/// Slices the input through its index, seeking straight to the wanted records.
///
/// Without `--invert`, seeks to `start` and emits up to `end - start` records.
/// With `--invert`, emits the records before `start` followed by the records
/// from `end` to the last row. Both positions are clamped to the row count, so
/// an open-ended or oversized range cannot underflow, over-allocate, or seek
/// past the data.
///
/// With `--json` the records are collected and passed to `util::write_json`;
/// otherwise the headers and records are written through the CSV writer.
///
/// # Errors
///
/// Returns an error if the range cannot be resolved, if counting rows or
/// seeking fails, or if any selected record cannot be read or written.
fn with_index(&self, mut indexed_file: Indexed<fs::File, fs::File>) -> CliResult<()> {
    let (start, end) = self.range()?;
    // saturating: `util::range` is expected to hand back `start <= end`, but an
    // inverted pair must not underflow here.
    let slice_len = end.saturating_sub(start);
    // NOTE(review): unlike `no_index`, an empty plain slice returns before a
    // writer exists, so not even the header row is emitted — confirm intended.
    if slice_len == 0 && !self.flag_invert {
        return Ok(());
    }

    // For --invert: (records to take from the top, first record after the
    // excluded range, records from there to the end), all clamped to the row
    // count. The row count is only needed in this mode.
    let inverted = if self.flag_invert {
        let total_rows = util::count_rows(&self.rconfig())? as usize;
        let head_len = start.min(total_rows);
        let tail_start = end.min(total_rows);
        Some((head_len, tail_start, total_rows - tail_start))
    } else {
        None
    };

    if self.flag_json {
        let headers = indexed_file.byte_headers()?.clone();
        let records = if let Some((head_len, tail_start, tail_len)) = inverted {
            let mut kept: Vec<csv::ByteRecord> = Vec::with_capacity(head_len + tail_len);
            // Get records before start
            indexed_file.seek(0)?;
            for r in indexed_file.byte_records().take(head_len) {
                kept.push(r?);
            }
            // Get records after end (skipped when the range runs to the end)
            if tail_len > 0 {
                indexed_file.seek(tail_start as u64)?;
                for r in indexed_file.byte_records().take(tail_len) {
                    kept.push(r?);
                }
            }
            kept
        } else {
            indexed_file.seek(start as u64)?;
            indexed_file
                .byte_records()
                .take(slice_len)
                .collect::<Result<Vec<_>, _>>()?
        };
        util::write_json(
            self.flag_output.as_ref(),
            self.flag_no_headers,
            &headers,
            records.into_iter(),
        )
    } else {
        let mut wtr = self.wconfig().writer()?;
        self.rconfig().write_headers(&mut *indexed_file, &mut wtr)?;
        if let Some((head_len, tail_start, tail_len)) = inverted {
            // Get records before start
            indexed_file.seek(0)?;
            for r in indexed_file.byte_records().take(head_len) {
                wtr.write_byte_record(&r?)?;
            }
            // Get records after end (skipped when the range runs to the end)
            if tail_len > 0 {
                indexed_file.seek(tail_start as u64)?;
                for r in indexed_file.byte_records().take(tail_len) {
                    wtr.write_byte_record(&r?)?;
                }
            }
        } else {
            indexed_file.seek(start as u64)?;
            for r in indexed_file.byte_records().take(slice_len) {
                wtr.write_byte_record(&r?)?;
            }
        }
        Ok(wtr.flush()?)
    }
}
/// Resolves `--start`, `--end`, `--len` and `--index` into a `(start, end)`
/// pair of zero-based record positions via `util::range`.
///
/// A negative `--start` or `--index` counts back from the number of rows in
/// the input. An offset that reaches back further than the input is long
/// clamps to position 0 rather than being mirrored to a position past the
/// first record.
///
/// # Errors
///
/// Returns an error if the rows cannot be counted or if `util::range` rejects
/// the combination of options.
fn range(&self) -> CliResult<(usize, usize)> {
    // Converts a possibly negative position into an absolute one. Rows are
    // only counted when the position is actually negative.
    let resolve = |pos: isize| -> CliResult<usize> {
        if pos < 0 {
            let total_rows = util::count_rows(&self.rconfig())? as usize;
            Ok(total_rows.saturating_sub(pos.unsigned_abs()))
        } else {
            Ok(pos as usize)
        }
    };

    let start = self.flag_start.map(&resolve).transpose()?;
    let index = self.flag_index.map(&resolve).transpose()?;
    Ok(util::range(start, self.flag_end, self.flag_len, index)?)
}
/// Builds the reader configuration from the input path, the delimiter and the
/// `--no-headers` flag.
fn rconfig(&self) -> Config {
    let source = self.arg_input.as_ref();
    let base = Config::new(source);
    base.delimiter(self.flag_delimiter)
        .no_headers(self.flag_no_headers)
}
/// Builds the writer configuration from the `--output` destination.
fn wconfig(&self) -> Config {
    let destination = self.flag_output.as_ref();
    Config::new(destination)
}
}