Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

input refactor #270

Merged
merged 4 commits into from
Apr 23, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions resources/test/inputskiptest.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# test file to see how skiplines work"
! this is another comment before the header"
# DATA DICTIONARY"
! column1 - alphabetic; id of the column"
% column2 - numeric; just a number"
column1,column2
a,1
b,2
c,3
d,4
e,5
// this is an epilog line
# and another one
74 changes: 60 additions & 14 deletions src/cmd/input.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,31 +4,34 @@ use crate::CliResult;
use serde::Deserialize;

static USAGE: &str = r#"
Read CSV data with special quoting and line-skipping rules.
Read CSV data with special quoting, trimming and line-skipping rules.

Generally, all qsv commands support basic options like specifying the delimiter
used in CSV data. This does not cover all possible types of CSV data. For
example, some CSV files don't use '"' for quotes or use different escaping
styles.

Also, CSVs that are technically malformed with preamble lines can be converted
into a format qsv can handle with the --skip-lines option.
Also, CSVs with preamble lines can have the preamble skipped with the --skip-lines
option. Similarly, --skip-lastlines allows epilog lines to be skipped.

Usage:
qsv input [options] [<input>]

input options:
--quote <arg> The quote character to use. [default: "]
--escape <arg> The escape character to use. When not specified,
quotes are escaped by doubling them.
--no-quoting Disable quoting completely.
--skip-lines <arg> The number of lines to skip.
--quote <arg> The quote character to use. [default: "]
--escape <arg> The escape character to use. When not specified,
quotes are escaped by doubling them.
--no-quoting Disable quoting completely.
--skip-lines <arg> The number of preamble lines to skip.
--skip-lastlines <arg> The number of epilog lines to skip.
--trim-headers Trim leading & trailing whitespace from header values.
--trim-fields Trim leading & trailing whitespace from field values.

Common options:
-h, --help Display this message
-o, --output <file> Write output to <file> instead of stdout.
-d, --delimiter <arg> The field delimiter for reading CSV data.
Must be a single character. (default: ,)
-h, --help Display this message
-o, --output <file> Write output to <file> instead of stdout.
-d, --delimiter <arg> The field delimiter for reading CSV data.
Must be a single character. (default: ,)
"#;

#[derive(Deserialize)]
Expand All @@ -39,15 +42,27 @@ struct Args {
flag_quote: Delimiter,
flag_escape: Option<Delimiter>,
flag_no_quoting: bool,
flag_skip_lines: Option<usize>,
flag_skip_lines: Option<u64>,
flag_skip_lastlines: Option<u64>,
flag_trim_headers: bool,
flag_trim_fields: bool,
}

pub fn run(argv: &[&str]) -> CliResult<()> {
let args: Args = util::get_args(USAGE, argv)?;

let trim_setting = match (args.flag_trim_headers, args.flag_trim_fields) {
(false, false) => csv::Trim::None,
(true, true) => csv::Trim::All,
(true, false) => csv::Trim::Headers,
(false, true) => csv::Trim::Fields,
};

let mut rconfig = Config::new(&args.arg_input)
.delimiter(args.flag_delimiter)
.no_headers(true)
.quote(args.flag_quote.as_byte());
.quote(args.flag_quote.as_byte())
.trim(trim_setting);
let wconfig = Config::new(&args.flag_output);

if let Some(escape) = args.flag_escape {
Expand All @@ -56,6 +71,19 @@ pub fn run(argv: &[&str]) -> CliResult<()> {
if args.flag_no_quoting {
rconfig = rconfig.quoting(false);
}
if args.flag_skip_lines.is_some() || args.flag_skip_lastlines.is_some() {
rconfig = rconfig.flexible(true);
}

// Upper bound on the number of records to emit; 0 means no limit was requested.
let mut total_lines = 0_u64;
if let Some(skip_llines) = args.flag_skip_lastlines {
    // The total is needed up front so the epilog can be cut off while streaming.
    let row_count = util::count_rows(&rconfig);
    if skip_llines > row_count {
        // The placeholder must name the `row_count` binding above; `rowcount`
        // does not exist.
        // NOTE(review): confirm `fail!` interpolates its argument; if it passes
        // the string through verbatim, wrap the message in `format!`.
        return fail!("--skip-lastlines: {skip_llines} is greater than row_count: {row_count}.");
    }
    total_lines = row_count - skip_llines;
}

let mut rdr = rconfig.reader()?;
let mut wtr = wconfig.writer()?;
Expand All @@ -64,9 +92,27 @@ pub fn run(argv: &[&str]) -> CliResult<()> {
for _i in 1..=skip_lines {
rdr.read_byte_record(&mut row)?;
}
if total_lines.saturating_sub(skip_lines) > 0 {
total_lines -= skip_lines;
}
}
// the first rdr record is the header, since
// we have no_headers = true, we manually trim the first record
if trim_setting == csv::Trim::Headers || trim_setting == csv::Trim::All {
rdr.read_byte_record(&mut row)?;
row.trim();
wtr.write_record(&row)?;
}

let mut i = 1_u64;
while rdr.read_byte_record(&mut row)? {
wtr.write_record(&row)?;
if total_lines > 0 {
i += 1;
if i > total_lines {
break;
}
}
}
wtr.flush()?;
Ok(())
Expand Down
8 changes: 8 additions & 0 deletions src/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@ pub struct Config {
double_quote: bool,
escape: Option<u8>,
quoting: bool,
trim: csv::Trim,
autoindex: bool,
checkutf8: bool,
}
Expand Down Expand Up @@ -143,6 +144,7 @@ impl Config {
double_quote: true,
escape: None,
quoting: true,
trim: csv::Trim::None,
autoindex: env::var("QSV_AUTOINDEX").is_ok(),
checkutf8: env::var("QSV_SKIPUTF8_CHECK").is_err(),
}
Expand Down Expand Up @@ -216,6 +218,11 @@ impl Config {
self
}

pub fn trim(mut self, trim_type: csv::Trim) -> Config {
self.trim = trim_type;
self
}

pub fn select(mut self, sel_cols: SelectColumns) -> Config {
self.select_columns = Some(sel_cols);
self
Expand Down Expand Up @@ -453,6 +460,7 @@ impl Config {
.escape(self.escape)
.buffer_capacity(rdr_buffer)
.comment(rdr_comment)
.trim(self.trim)
.from_reader(rdr)
}

Expand Down
160 changes: 160 additions & 0 deletions tests/test_comments.rs
Original file line number Diff line number Diff line change
Expand Up @@ -265,3 +265,163 @@ fn test_input_skip_no_line() {
];
assert_eq!(got, expected);
}

/// `--trim-headers` strips surrounding whitespace from the first record only;
/// the data rows keep their whitespace.
#[test]
fn test_input_trim_headers() {
    let workdir = Workdir::new("input_trim_headers");
    let input_rows = vec![
        svec![" column1 ", " column2 "],
        svec![" a", "1"],
        svec!["c ", "3"],
        svec!["e", "5 "],
    ];
    workdir.create("data.csv", input_rows);

    let mut cmd = workdir.command("input");
    for cli_arg in ["--trim-headers", "data.csv"] {
        cmd.arg(cli_arg);
    }

    let expected = vec![
        svec!["column1", "column2"],
        svec![" a", "1"],
        svec!["c ", "3"],
        svec!["e", "5 "],
    ];
    let got: Vec<Vec<String>> = workdir.read_stdout(&mut cmd);
    assert_eq!(got, expected);
}

/// `--trim-fields` is expected to strip whitespace from the data rows while
/// leaving the header record as written.
#[test]
fn test_input_trim_fields() {
    let workdir = Workdir::new("input_trim_fields");
    let input_rows = vec![
        svec!["column1 ", "column2 "],
        svec![" a", " 1"],
        svec!["c ", "3 "],
        svec![" e ", " 5"],
    ];
    workdir.create("data.csv", input_rows);

    let mut cmd = workdir.command("input");
    for cli_arg in ["--trim-fields", "data.csv"] {
        cmd.arg(cli_arg);
    }

    let expected = vec![
        svec!["column1 ", "column2 "],
        svec!["a", "1"],
        svec!["c", "3"],
        svec!["e", "5"],
    ];
    let got: Vec<Vec<String>> = workdir.read_stdout(&mut cmd);
    assert_eq!(got, expected);
}

/// With both `--trim-headers` and `--trim-fields`, every value in the output
/// is expected to have its surrounding whitespace removed.
#[test]
fn test_input_trim_headers_fields() {
    let workdir = Workdir::new("input_trim_headers_fields");
    let input_rows = vec![
        svec![" column1 ", " column2 "],
        svec![" a", " 1"],
        svec!["c ", "3 "],
        svec![" e ", " 5"],
    ];
    workdir.create("data.csv", input_rows);

    let mut cmd = workdir.command("input");
    for cli_arg in ["--trim-headers", "--trim-fields", "data.csv"] {
        cmd.arg(cli_arg);
    }

    let expected = vec![
        svec!["column1", "column2"],
        svec!["a", "1"],
        svec!["c", "3"],
        svec!["e", "5"],
    ];
    let got: Vec<Vec<String>> = workdir.read_stdout(&mut cmd);
    assert_eq!(got, expected);
}

/// `--skip-lastlines 2` is expected to drop the final two records of the input.
#[test]
fn test_input_skip_lastlines() {
    let workdir = Workdir::new("input_skip_lastlines");
    let input_rows = vec![
        svec!["column1", "column2"],
        svec!["a", "1"],
        svec!["b", "2"],
        svec!["c", "3"],
        svec!["d", "4"],
        svec!["e", "5"],
    ];
    workdir.create("data.csv", input_rows);

    let mut cmd = workdir.command("input");
    for cli_arg in ["--skip-lastlines", "2", "data.csv"] {
        cmd.arg(cli_arg);
    }

    let expected = vec![
        svec!["column1", "column2"],
        svec!["a", "1"],
        svec!["b", "2"],
        svec!["c", "3"],
    ];
    let got: Vec<Vec<String>> = workdir.read_stdout(&mut cmd);
    assert_eq!(got, expected);
}

/// Combining `--skip-lines 2` with `--skip-lastlines 2` is expected to drop
/// the two leading preamble records and the two trailing records.
#[test]
fn test_input_skip_lines_both() {
    let workdir = Workdir::new("input_skip_lines_both");
    let input_rows = vec![
        svec!["#column1", "column2"],
        svec!["! column1", "column2"],
        svec!["column1", "column2"],
        svec!["a", "1"],
        svec!["b", "2"],
        svec!["c", "3"],
        svec!["d", "4"],
        svec!["e", "5"],
    ];
    workdir.create("data.csv", input_rows);

    let mut cmd = workdir.command("input");
    for cli_arg in ["--skip-lastlines", "2", "--skip-lines", "2", "data.csv"] {
        cmd.arg(cli_arg);
    }

    let expected = vec![
        svec!["column1", "column2"],
        svec!["a", "1"],
        svec!["b", "2"],
        svec!["c", "3"],
    ];
    let got: Vec<Vec<String>> = workdir.read_stdout(&mut cmd);
    assert_eq!(got, expected);
}

/// Runs both skip options against the `inputskiptest.csv` fixture and expects
/// only the header and the first three data rows to remain.
#[test]
fn test_input_both_skip_flexible() {
    let workdir = Workdir::new("test_input_both_skip_flexible");
    let test_file = workdir.load_test_file("inputskiptest.csv");

    let mut cmd = workdir.command("input");
    for cli_arg in ["--skip-lastlines", "4", "--skip-lines", "5"] {
        cmd.arg(cli_arg);
    }
    cmd.arg(test_file);

    let expected = vec![
        svec!["column1", "column2"],
        svec!["a", "1"],
        svec!["b", "2"],
        svec!["c", "3"],
    ];
    let got: Vec<Vec<String>> = workdir.read_stdout(&mut cmd);
    assert_eq!(got, expected);
}