Skip to content

Commit

Permalink
Merge pull request #270 from jqnatividad/input_refactor
Browse files Browse the repository at this point in the history
`input` refactor
  • Loading branch information
jqnatividad authored Apr 23, 2022
2 parents 0f4bc0e + 0c5b931 commit 51607d8
Show file tree
Hide file tree
Showing 4 changed files with 241 additions and 14 deletions.
13 changes: 13 additions & 0 deletions resources/test/inputskiptest.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# test file to see how skiplines work"
! this is another comment before the header"
# DATA DICTIONARY"
! column1 - alphabetic; id of the column"
% column2 - numeric; just a number"
column1,column2
a,1
b,2
c,3
d,4
e,5
// this is an epilog line
# and another one
74 changes: 60 additions & 14 deletions src/cmd/input.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,31 +4,34 @@ use crate::CliResult;
use serde::Deserialize;

static USAGE: &str = r#"
Read CSV data with special quoting and line-skipping rules.
Read CSV data with special quoting, trimming and line-skipping rules.
Generally, all qsv commands support basic options like specifying the delimiter
used in CSV data. This does not cover all possible types of CSV data. For
example, some CSV files don't use '"' for quotes or use different escaping
styles.
Also, CSVs that are technically malformed with preamble lines can be converted
into a format qsv can handle with the --skip-lines option.
Also, CSVs with preamble lines can have the preamble skipped with the --skip-lines
option. Similarly, --skip-lastlines allows epilog lines to be skipped.
Usage:
qsv input [options] [<input>]
input options:
--quote <arg> The quote character to use. [default: "]
--escape <arg> The escape character to use. When not specified,
quotes are escaped by doubling them.
--no-quoting Disable quoting completely.
--skip-lines <arg> The number of lines to skip.
--quote <arg> The quote character to use. [default: "]
--escape <arg> The escape character to use. When not specified,
quotes are escaped by doubling them.
--no-quoting Disable quoting completely.
--skip-lines <arg> The number of preamble lines to skip.
--skip-lastlines <arg> The number of epilog lines to skip.
--trim-headers Trim leading & trailing whitespace from header values.
--trim-fields Trim leading & trailing whitespace from field values.
Common options:
-h, --help Display this message
-o, --output <file> Write output to <file> instead of stdout.
-d, --delimiter <arg> The field delimiter for reading CSV data.
Must be a single character. (default: ,)
-h, --help Display this message
-o, --output <file> Write output to <file> instead of stdout.
-d, --delimiter <arg> The field delimiter for reading CSV data.
Must be a single character. (default: ,)
"#;

#[derive(Deserialize)]
Expand All @@ -39,15 +42,27 @@ struct Args {
flag_quote: Delimiter,
flag_escape: Option<Delimiter>,
flag_no_quoting: bool,
flag_skip_lines: Option<usize>,
flag_skip_lines: Option<u64>,
flag_skip_lastlines: Option<u64>,
flag_trim_headers: bool,
flag_trim_fields: bool,
}

pub fn run(argv: &[&str]) -> CliResult<()> {
let args: Args = util::get_args(USAGE, argv)?;

let trim_setting = match (args.flag_trim_headers, args.flag_trim_fields) {
(false, false) => csv::Trim::None,
(true, true) => csv::Trim::All,
(true, false) => csv::Trim::Headers,
(false, true) => csv::Trim::Fields,
};

let mut rconfig = Config::new(&args.arg_input)
.delimiter(args.flag_delimiter)
.no_headers(true)
.quote(args.flag_quote.as_byte());
.quote(args.flag_quote.as_byte())
.trim(trim_setting);
let wconfig = Config::new(&args.flag_output);

if let Some(escape) = args.flag_escape {
Expand All @@ -56,6 +71,19 @@ pub fn run(argv: &[&str]) -> CliResult<()> {
if args.flag_no_quoting {
rconfig = rconfig.quoting(false);
}
if args.flag_skip_lines.is_some() || args.flag_skip_lastlines.is_some() {
rconfig = rconfig.flexible(true);
}

// Number of records to emit when --skip-lastlines is given.
// A value of 0 is treated by the write loop below as "no limit".
let mut total_lines = 0_u64;
if let Some(skip_llines) = args.flag_skip_lastlines {
    let row_count = util::count_rows(&rconfig);
    if skip_llines > row_count {
        // Build the message with format! so the values are actually
        // interpolated, and reference `row_count` (the previous text named a
        // non-existent `rowcount`).
        // NOTE(review): assumes fail! converts its argument with From and that
        // the error type implements From<String> -- confirm.
        return fail!(format!(
            "--skip-lastlines: {skip_llines} is greater than row_count: {row_count}."
        ));
    }
    // NOTE(review): when skip_llines == row_count this yields 0, which the
    // write loop interprets as "no limit" and emits every row -- confirm the
    // intended behavior for that edge case.
    total_lines = row_count - skip_llines;
}

let mut rdr = rconfig.reader()?;
let mut wtr = wconfig.writer()?;
Expand All @@ -64,9 +92,27 @@ pub fn run(argv: &[&str]) -> CliResult<()> {
for _i in 1..=skip_lines {
rdr.read_byte_record(&mut row)?;
}
if total_lines.saturating_sub(skip_lines) > 0 {
total_lines -= skip_lines;
}
}
// the first rdr record is the header, since
// we have no_headers = true, we manually trim the first record
if trim_setting == csv::Trim::Headers || trim_setting == csv::Trim::All {
rdr.read_byte_record(&mut row)?;
row.trim();
wtr.write_record(&row)?;
}

let mut i = 1_u64;
while rdr.read_byte_record(&mut row)? {
wtr.write_record(&row)?;
if total_lines > 0 {
i += 1;
if i > total_lines {
break;
}
}
}
wtr.flush()?;
Ok(())
Expand Down
8 changes: 8 additions & 0 deletions src/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@ pub struct Config {
double_quote: bool,
escape: Option<u8>,
quoting: bool,
trim: csv::Trim,
autoindex: bool,
checkutf8: bool,
}
Expand Down Expand Up @@ -143,6 +144,7 @@ impl Config {
double_quote: true,
escape: None,
quoting: true,
trim: csv::Trim::None,
autoindex: env::var("QSV_AUTOINDEX").is_ok(),
checkutf8: env::var("QSV_SKIPUTF8_CHECK").is_err(),
}
Expand Down Expand Up @@ -216,6 +218,11 @@ impl Config {
self
}

/// Sets the whitespace-trimming mode (`csv::Trim`) stored on this `Config`
/// and returns the updated `Config` so calls can be chained.
///
/// The stored value is later handed to the CSV reader builder's `.trim(...)`
/// when a reader is constructed from this configuration.
pub fn trim(mut self, trim_type: csv::Trim) -> Config {
self.trim = trim_type;
self
}

pub fn select(mut self, sel_cols: SelectColumns) -> Config {
self.select_columns = Some(sel_cols);
self
Expand Down Expand Up @@ -453,6 +460,7 @@ impl Config {
.escape(self.escape)
.buffer_capacity(rdr_buffer)
.comment(rdr_comment)
.trim(self.trim)
.from_reader(rdr)
}

Expand Down
160 changes: 160 additions & 0 deletions tests/test_comments.rs
Original file line number Diff line number Diff line change
Expand Up @@ -265,3 +265,163 @@ fn test_input_skip_no_line() {
];
assert_eq!(got, expected);
}

/// `--trim-headers` strips surrounding whitespace from the header row only;
/// data rows keep their original padding.
#[test]
fn test_input_trim_headers() {
    let workdir = Workdir::new("input_trim_headers");
    let input_rows = vec![
        svec![" column1 ", " column2 "],
        svec![" a", "1"],
        svec!["c ", "3"],
        svec!["e", "5 "],
    ];
    workdir.create("data.csv", input_rows);

    let want = vec![
        svec!["column1", "column2"],
        svec![" a", "1"],
        svec!["c ", "3"],
        svec!["e", "5 "],
    ];

    let mut input_cmd = workdir.command("input");
    input_cmd.arg("--trim-headers").arg("data.csv");
    let actual: Vec<Vec<String>> = workdir.read_stdout(&mut input_cmd);

    assert_eq!(actual, want);
}

/// `--trim-fields` strips surrounding whitespace from data values while the
/// header row is left untouched.
#[test]
fn test_input_trim_fields() {
    let workdir = Workdir::new("input_trim_fields");
    let input_rows = vec![
        svec!["column1 ", "column2 "],
        svec![" a", " 1"],
        svec!["c ", "3 "],
        svec![" e ", " 5"],
    ];
    workdir.create("data.csv", input_rows);

    let want = vec![
        svec!["column1 ", "column2 "],
        svec!["a", "1"],
        svec!["c", "3"],
        svec!["e", "5"],
    ];

    let mut input_cmd = workdir.command("input");
    input_cmd.arg("--trim-fields").arg("data.csv");
    let actual: Vec<Vec<String>> = workdir.read_stdout(&mut input_cmd);

    assert_eq!(actual, want);
}

/// With both `--trim-headers` and `--trim-fields`, every value in the output
/// (header and data alike) has its surrounding whitespace removed.
#[test]
fn test_input_trim_headers_fields() {
    let workdir = Workdir::new("input_trim_headers_fields");
    let input_rows = vec![
        svec![" column1 ", " column2 "],
        svec![" a", " 1"],
        svec!["c ", "3 "],
        svec![" e ", " 5"],
    ];
    workdir.create("data.csv", input_rows);

    let want = vec![
        svec!["column1", "column2"],
        svec!["a", "1"],
        svec!["c", "3"],
        svec!["e", "5"],
    ];

    let mut input_cmd = workdir.command("input");
    input_cmd.arg("--trim-headers");
    input_cmd.arg("--trim-fields");
    input_cmd.arg("data.csv");
    let actual: Vec<Vec<String>> = workdir.read_stdout(&mut input_cmd);

    assert_eq!(actual, want);
}

/// `--skip-lastlines 2` drops the final two records of a six-record file.
#[test]
fn test_input_skip_lastlines() {
    let workdir = Workdir::new("input_skip_lastlines");
    let input_rows = vec![
        svec!["column1", "column2"],
        svec!["a", "1"],
        svec!["b", "2"],
        svec!["c", "3"],
        svec!["d", "4"],
        svec!["e", "5"],
    ];
    workdir.create("data.csv", input_rows);

    let want = vec![
        svec!["column1", "column2"],
        svec!["a", "1"],
        svec!["b", "2"],
        svec!["c", "3"],
    ];

    let mut input_cmd = workdir.command("input");
    input_cmd.arg("--skip-lastlines").arg("2").arg("data.csv");
    let actual: Vec<Vec<String>> = workdir.read_stdout(&mut input_cmd);

    assert_eq!(actual, want);
}

/// Combining `--skip-lines 2` and `--skip-lastlines 2` removes the two
/// leading comment-like records and the two trailing records.
#[test]
fn test_input_skip_lines_both() {
    let workdir = Workdir::new("input_skip_lines_both");
    let input_rows = vec![
        svec!["#column1", "column2"],
        svec!["! column1", "column2"],
        svec!["column1", "column2"],
        svec!["a", "1"],
        svec!["b", "2"],
        svec!["c", "3"],
        svec!["d", "4"],
        svec!["e", "5"],
    ];
    workdir.create("data.csv", input_rows);

    let want = vec![
        svec!["column1", "column2"],
        svec!["a", "1"],
        svec!["b", "2"],
        svec!["c", "3"],
    ];

    let mut input_cmd = workdir.command("input");
    input_cmd.arg("--skip-lastlines").arg("2");
    input_cmd.arg("--skip-lines").arg("2");
    input_cmd.arg("data.csv");
    let actual: Vec<Vec<String>> = workdir.read_stdout(&mut input_cmd);

    assert_eq!(actual, want);
}

/// Runs `input` against the `inputskiptest.csv` fixture, skipping 5 leading
/// and 4 trailing lines, and checks that only the header and the first three
/// data rows remain.
#[test]
fn test_input_both_skip_flexible() {
    let workdir = Workdir::new("test_input_both_skip_flexible");
    let fixture_path = workdir.load_test_file("inputskiptest.csv");

    let want = vec![
        svec!["column1", "column2"],
        svec!["a", "1"],
        svec!["b", "2"],
        svec!["c", "3"],
    ];

    let mut input_cmd = workdir.command("input");
    input_cmd.arg("--skip-lastlines").arg("4");
    input_cmd.arg("--skip-lines").arg("5");
    input_cmd.arg(fixture_path);
    let actual: Vec<Vec<String>> = workdir.read_stdout(&mut input_cmd);

    assert_eq!(actual, want);
}

0 comments on commit 51607d8

Please sign in to comment.