Skip to content

Commit

Permalink
Add ReaderBuilder::with_header for csv reader (#4949)
Browse files Browse the repository at this point in the history
* Add ReaderBuilder::with_header

* Update test
  • Loading branch information
tustvold authored Oct 18, 2023
1 parent a94ccff commit 4964d84
Show file tree
Hide file tree
Showing 4 changed files with 33 additions and 25 deletions.
2 changes: 1 addition & 1 deletion arrow-csv/examples/csv_calculation.rs
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ fn main() {
Field::new("c4", DataType::Boolean, true),
]);
let mut reader = ReaderBuilder::new(Arc::new(csv_schema))
.has_header(true)
.with_header(true)
.build(file)
.unwrap();

Expand Down
48 changes: 28 additions & 20 deletions arrow-csv/src/reader/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -225,7 +225,7 @@ impl InferredDataType {
/// The format specification for the CSV file
#[derive(Debug, Clone, Default)]
pub struct Format {
has_header: bool,
header: bool,
delimiter: Option<u8>,
escape: Option<u8>,
quote: Option<u8>,
Expand All @@ -235,7 +235,7 @@ pub struct Format {

impl Format {
pub fn with_header(mut self, has_header: bool) -> Self {
self.has_header = has_header;
self.header = has_header;
self
}

Expand Down Expand Up @@ -280,7 +280,7 @@ impl Format {

// get or create header names
// when the file has no header, creates default column names with column_ prefix
let headers: Vec<String> = if self.has_header {
let headers: Vec<String> = if self.header {
let headers = &csv_reader.headers().map_err(map_csv_error)?.clone();
headers.iter().map(|s| s.to_string()).collect()
} else {
Expand Down Expand Up @@ -331,7 +331,7 @@ impl Format {
/// Build a [`csv::Reader`] for this [`Format`]
fn build_reader<R: Read>(&self, reader: R) -> csv::Reader<R> {
let mut builder = csv::ReaderBuilder::new();
builder.has_headers(self.has_header);
builder.has_headers(self.header);

if let Some(c) = self.delimiter {
builder.delimiter(c);
Expand Down Expand Up @@ -403,7 +403,7 @@ pub fn infer_reader_schema<R: Read>(
) -> Result<(Schema, usize), ArrowError> {
let format = Format {
delimiter: Some(delimiter),
has_header,
header: has_header,
..Default::default()
};
format.infer_schema(reader, max_read_records)
Expand All @@ -425,7 +425,7 @@ pub fn infer_schema_from_files(
let mut records_to_read = max_read_records.unwrap_or(usize::MAX);
let format = Format {
delimiter: Some(delimiter),
has_header,
header: has_header,
..Default::default()
};

Expand Down Expand Up @@ -1095,8 +1095,16 @@ impl ReaderBuilder {
}

/// Set whether the CSV file has headers
#[deprecated(note = "Use with_header")]
#[doc(hidden)]
pub fn has_header(mut self, has_header: bool) -> Self {
self.format.has_header = has_header;
self.format.header = has_header;
self
}

/// Set whether the CSV file has a header
///
/// Stores `has_header` in the builder's format settings and returns the
/// builder by value so that further configuration calls can be chained.
///
/// This is the replacement for the deprecated `has_header` method, which
/// writes the same setting.
pub fn with_header(mut self, has_header: bool) -> Self {
self.format.header = has_header;
self
}

Expand Down Expand Up @@ -1176,7 +1184,7 @@ impl ReaderBuilder {
let delimiter = self.format.build_parser();
let record_decoder = RecordDecoder::new(delimiter, self.schema.fields().len());

let header = self.format.has_header as usize;
let header = self.format.header as usize;

let (start, end) = match self.bounds {
Some((start, end)) => (start + header, end + header),
Expand Down Expand Up @@ -1317,7 +1325,7 @@ mod tests {
.chain(Cursor::new("\n".to_string()))
.chain(file_without_headers);
let mut csv = ReaderBuilder::new(Arc::new(schema))
.has_header(true)
.with_header(true)
.build(both_files)
.unwrap();
let batch = csv.next().unwrap().unwrap();
Expand All @@ -1335,7 +1343,7 @@ mod tests {
.unwrap();

file.rewind().unwrap();
let builder = ReaderBuilder::new(Arc::new(schema)).has_header(true);
let builder = ReaderBuilder::new(Arc::new(schema)).with_header(true);

let mut csv = builder.build(file).unwrap();
let expected_schema = Schema::new(vec![
Expand Down Expand Up @@ -1505,7 +1513,7 @@ mod tests {
let file = File::open("test/data/null_test.csv").unwrap();

let mut csv = ReaderBuilder::new(schema)
.has_header(true)
.with_header(true)
.build(file)
.unwrap();

Expand All @@ -1530,7 +1538,7 @@ mod tests {
let file = File::open("test/data/init_null_test.csv").unwrap();

let mut csv = ReaderBuilder::new(schema)
.has_header(true)
.with_header(true)
.build(file)
.unwrap();

Expand Down Expand Up @@ -1588,7 +1596,7 @@ mod tests {
let null_regex = Regex::new("^nil$").unwrap();

let mut csv = ReaderBuilder::new(schema)
.has_header(true)
.with_header(true)
.with_null_regex(null_regex)
.build(file)
.unwrap();
Expand Down Expand Up @@ -1710,7 +1718,7 @@ mod tests {
]);

let builder = ReaderBuilder::new(Arc::new(schema))
.has_header(true)
.with_header(true)
.with_delimiter(b'|')
.with_batch_size(512)
.with_projection(vec![0, 1, 2, 3]);
Expand Down Expand Up @@ -2037,7 +2045,7 @@ mod tests {
Field::new("text2", DataType::Utf8, false),
]);
let builder = ReaderBuilder::new(Arc::new(schema))
.has_header(false)
.with_header(false)
.with_quote(b'~'); // default is ", change to ~

let mut csv_text = Vec::new();
Expand Down Expand Up @@ -2069,7 +2077,7 @@ mod tests {
Field::new("text2", DataType::Utf8, false),
]);
let builder = ReaderBuilder::new(Arc::new(schema))
.has_header(false)
.with_header(false)
.with_escape(b'\\'); // default is None, change to \

let mut csv_text = Vec::new();
Expand Down Expand Up @@ -2101,7 +2109,7 @@ mod tests {
Field::new("text2", DataType::Utf8, false),
]);
let builder = ReaderBuilder::new(Arc::new(schema))
.has_header(false)
.with_header(false)
.with_terminator(b'\n'); // default is CRLF, change to LF

let mut csv_text = Vec::new();
Expand Down Expand Up @@ -2143,7 +2151,7 @@ mod tests {
]));

for (idx, (bounds, has_header, expected)) in tests.into_iter().enumerate() {
let mut reader = ReaderBuilder::new(schema.clone()).has_header(has_header);
let mut reader = ReaderBuilder::new(schema.clone()).with_header(has_header);
if let Some((start, end)) = bounds {
reader = reader.with_bounds(start, end);
}
Expand Down Expand Up @@ -2208,7 +2216,7 @@ mod tests {
for capacity in [1, 3, 7, 100] {
let reader = ReaderBuilder::new(schema.clone())
.with_batch_size(batch_size)
.has_header(has_header)
.with_header(has_header)
.build(File::open(path).unwrap())
.unwrap();

Expand All @@ -2226,7 +2234,7 @@ mod tests {

let reader = ReaderBuilder::new(schema.clone())
.with_batch_size(batch_size)
.has_header(has_header)
.with_header(has_header)
.build_buffered(buffered)
.unwrap();

Expand Down
2 changes: 1 addition & 1 deletion arrow/benches/csv_reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ fn do_bench(c: &mut Criterion, name: &str, cols: Vec<ArrayRef>) {
let cursor = Cursor::new(buf.as_slice());
let reader = csv::ReaderBuilder::new(batch.schema())
.with_batch_size(batch_size)
.has_header(true)
.with_header(true)
.build_buffered(cursor)
.unwrap();

Expand Down
6 changes: 3 additions & 3 deletions parquet/src/bin/parquet-fromcsv.rs
Original file line number Diff line number Diff line change
Expand Up @@ -321,7 +321,7 @@ fn configure_reader_builder(args: &Args, arrow_schema: Arc<Schema>) -> ReaderBui

let mut builder = ReaderBuilder::new(arrow_schema)
.with_batch_size(args.batch_size)
.has_header(args.has_header)
.with_header(args.has_header)
.with_delimiter(args.get_delimiter());

builder = configure_reader(
Expand Down Expand Up @@ -606,7 +606,7 @@ mod tests {

let reader_builder = configure_reader_builder(&args, arrow_schema);
let builder_debug = format!("{reader_builder:?}");
assert_debug_text(&builder_debug, "has_header", "false");
assert_debug_text(&builder_debug, "header", "false");
assert_debug_text(&builder_debug, "delimiter", "Some(44)");
assert_debug_text(&builder_debug, "quote", "Some(34)");
assert_debug_text(&builder_debug, "terminator", "None");
Expand Down Expand Up @@ -641,7 +641,7 @@ mod tests {
]));
let reader_builder = configure_reader_builder(&args, arrow_schema);
let builder_debug = format!("{reader_builder:?}");
assert_debug_text(&builder_debug, "has_header", "true");
assert_debug_text(&builder_debug, "header", "true");
assert_debug_text(&builder_debug, "delimiter", "Some(9)");
assert_debug_text(&builder_debug, "quote", "None");
assert_debug_text(&builder_debug, "terminator", "Some(10)");
Expand Down

0 comments on commit 4964d84

Please sign in to comment.