Skip to content

Commit

Permalink
ARROW-17382: [C++] open_dataset doesn't ignore BOM in csv file when h…
Browse files Browse the repository at this point in the history
…eader's with quotes (apache#13838)

Lead-authored-by: Zimo Zhang <[email protected]>
Co-authored-by: Antoine Pitrou <[email protected]>
Signed-off-by: Antoine Pitrou <[email protected]>
  • Loading branch information
2 people authored and fatemehp committed Oct 17, 2022
1 parent e14b3ed commit a2cb504
Show file tree
Hide file tree
Showing 2 changed files with 25 additions and 5 deletions.
12 changes: 7 additions & 5 deletions cpp/src/arrow/dataset/file_csv.cc
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,12 @@ using RecordBatchGenerator = std::function<Future<std::shared_ptr<RecordBatch>>(
Result<std::unordered_set<std::string>> GetColumnNames(
const csv::ReadOptions& read_options, const csv::ParseOptions& parse_options,
util::string_view first_block, MemoryPool* pool) {
// Skip BOM when reading column names (ARROW-14644, ARROW-17382)
auto size = first_block.length();
const uint8_t* data = reinterpret_cast<const uint8_t*>(first_block.data());
ARROW_ASSIGN_OR_RAISE(auto data_no_bom, util::SkipUTF8BOM(data, size));
size = size - static_cast<uint32_t>(data_no_bom - data);
first_block = util::string_view(reinterpret_cast<const char*>(data_no_bom), size);
if (!read_options.column_names.empty()) {
std::unordered_set<std::string> column_names;
for (const auto& s : read_options.column_names) {
Expand Down Expand Up @@ -98,11 +104,7 @@ Result<std::unordered_set<std::string>> GetColumnNames(

RETURN_NOT_OK(
parser.VisitLastRow([&](const uint8_t* data, uint32_t size, bool quoted) -> Status {
// Skip BOM when reading column names (ARROW-14644)
ARROW_ASSIGN_OR_RAISE(auto data_no_bom, util::SkipUTF8BOM(data, size));
size = size - static_cast<uint32_t>(data_no_bom - data);

util::string_view view{reinterpret_cast<const char*>(data_no_bom), size};
util::string_view view{reinterpret_cast<const char*>(data), size};
if (column_names.emplace(std::string(view)).second) {
return Status::OK();
}
Expand Down
18 changes: 18 additions & 0 deletions cpp/src/arrow/dataset/file_csv_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,24 @@ class TestCsvFileFormat : public FileFormatFixtureMixin<CsvFormatHelper>,
}
};

TEST_P(TestCsvFileFormat, BOMQuoteInHeader) {
  // Regression test for ARROW-17382: when the file starts with a BOM and the
  // header cells are quoted, the column names must still come out as plain
  // "ab" and "cd".
  //
  // Input layout: BOM bytes, the quoted header row, then two data rows.
  constexpr char kCsvWithBom[] = "\xef\xbb\xbf\"ab\",\"cd\"\nef,gh\nij,kl\n";
  auto source = GetFileSource(kCsvWithBom);

  // Schema the scanned batches are expected to carry.
  auto expected_fields = {field("ab", utf8()), field("cd", utf8())};
  SetSchema(expected_fields);
  auto fragment = MakeFragment(*source);

  // Walk every batch, checking its schema and accumulating the row total.
  int64_t total_rows = 0;
  for (auto maybe_batch : Batches(fragment.get())) {
    ASSERT_OK_AND_ASSIGN(auto batch, maybe_batch);
    AssertSchemaEqual(batch->schema(), schema(expected_fields));
    total_rows += batch->num_rows();
  }

  // Only the two data rows may be counted; the header row is not data.
  ASSERT_EQ(total_rows, 2);
}

// Basic scanning tests (to exercise compression support); see the parameterized test
// below for more comprehensive testing of scan behaviors
TEST_P(TestCsvFileFormat, ScanRecordBatchReader) {
Expand Down

0 comments on commit a2cb504

Please sign in to comment.