Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[REVIEW] CSV Reader: Fix a segfault when byte range aligns with a page #1044

Merged
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,7 @@
- PR #1019 Binary Ops: Fix error when one input column has null mask but other doesn't
- PR #1014 CSV Reader: Fix false positives in bool value detection
- PR #1034 CSV Reader: Fix parsing floating point precision and leading zero exponents
- PR #1044 CSV Reader: Fix a segfault when byte range aligns with a page
vuule marked this conversation as resolved.
Show resolved Hide resolved


# cuDF 0.5.1 (05 Feb 2019)
Expand Down
26 changes: 6 additions & 20 deletions cpp/src/io/csv/csv_reader.cu
Original file line number Diff line number Diff line change
Expand Up @@ -432,10 +432,12 @@ gdf_error read_csv(csv_read_arg *args)
if (raw_csv->byte_range_size != 0 && padded_byte_range_size < map_size) {
// Need to make sure that w/ padding we don't overshoot the end of file
map_size = min(padded_byte_range_size + calculateMaxRowSize(args->num_cols), map_size);
// Ignore page padding for parsing purposes
raw_csv->num_bytes = map_size - page_padding;

}

// Ignore page padding for parsing purposes
raw_csv->num_bytes = map_size - page_padding;

map_data = mmap(0, map_size, PROT_READ, MAP_PRIVATE, fd, map_offset);

if (map_data == MAP_FAILED || map_size==0) { close(fd); checkError(GDF_C_ERROR, "Error mapping file"); }
Expand Down Expand Up @@ -1185,17 +1187,11 @@ __global__ void countRecords(char *data, const char terminator, const char quote
// process the data
cu_reccnt_t tokenCount = 0;
for (long x = 0; x < byteToProcess; x++) {

// Scan and log records. If quotations are enabled, then also log quotes
// for a postprocess ignore, as the chunk here has limited visibility.
if ((raw[x] == terminator) || (quotechar != '\0' && raw[x] == quotechar)) {
tokenCount++;
} else if (terminator == '\n' && (x + 1L) < byteToProcess &&
raw[x] == '\r' && raw[x + 1L] == '\n') {
x++;
tokenCount++;
}

}
atomicAdd(num_records, tokenCount);
}
Expand Down Expand Up @@ -1244,8 +1240,8 @@ gdf_error launch_storeRecordStart(const char *h_data, size_t h_size,
// include_first_row should only apply to the first chunk
const bool cu_include_first_row = (ci == 0) && (csvData->byte_range_offset == 0);

// Copy chunk to device. Copy extra byte if not last chunk
CUDA_TRY(cudaMemcpy(d_chunk, h_chunk, ci < (chunk_count - 1)?chunk_bytes:chunk_bytes + 1, cudaMemcpyDefault));
// Copy chunk to device
CUDA_TRY(cudaMemcpy(d_chunk, h_chunk, chunk_bytes, cudaMemcpyDefault));

const int gridSize = (chunk_bits + blockSize - 1) / blockSize;
storeRecordStart <<< gridSize, blockSize >>> (
Expand Down Expand Up @@ -1309,22 +1305,12 @@ __global__ void storeRecordStart(char *data, size_t chunk_offset,

// process the data
for (long x = 0; x < byteToProcess; x++) {

// Scan and log records. If quotations are enabled, then also log quotes
// for a postprocess ignore, as the chunk here has limited visibility.
if ((raw[x] == terminator) || (quotechar != '\0' && raw[x] == quotechar)) {

const auto pos = atomicAdd(num_records, 1ull);
recStart[pos] = did + chunk_offset + x + 1;

} else if (terminator == '\n' && (x + 1L) < byteToProcess &&
raw[x] == '\r' && raw[x + 1L] == '\n') {

x++;
const auto pos = atomicAdd(num_records, 1ull);
recStart[pos] = did + chunk_offset + x + 1;
}

}
}

Expand Down
13 changes: 13 additions & 0 deletions python/cudf/tests/test_csvreader.py
Original file line number Diff line number Diff line change
Expand Up @@ -905,3 +905,16 @@ def test_csv_reader_bools_false_positives(tmpdir):
header=None, dtype=["int32"])

np.testing.assert_array_equal(items, df['0'])


def test_csv_reader_aligned_byte_range(tmpdir):
    """Regression test: reading with a byte range that lines up exactly with
    a page boundary used to segfault the CSV reader (PR #1044)."""
    row_count = 1000
    csv_path = tmpdir.mkdir("gdf_csv").join("tmp_csvreader_file19.csv")

    # Write a frame whose 'zeros' column is all zero so the parsed values
    # can be sanity-checked after the read.
    source_frame = pd.DataFrame(
        {'key': np.arange(0, row_count), 'zeros': np.zeros(row_count)})
    source_frame.to_csv(csv_path)

    result = cudf.read_csv(str(csv_path), byte_range=(0, 4096))
    # read_csv call above used to crash; the assert below is not crucial
    assert np.count_nonzero(result['zeros']) == 0