Skip to content

Commit

Permalink
Merge pull request #1044 from vuule/bug-ext-csv-page-aligned-byte-range
Browse files Browse the repository at this point in the history
[REVIEW] CSV Reader: Fix a segfault when byte range aligns with a page
  • Loading branch information
mjsamoht authored Feb 27, 2019
2 parents 65fdea3 + a8d2c32 commit 94fa0bc
Show file tree
Hide file tree
Showing 3 changed files with 20 additions and 20 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,7 @@
- PR #1019 Binary Ops: Fix error when one input column has null mask but other doesn't
- PR #1014 CSV Reader: Fix false positives in bool value detection
- PR #1034 CSV Reader: Fix parsing floating point precision and leading zero exponents
- PR #1044 CSV Reader: Fix a segfault when byte range aligns with a page


# cuDF 0.5.1 (05 Feb 2019)
Expand Down
26 changes: 6 additions & 20 deletions cpp/src/io/csv/csv_reader.cu
Original file line number Diff line number Diff line change
Expand Up @@ -432,10 +432,12 @@ gdf_error read_csv(csv_read_arg *args)
if (raw_csv->byte_range_size != 0 && padded_byte_range_size < map_size) {
// Need to make sure that w/ padding we don't overshoot the end of file
map_size = min(padded_byte_range_size + calculateMaxRowSize(args->num_cols), map_size);
// Ignore page padding for parsing purposes
raw_csv->num_bytes = map_size - page_padding;

}

// Ignore page padding for parsing purposes
raw_csv->num_bytes = map_size - page_padding;

map_data = mmap(0, map_size, PROT_READ, MAP_PRIVATE, fd, map_offset);

if (map_data == MAP_FAILED || map_size==0) { close(fd); checkError(GDF_C_ERROR, "Error mapping file"); }
Expand Down Expand Up @@ -1185,17 +1187,11 @@ __global__ void countRecords(char *data, const char terminator, const char quote
// process the data
cu_reccnt_t tokenCount = 0;
for (long x = 0; x < byteToProcess; x++) {

// Scan and log records. If quotations are enabled, then also log quotes
// for a postprocess ignore, as the chunk here has limited visibility.
if ((raw[x] == terminator) || (quotechar != '\0' && raw[x] == quotechar)) {
tokenCount++;
} else if (terminator == '\n' && (x + 1L) < byteToProcess &&
raw[x] == '\r' && raw[x + 1L] == '\n') {
x++;
tokenCount++;
}

}
atomicAdd(num_records, tokenCount);
}
Expand Down Expand Up @@ -1244,8 +1240,8 @@ gdf_error launch_storeRecordStart(const char *h_data, size_t h_size,
// include_first_row should only apply to the first chunk
const bool cu_include_first_row = (ci == 0) && (csvData->byte_range_offset == 0);

// Copy chunk to device. Copy extra byte if not last chunk
CUDA_TRY(cudaMemcpy(d_chunk, h_chunk, ci < (chunk_count - 1)?chunk_bytes:chunk_bytes + 1, cudaMemcpyDefault));
// Copy chunk to device
CUDA_TRY(cudaMemcpy(d_chunk, h_chunk, chunk_bytes, cudaMemcpyDefault));

const int gridSize = (chunk_bits + blockSize - 1) / blockSize;
storeRecordStart <<< gridSize, blockSize >>> (
Expand Down Expand Up @@ -1309,22 +1305,12 @@ __global__ void storeRecordStart(char *data, size_t chunk_offset,

// process the data
for (long x = 0; x < byteToProcess; x++) {

// Scan and log records. If quotations are enabled, then also log quotes
// for a postprocess ignore, as the chunk here has limited visibility.
if ((raw[x] == terminator) || (quotechar != '\0' && raw[x] == quotechar)) {

const auto pos = atomicAdd(num_records, 1ull);
recStart[pos] = did + chunk_offset + x + 1;

} else if (terminator == '\n' && (x + 1L) < byteToProcess &&
raw[x] == '\r' && raw[x + 1L] == '\n') {

x++;
const auto pos = atomicAdd(num_records, 1ull);
recStart[pos] = did + chunk_offset + x + 1;
}

}
}

Expand Down
13 changes: 13 additions & 0 deletions python/cudf/tests/test_csvreader.py
Original file line number Diff line number Diff line change
Expand Up @@ -905,3 +905,16 @@ def test_csv_reader_bools_false_positives(tmpdir):
header=None, dtype=["int32"])

np.testing.assert_array_equal(items, df['0'])


def test_csv_reader_aligned_byte_range(tmpdir):
    # Regression test: reading with a byte_range whose end coincides with a
    # page boundary used to segfault in the CSV reader (PR #1044).
    row_count = 1000
    csv_path = tmpdir.mkdir("gdf_csv").join("tmp_csvreader_file19.csv")

    source_frame = pd.DataFrame({'key': np.arange(0, row_count),
                                 'zeros': np.zeros(row_count)})
    source_frame.to_csv(csv_path)

    # The call itself is the real check -- it must complete without crashing.
    df = cudf.read_csv(str(csv_path), byte_range=(0, 4096))
    # Sanity-check the parsed values; not the crucial part of this test.
    assert np.count_nonzero(df['zeros']) == 0

0 comments on commit 94fa0bc

Please sign in to comment.