diff --git a/CHANGELOG.md b/CHANGELOG.md index d9626becacf..5c6c3e7b985 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -95,6 +95,7 @@ - PR #1019 Binary Ops: Fix error when one input column has null mask but other doesn't - PR #1014 CSV Reader: Fix false positives in bool value detection - PR #1034 CSV Reader: Fix parsing floating point precision and leading zero exponents +- PR #1044 CSV Reader: Fix a segfault when byte range aligns with a page # cuDF 0.5.1 (05 Feb 2019) diff --git a/cpp/src/io/csv/csv_reader.cu b/cpp/src/io/csv/csv_reader.cu index 8e9a201cc17..4c128c4cd4a 100644 --- a/cpp/src/io/csv/csv_reader.cu +++ b/cpp/src/io/csv/csv_reader.cu @@ -432,10 +432,12 @@ gdf_error read_csv(csv_read_arg *args) if (raw_csv->byte_range_size != 0 && padded_byte_range_size < map_size) { // Need to make sure that w/ padding we don't overshoot the end of file map_size = min(padded_byte_range_size + calculateMaxRowSize(args->num_cols), map_size); - // Ignore page padding for parsing purposes - raw_csv->num_bytes = map_size - page_padding; + } + // Ignore page padding for parsing purposes + raw_csv->num_bytes = map_size - page_padding; + map_data = mmap(0, map_size, PROT_READ, MAP_PRIVATE, fd, map_offset); if (map_data == MAP_FAILED || map_size==0) { close(fd); checkError(GDF_C_ERROR, "Error mapping file"); } @@ -1185,17 +1187,11 @@ __global__ void countRecords(char *data, const char terminator, const char quote // process the data cu_reccnt_t tokenCount = 0; for (long x = 0; x < byteToProcess; x++) { - // Scan and log records. If quotations are enabled, then also log quotes // for a postprocess ignore, as the chunk here has limited visibility. if ((raw[x] == terminator) || (quotechar != '\0' && raw[x] == quotechar)) { tokenCount++; - } else if (terminator == '\n' && (x + 1L) < byteToProcess && - raw[x] == '\r' && raw[x + 1L] == '\n') { - x++; - tokenCount++; } - } atomicAdd(num_records, tokenCount); } @@ -1244,8 +1240,8 @@ gdf_error launch_storeRecordStart(const char *h_data, size_t h_size, // include_first_row should only apply to the first chunk const bool cu_include_first_row = (ci == 0) && (csvData->byte_range_offset == 0); - // Copy chunk to device. Copy extra byte if not last chunk - CUDA_TRY(cudaMemcpy(d_chunk, h_chunk, ci < (chunk_count - 1)?chunk_bytes:chunk_bytes + 1, cudaMemcpyDefault)); + // Copy chunk to device + CUDA_TRY(cudaMemcpy(d_chunk, h_chunk, chunk_bytes, cudaMemcpyDefault)); const int gridSize = (chunk_bits + blockSize - 1) / blockSize; storeRecordStart <<< gridSize, blockSize >>> ( @@ -1309,22 +1305,12 @@ __global__ void storeRecordStart(char *data, size_t chunk_offset, // process the data for (long x = 0; x < byteToProcess; x++) { - // Scan and log records. If quotations are enabled, then also log quotes // for a postprocess ignore, as the chunk here has limited visibility. if ((raw[x] == terminator) || (quotechar != '\0' && raw[x] == quotechar)) { - - const auto pos = atomicAdd(num_records, 1ull); - recStart[pos] = did + chunk_offset + x + 1; - - } else if (terminator == '\n' && (x + 1L) < byteToProcess && - raw[x] == '\r' && raw[x + 1L] == '\n') { - - x++; const auto pos = atomicAdd(num_records, 1ull); recStart[pos] = did + chunk_offset + x + 1; } - } } diff --git a/python/cudf/tests/test_csvreader.py b/python/cudf/tests/test_csvreader.py index ce9e6e2b3a5..4731935a7e4 100644 --- a/python/cudf/tests/test_csvreader.py +++ b/python/cudf/tests/test_csvreader.py @@ -905,3 +905,16 @@ def test_csv_reader_bools_false_positives(tmpdir): header=None, dtype=["int32"]) np.testing.assert_array_equal(items, df['0']) + + +def test_csv_reader_aligned_byte_range(tmpdir): + fname = tmpdir.mkdir("gdf_csv").join("tmp_csvreader_file19.csv") + nelem = 1000 + + input_df = pd.DataFrame({'key': np.arange(0, nelem), + 'zeros': np.zeros(nelem)}) + input_df.to_csv(fname) + + df = cudf.read_csv(str(fname), byte_range=(0, 4096)) + # read_csv call above used to crash; the assert below is not crucial + assert(np.count_nonzero(df['zeros']) == 0)