diff --git a/CHANGELOG.md b/CHANGELOG.md
index 44a79bef240..91167de496a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,7 @@
 - PR #2012 Add `reindex()` to DataFrame and Series
 - PR #2098 Align DataFrame and Series indices before executing binary ops
+- PR #2149 CSV Reader: Add `hex` dtype for explicit hexadecimal parsing
 
 ## Improvements
 
diff --git a/cpp/src/io/csv/csv_reader_impl.cu b/cpp/src/io/csv/csv_reader_impl.cu
index 68914d15c5a..3253d0022d8 100644
--- a/cpp/src/io/csv/csv_reader_impl.cu
+++ b/cpp/src/io/csv/csv_reader_impl.cu
@@ -76,16 +76,16 @@ namespace csv {
 using string_pair = std::pair<const char *, size_t>;
 
 __global__ void convertCsvToGdf(char *csv, const ParseOptions opts,
                                 gdf_size_type num_records, int num_columns,
-                                bool *parseCol, uint64_t *recStart,
-                                gdf_dtype *dtype, void **gdf_data,
+                                column_parse::flags *flags, uint64_t *recStart,
+                                gdf_dtype *dtype, void **data,
                                 gdf_valid_type **valid,
                                 gdf_size_type *num_valid);
 
 __global__ void dataTypeDetection(char *raw_csv, const ParseOptions opts,
                                   gdf_size_type num_records, int num_columns,
-                                  bool *parseCol, uint64_t *recStart,
+                                  column_parse::flags *flags,
+                                  uint64_t *recStart,
                                   column_data_t *d_columnData);
 
 /**---------------------------------------------------------------------------*
@@ -112,6 +112,29 @@ __global__ void dataTypeDetection(char *raw_csv, const ParseOptions opts,
     return base_padding + num_columns * column_bytes;
   }
 }
+
+/**
+ * @brief Translates a dtype string and returns its dtype enumeration and any
+ * extended dtype flags that are supported by cuIO. Often, this is a column
+ * with the same underlying dtype as the basic types, but with a different
+ * parsing interpretation.
+ *
+ * @param[in] dtype String containing the basic or extended dtype
+ *
+ * @return std::pair<gdf_dtype, column_parse::flags> Tuple of dtype and flags
+ */
+std::pair<gdf_dtype, column_parse::flags> get_dtype_info(
+    const std::string &dtype) {
+  if (dtype == "hex" || dtype == "hex64") {
+    return std::make_pair(GDF_INT64, column_parse::as_hexadecimal);
+  }
+  if (dtype == "hex32") {
+    return std::make_pair(GDF_INT32, column_parse::as_hexadecimal);
+  }
+
+  return std::make_pair(convertStringToDtype(dtype), column_parse::as_default);
+}
+
 /**
  * @brief Removes the first and Last quote in the string
  */
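The new `get_dtype_info` helper above is the single place where extended dtype strings are resolved: `hex` and `hex64` map to `GDF_INT64`, `hex32` maps to `GDF_INT32`, both tagged with `column_parse::as_hexadecimal`, and every other string falls through to `convertStringToDtype` with default decoding. A minimal standalone sketch of that lookup pattern — using stand-in enums instead of the real cuDF types so it compiles on its own — could look like this:

```cpp
#include <cassert>
#include <cstdint>
#include <string>
#include <utility>

// Stand-ins for gdf_dtype and the column_parse flag values (illustration only).
enum mock_dtype { MOCK_INT32, MOCK_INT64, MOCK_FLOAT64, MOCK_INVALID };
enum : uint8_t { as_default = 4, as_hexadecimal = 8 };

// Mirrors the shape of get_dtype_info: extended dtype strings resolve to an
// integer dtype plus a decoding flag; plain dtypes keep default decoding.
std::pair<mock_dtype, uint8_t> dtype_info(const std::string &dtype) {
  if (dtype == "hex" || dtype == "hex64") return {MOCK_INT64, as_hexadecimal};
  if (dtype == "hex32") return {MOCK_INT32, as_hexadecimal};
  if (dtype == "int32") return {MOCK_INT32, as_default};
  if (dtype == "float64") return {MOCK_FLOAT64, as_default};
  return {MOCK_INVALID, as_default};
}

int main() {
  assert(dtype_info("hex").first == MOCK_INT64);       // "hex" implies 64-bit
  assert(dtype_info("hex32").second == as_hexadecimal);
  assert(dtype_info("int32").second == as_default);    // unchanged behavior
  return 0;
}
```

Returning the flag alongside the dtype lets the caller OR it into the per-column flag byte (as the `read()` changes below do with `h_column_flags[col] |= dtype_info.second`) instead of re-parsing the dtype string later.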
@@ -476,68 +499,64 @@ table reader::Impl::read()
 
   //-----------------------------------------------------------------------------
   //-- Populate the header
 
-  // Check if the user gave us a list of column names
-  if(args_.names.empty()) {
-    setColumnNamesFromCsv();
+  // Check if the user gave us a list of column names
+  if (not args_.names.empty()) {
+    h_column_flags.resize(args_.names.size(), column_parse::enabled);
+    col_names = args_.names;
+  } else {
+    setColumnNamesFromCsv();
 
-    num_actual_cols = num_active_cols = col_names.size();
+    num_actual_cols = num_active_cols = col_names.size();
 
-    // Initialize a boolean array that states if a column needs to read or filtered.
-    h_parseCol = thrust::host_vector<bool>(num_actual_cols, true);
+    h_column_flags.resize(num_actual_cols, column_parse::enabled);
 
-    // Rename empty column names to "Unnamed: col_index"
-    for (size_t col_idx = 0; col_idx < col_names.size(); ++col_idx) {
-      if (col_names[col_idx].empty()) {
-        col_names[col_idx] = string("Unnamed: ") + std::to_string(col_idx);
-      }
-    }
+    // Rename empty column names to "Unnamed: col_index"
+    for (size_t col_idx = 0; col_idx < col_names.size(); ++col_idx) {
+      if (col_names[col_idx].empty()) {
+        col_names[col_idx] = string("Unnamed: ") + std::to_string(col_idx);
+      }
+    }
 
-    // Looking for duplicates
-    std::unordered_map<string, int> col_names_histogram;
-    for (auto& col_name: col_names){
-      // Operator [] inserts a default-initialized value if the given key is not present
-      if (++col_names_histogram[col_name] > 1){
-        if (args_.mangle_dupe_cols) {
-          // Rename duplicates of column X as X.1, X.2, ...; First appearance stays as X
-          col_name += "." + std::to_string(col_names_histogram[col_name] - 1);
-        }
-        else {
-          // All duplicate columns will be ignored; First appearance is parsed
-          const auto idx = &col_name - col_names.data();
-          h_parseCol[idx] = false;
-        }
-      }
-    }
+    // Looking for duplicates
+    std::unordered_map<string, int> col_names_histogram;
+    for (auto& col_name: col_names){
+      // Operator [] inserts a default-initialized value if the given key is not present
+      if (++col_names_histogram[col_name] > 1){
+        if (args_.mangle_dupe_cols) {
+          // Rename duplicates of column X as X.1, X.2, ...; First appearance stays as X
+          col_name += "." + std::to_string(col_names_histogram[col_name] - 1);
+        }
+        else {
+          // All duplicate columns will be ignored; First appearance is parsed
+          const auto idx = &col_name - col_names.data();
+          h_column_flags[idx] = column_parse::disabled;
+        }
+      }
+    }
 
-    // Update the number of columns to be processed, if some might have been removed
-    if (!args_.mangle_dupe_cols) {
-      num_active_cols = col_names_histogram.size();
-    }
-  }
-  else {
-    h_parseCol = thrust::host_vector<bool>(args_.names.size(), true);
-    col_names = args_.names;
-  }
+    // Update the number of columns to be processed, if some might have been removed
+    if (!args_.mangle_dupe_cols) {
+      num_active_cols = col_names_histogram.size();
+    }
+  }
 
-  // User can specify which columns should be parsed
-  if (!args_.use_cols_indexes.empty() || !args_.use_cols_names.empty()){
-    thrust::fill(h_parseCol.begin(), h_parseCol.end(), false);
-    for(int col: args_.use_cols_indexes){
-      h_parseCol[col]=true;
-    }
-    num_active_cols = args_.use_cols_indexes.size();
-
-    std::set<string> use_cols_set(args_.use_cols_names.begin(), args_.use_cols_names.end());
-    for(const std::string &col: col_names) {
-      if(use_cols_set.find(col) != use_cols_set.end()){
-        const auto pos = &col - col_names.data();
-        h_parseCol[pos] = true;
-        num_active_cols++;
-      }
-    }
-  }
+  // User can specify which columns should be parsed
+  if (not args_.use_cols_indexes.empty() || not args_.use_cols_names.empty()) {
+    std::fill(h_column_flags.begin(), h_column_flags.end(), column_parse::disabled);
 
-  d_parseCol = h_parseCol;
+    for (const auto index : args_.use_cols_indexes) {
+      h_column_flags[index] = column_parse::enabled;
+    }
+    num_active_cols = args_.use_cols_indexes.size();
+
+    for (const auto name : args_.use_cols_names) {
+      const auto it = std::find(col_names.begin(), col_names.end(), name);
+      if (it != col_names.end()) {
+        h_column_flags[it - col_names.begin()] = column_parse::enabled;
+        num_active_cols++;
+      }
+    }
+  }
 
   //-----------------------------------------------------------------------------
   //--- Auto detect types of the vectors
 
@@ -549,6 +568,7 @@ table reader::Impl::read()
     vector<column_data_t> h_ColumnData(num_active_cols);
     device_buffer<column_data_t> d_ColumnData(num_active_cols);
     CUDA_TRY(cudaMemset(d_ColumnData.data(), 0, sizeof(column_data_t) * num_active_cols));
+    d_column_flags = h_column_flags;
 
     launch_dataTypeDetection(d_ColumnData.data());
     CUDA_TRY(cudaMemcpy(h_ColumnData.data(), d_ColumnData.data(),
                         sizeof(column_data_t) * num_active_cols, cudaMemcpyDeviceToHost));
@@ -591,30 +611,33 @@ table reader::Impl::read()
     CUDF_EXPECTS(static_cast<int>(args_.dtype.size()) >= num_actual_cols,
                  "Must specify data types for all columns");
 
     for (int col = 0; col < num_actual_cols; col++) {
-      if (h_parseCol[col]) {
+      if (h_column_flags[col] & column_parse::enabled) {
         // dtype is an array of types, assign types to active columns in the given order
-        dtypes.push_back(convertStringToDtype(args_.dtype[col]));
+        const auto dtype_info = get_dtype_info(args_.dtype[col]);
+        dtypes.push_back(dtype_info.first);
+        h_column_flags[col] |= dtype_info.second;
         CUDF_EXPECTS(dtypes.back() != GDF_invalid, "Unsupported data type");
       }
     }
   } else {
-    // dtype is a column name->type dictionary, create a map from the dtype array to speed up processing
-    std::unordered_map<std::string, gdf_dtype> col_type_map;
-    for (const std::string & dtype: args_.dtype) {
-      // Split the dtype elements around the last ':' character
-      const size_t colon_idx = dtype.find_last_of(':');
-      const std::string col(dtype, 0, colon_idx);
-      const std::string type(dtype, colon_idx + 1);
-
-      col_type_map[col] = convertStringToDtype(type);
-      CUDF_EXPECTS(col_type_map[col] != GDF_invalid, "Unsupported data type");
+    // Translate vector of `name : dtype` strings to map
+    // NOTE: Incoming pairs can be out-of-order from column names in dataset
+    std::unordered_map<std::string, std::string> col_type_map;
+    for (const auto& pair : args_.dtype) {
+      const auto pos = pair.find_last_of(':');
+      const auto name = pair.substr(0, pos);
+      const auto dtype = pair.substr(pos + 1, pair.size());
+      col_type_map[name] = dtype;
     }
 
     for (int col = 0; col < num_actual_cols; col++) {
-      if (h_parseCol[col]) {
+      if (h_column_flags[col] & column_parse::enabled) {
         CUDF_EXPECTS(col_type_map.find(col_names[col]) != col_type_map.end(),
-          "Must specify data types for all active columns");
-        dtypes.push_back(col_type_map[col_names[col]]);
+                     "Must specify data types for all active columns");
+        const auto dtype_info = get_dtype_info(col_type_map[col_names[col]]);
+        dtypes.push_back(dtype_info.first);
+        h_column_flags[col] |= dtype_info.second;
+        CUDF_EXPECTS(dtypes.back() != GDF_invalid, "Unsupported data type");
       }
     }
   }
@@ -622,7 +645,7 @@ table reader::Impl::read()
   // Alloc output; columns' data memory is still expected for empty dataframe
   std::vector<gdf_column_wrapper> columns;
   for (int col = 0, active_col = 0; col < num_actual_cols; ++col) {
-    if (h_parseCol[col]) {
+    if (h_column_flags[col] & column_parse::enabled) {
       columns.emplace_back(num_records, dtypes[active_col],
                            gdf_dtype_extra_info{TIME_UNIT_NONE},
                            col_names[col]);
@@ -647,6 +670,7 @@ table reader::Impl::read()
     rmm::device_vector<void *> d_data = h_data;
    rmm::device_vector<gdf_valid_type *> d_valid = h_valid;
     rmm::device_vector<gdf_size_type> d_valid_counts(num_active_cols, 0);
+    d_column_flags = h_column_flags;
 
     launch_dataConvertColumns(d_data.data().get(), d_valid.data().get(),
                               d_dtypes.data().get(), d_valid_counts.data().get());
@@ -806,26 +830,24 @@ void reader::Impl::uploadDataToDevice(const char *h_uncomp_data, size_t h_uncomp
 *
 * @param[out] gdf The output column data
 * @param[out] valid The bitmaps indicating whether column fields are valid
- * @param[out] str_cols The start/end offsets for string data types
+ * @param[in] d_dtypes The data types of the columns
 * @param[out] num_valid The numbers of valid fields in columns
- *
- * @return void
 *---------------------------------------------------------------------------**/
- void reader::Impl::launch_dataConvertColumns(void **gdf,
-     gdf_valid_type **valid, gdf_dtype *d_dtypes,
-     gdf_size_type *num_valid) {
+void reader::Impl::launch_dataConvertColumns(void **gdf, gdf_valid_type **valid,
+                                             gdf_dtype *d_dtypes,
+                                             gdf_size_type *num_valid) {
   int blockSize;    // suggested thread count to use
   int minGridSize;  // minimum block count required
   CUDA_TRY(cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize,
                                               convertCsvToGdf));
 
   // Calculate actual block count to use based on records count
-  int gridSize = (num_records + blockSize - 1) / blockSize;
+  const int gridSize = (num_records + blockSize - 1) / blockSize;
 
   convertCsvToGdf <<< gridSize, blockSize >>> (
-      data.data(), opts, num_records,
-      num_actual_cols, d_parseCol.data().get(), recStart.data(),
-      d_dtypes, gdf, valid, num_valid);
+      data.data(), opts, num_records, num_actual_cols,
+      d_column_flags.data().get(), recStart.data(), d_dtypes, gdf, valid,
+      num_valid);
 
   CUDA_TRY(cudaGetLastError());
 }
 
@@ -846,7 +868,7 @@ struct ConvertFunctor {
           typename std::enable_if_t<std::is_integral<T>::value> * = nullptr>
   __host__ __device__ __forceinline__ void operator()(
       const char *csvData, void *gdfColumnData, long rowIndex, long start,
-      long end, const ParseOptions &opts) {
+      long end, const ParseOptions &opts, column_parse::flags flags) {
     T &value{static_cast<T *>(gdfColumnData)[rowIndex]};
 
     // Check for user-specified true/false values first, where the output is
@@ -857,7 +879,11 @@ struct ConvertFunctor {
     } else if (serializedTrieContains(opts.falseValuesTrie, csvData + start,
                                       field_len)) {
       value = 0;
     } else {
-      value = convertStrToValue<T>(csvData, start, end, opts);
+      if (flags & column_parse::as_hexadecimal) {
+        value = convertStrToValue<T, 16>(csvData, start, end, opts);
+      } else {
+        value = convertStrToValue<T>(csvData, start, end, opts);
+      }
     }
   }
 
@@ -869,7 +895,7 @@ struct ConvertFunctor {
           typename std::enable_if_t<!std::is_integral<T>::value> * = nullptr>
   __host__ __device__ __forceinline__ void operator()(
      const char *csvData, void *gdfColumnData, long rowIndex, long start,
-      long end, const ParseOptions &opts) {
+      long end, const ParseOptions &opts, column_parse::flags flags) {
     T &value{static_cast<T *>(gdfColumnData)[rowIndex]};
     value = convertStrToValue<T>(csvData, start, end, opts);
   }
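In the integral `ConvertFunctor` specialization above, the radix is a compile-time template argument of `convertStrToValue`, while the column's parse flags are only known at run time, so the functor branches on `flags & column_parse::as_hexadecimal` and calls a different instantiation on each side. A standalone sketch of that dispatch shape, with a toy `parse<T, base>` standing in for the real `convertStrToValue`:

```cpp
#include <cassert>
#include <cstdint>
#include <string>

enum : uint8_t { as_hexadecimal = 8 };  // stand-in for column_parse::as_hexadecimal

// Toy stand-in for convertStrToValue<T, base>: the radix is a template
// parameter, so decimal and hexadecimal parsing are distinct instantiations.
template <typename T, int base = 10>
T parse(const std::string &s) {
  return static_cast<T>(std::stoll(s, nullptr, base));
}

// Mirrors the shape of the integral ConvertFunctor specialization: the
// column's flags are only known at run time, so the branch happens here and
// each side calls a different compile-time instantiation of the parser.
template <typename T>
T convert_field(const std::string &s, uint8_t flags) {
  if (flags & as_hexadecimal) {
    return parse<T, 16>(s);
  }
  return parse<T>(s);
}

int main() {
  assert(convert_field<int64_t>("1234", 0) == 1234);
  assert(convert_field<int64_t>("0xff", as_hexadecimal) == 255);
  return 0;
}
```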
@@ -884,22 +910,20 @@ struct ConvertFunctor {
 * @param[in] opts A set of parsing options
 * @param[in] num_records The number of lines/rows of CSV data
 * @param[in] num_columns The number of columns of CSV data
- * @param[in] parseCol Whether to parse or skip a column
+ * @param[in] column_flags Per-column parsing behavior flags
 * @param[in] recStart The start the CSV data of interest
 * @param[in] dtype The data type of the column
- * @param[out] gdf_data The output column data
+ * @param[out] data The output column data
 * @param[out] valid The bitmaps indicating whether column fields are valid
 * @param[out] num_valid The numbers of valid fields in columns
- *
- * @return void
 *---------------------------------------------------------------------------**/
 __global__ void convertCsvToGdf(char *raw_csv, const ParseOptions opts,
                                 gdf_size_type num_records, int num_columns,
-                                bool *parseCol, uint64_t *recStart,
-                                gdf_dtype *dtype, void **gdf_data,
+                                column_parse::flags *flags, uint64_t *recStart,
+                                gdf_dtype *dtype, void **data,
                                 gdf_valid_type **valid,
-                                gdf_size_type *num_valid)
-{
+                                gdf_size_type *num_valid) {
+  // thread IDs range per block, so also need the block id
  long rec_id = threadIdx.x + (blockDim.x * blockIdx.x);
 
  // this is entry into the field array - tid is an elements within the num_entries array
@@ -921,7 +945,7 @@ __global__ void convertCsvToGdf(char *raw_csv, const ParseOptions opts,
     pos = seekFieldEnd(raw_csv, opts, pos, stop);
 
-    if(parseCol[col]==true){
+    if (flags[col] & column_parse::enabled) {
 
       // check if the entire field is a NaN string - consistent with pandas
       const bool is_na = serializedTrieContains(opts.naValuesTrie, raw_csv + start,
                                                 pos - start);
@@ -943,13 +967,13 @@ __global__ void convertCsvToGdf(char *raw_csv, const ParseOptions opts,
             end--;
           }
         }
-        auto str_list = static_cast<string_pair*>(gdf_data[actual_col]);
+        auto str_list = static_cast<string_pair*>(data[actual_col]);
         str_list[rec_id].first = raw_csv + start;
         str_list[rec_id].second = end - start;
       } else {
         cudf::type_dispatcher(
             dtype[actual_col], ConvertFunctor{}, raw_csv,
-            gdf_data[actual_col], rec_id, start, tempPos, opts);
+            data[actual_col], rec_id, start, tempPos, opts, flags[col]);
       }
 
       // set the valid bitmap - all bits were set to 0 to start
@@ -957,7 +981,7 @@ __global__ void convertCsvToGdf(char *raw_csv, const ParseOptions opts,
       atomicAdd(&num_valid[actual_col], 1);
     }
     else if(dtype[actual_col]==gdf_dtype::GDF_STRING){
-      auto str_list = static_cast<string_pair*>(gdf_data[actual_col]);
+      auto str_list = static_cast<string_pair*>(data[actual_col]);
       str_list[rec_id].first = nullptr;
       str_list[rec_id].second = 0;
     }
@@ -985,12 +1009,11 @@ __global__ void convertCsvToGdf(char *raw_csv, const ParseOptions opts,
                                               dataTypeDetection));
 
   // Calculate actual block count to use based on records count
-  int gridSize = (num_records + blockSize - 1) / blockSize;
+  const int gridSize = (num_records + blockSize - 1) / blockSize;
 
   dataTypeDetection <<< gridSize, blockSize >>> (
-      data.data(), opts, num_records,
-      num_actual_cols, d_parseCol.data().get(), recStart.data(),
-      d_columnData);
+      data.data(), opts, num_records, num_actual_cols,
+      d_column_flags.data().get(), recStart.data(), d_columnData);
 
   CUDA_TRY(cudaGetLastError());
 }
 
@@ -1005,14 +1028,16 @@ __global__ void convertCsvToGdf(char *raw_csv, const ParseOptions opts,
 * @param[in] opts A set of parsing options
 * @param[in] num_records The number of lines/rows of CSV data
 * @param[in] num_columns The number of columns of CSV data
- * @param[in] parseCol Whether to parse or skip a column
+ * @param[in] column_flags Per-column parsing behavior flags
 * @param[in] recStart The start the CSV data of interest
 * @param[out] d_columnData The count for each column data type
 *---------------------------------------------------------------------------**/
 __global__ void dataTypeDetection(char *raw_csv, const ParseOptions opts,
                                   gdf_size_type num_records, int num_columns,
-                                  bool *parseCol, uint64_t *recStart,
+                                  column_parse::flags *flags,
+                                  uint64_t *recStart,
                                   column_data_t *d_columnData) {
+  // ThreadIds range per block, so also need the blockId
  // This is entry into the fields; threadId is an element within `num_records`
  long rec_id = threadIdx.x + (blockDim.x * blockIdx.x);
 
@@ -1038,7 +1063,7 @@ __global__ void dataTypeDetection(char *raw_csv, const ParseOptions opts,
     pos = seekFieldEnd(raw_csv, opts, pos, stop);
 
     // Checking if this is a column that the user wants --- user can filter columns
-    if (parseCol[col] == true) {
+    if (flags[col] & column_parse::enabled) {
       long tempPos = pos - 1;
       long field_len = pos - start;
 
@@ -1066,14 +1091,8 @@ __global__ void dataTypeDetection(char *raw_csv, const ParseOptions opts,
         adjustForWhitespaceAndQuotes(raw_csv, &start, &tempPos);
         field_len = tempPos - start + 1;
 
-        const bool maybe_hex =
-            ((field_len > 2 && raw_csv[start] == '0' &&
-              raw_csv[start + 1] == 'x') ||
-             (field_len > 3 && raw_csv[start] == '-' &&
-              raw_csv[start + 1] == '0' && raw_csv[start + 2] == 'x'));
-
         for (long startPos = start; startPos <= tempPos; startPos++) {
-          if (isDigit(raw_csv[startPos], maybe_hex)) {
+          if (isDigit(raw_csv[startPos])) {
             countNumber++;
             continue;
           }
@@ -1096,7 +1115,7 @@ __global__ void dataTypeDetection(char *raw_csv, const ParseOptions opts,
               break;
             case 'e':
             case 'E':
-              if (!maybe_hex && startPos > start && startPos < tempPos)
+              if (startPos > start && startPos < tempPos)
                 countExponent++;
               break;
             default:
@@ -1112,10 +1131,6 @@ __global__ void dataTypeDetection(char *raw_csv, const ParseOptions opts,
         if ((raw_csv[start] == '-' || raw_csv[start] == '+') && field_len > 1) {
           --int_req_number_cnt;
         }
-        // Off by one if they are a hexadecimal number
-        if (maybe_hex) {
-          --int_req_number_cnt;
-        }
 
         if (field_len == 0) {
           // Ignoring whitespace and quotes can result in empty fields
diff --git a/cpp/src/io/csv/csv_reader_impl.hpp b/cpp/src/io/csv/csv_reader_impl.hpp
index e9c734e2e65..30255ff8358 100644
--- a/cpp/src/io/csv/csv_reader_impl.hpp
+++ b/cpp/src/io/csv/csv_reader_impl.hpp
@@ -59,10 +59,10 @@ class reader::Impl {
   int num_actual_cols = 0;  ///< Number of columns in the file --- based on the number of columns in header.
 
   // Parsing options
-  ParseOptions opts{};                   ///< Options to control parsing behavior
-  thrust::host_vector<bool> h_parseCol;  ///< Array of booleans stating if column should be parsed in reading.
-                                         // process: parseCol[x]=false means that the column x needs to be filtered out.
-  rmm::device_vector<bool> d_parseCol;   ///< device : array of booleans stating if column should be parsed in reading
+  ParseOptions opts{};                                       ///< Whole dataset parsing options
+  thrust::host_vector<column_parse::flags> h_column_flags;   ///< Per-column parsing flags
+  rmm::device_vector<column_parse::flags> d_column_flags;    ///< Per-column parsing flags (device memory)
+
   rmm::device_vector<SerialTrieNode> d_trueTrie;   ///< device: serialized trie of values to recognize as true
   rmm::device_vector<SerialTrieNode> d_falseTrie;  ///< device: serialized trie of values to recognize as false
   rmm::device_vector<SerialTrieNode> d_naTrie;     ///< device: serialized trie of NA values
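The header change above replaces the two boolean `parseCol` vectors with one `column_parse::flags` byte per column; the flag values themselves are introduced in `type_conversion.cuh` below. Because each value is a distinct power of two, the reader can OR an enable bit together with a decoding bit, and the kernels can test each bit independently. A small self-contained illustration of that usage, re-declaring the same flag values purely for the sake of a compilable example:

```cpp
#include <cassert>
#include <cstdint>

// Stand-in for the column_parse flag values added in type_conversion.cuh.
namespace column_parse {
enum : uint8_t {
  disabled = 0,        // data is not read
  enabled = 1,         // data is read and parsed as usual
  inferred = 2,        // infer the dtype
  as_default = 4,      // no special decoding
  as_hexadecimal = 8,  // decode with base-16
};
using flags = uint8_t;
}  // namespace column_parse

int main() {
  // One byte per column: start with the enable bit, then OR in the decoding
  // behavior resolved from the user-supplied dtype string (e.g. "hex32").
  column_parse::flags col = column_parse::enabled;
  col |= column_parse::as_hexadecimal;

  // Downstream code tests individual bits, just as the kernels do with
  // `flags[col] & column_parse::enabled` and `flags & as_hexadecimal`.
  assert(col & column_parse::enabled);
  assert(col & column_parse::as_hexadecimal);
  assert(!(col & column_parse::inferred));
  return 0;
}
```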
diff --git a/cpp/src/io/csv/type_conversion.cuh b/cpp/src/io/csv/type_conversion.cuh
index d51a948fd19..f748d3f670f 100644
--- a/cpp/src/io/csv/type_conversion.cuh
+++ b/cpp/src/io/csv/type_conversion.cuh
@@ -169,71 +169,66 @@ struct ParseOptions {
 };
 
 /**
-* @brief Specialization of determineBase for integral types. Checks if the
-* string represents a hex value and updates the starting position if it does.
-*/
-template <typename T, typename std::enable_if_t<std::is_integral<T>::value>* = nullptr>
-__device__ __forceinline__ int determineBase(const char* data, long* start,
-                                             long end) {
-  // check if this is a hex number
-  if (end - *start >= 2 && data[*start] == '0' && data[*start + 1] == 'x') {
-    *start += 2;
-    return 16;
-  }
-  return 10;
-}
-
-/**
- * @brief Specialization of determineBase for non-integral numeric types.
- * Always returns 10, only decimal floating-point numbers are supported.
- */
-template <typename T, typename std::enable_if_t<!std::is_integral<T>::value>* = nullptr>
-__device__ __forceinline__ int determineBase(const char* data, long* start,
-                                             long end) {
-  return 10;
-}
+ * @brief Per-column parsing flags used for dtype detection and data conversion
+ **/
+namespace column_parse {
+enum : uint8_t {
+  disabled = 0,        ///< data is not read
+  enabled = 1,         ///< data is read and parsed as usual
+  inferred = 2,        ///< infer the dtype
+  as_default = 4,      ///< no special decoding
+  as_hexadecimal = 8,  ///< decode with base-16
+};
+using flags = uint8_t;
+}  // namespace column_parse
 
 /**
- * @brief Specialization of decodeAsciiDigit for integral types.
- * Handles hexadecimal digits, both uppercase and lowercase.
+ * @brief Returns the numeric value of an ASCII/UTF-8 character. Specialization
+ * for integral types. Handles hexadecimal digits, both uppercase and lowercase.
+ * If the character is not a valid numeric digit then `0` is returned.
+ *
+ * @param[in] c ASCII or UTF-8 character
+ *
+ * @return uint8_t Numeric value of the character, or `0`
 */
 template <typename T, typename std::enable_if_t<std::is_integral<T>::value>* = nullptr>
-__device__ __forceinline__ char decodeAsciiDigit(char d, int base) {
-  if (base == 16) {
-    if (d >= 'a' && d <= 'f') return d - 'a' + 10;
-    if (d >= 'A' && d <= 'F') return d - 'A' + 10;
-  }
-  return d - '0';
+__device__ __forceinline__ uint8_t decode_digit(char c) {
+  if (c >= '0' && c <= '9') return c - '0';
+  if (c >= 'a' && c <= 'f') return c - 'a' + 10;
+  if (c >= 'A' && c <= 'F') return c - 'A' + 10;
+  return 0;
 }
 
 /**
- * @brief Specialization of decodeAsciiDigit for non-integral numeric types.
- * Only handles decimal digits.
+ * @brief Returns the numeric value of an ASCII/UTF-8 character. Specialization
+ * for non-integral types. Handles only decimal digits. Does not check if
+ * character is a valid numeric value.
+ *
+ * @param[in] c ASCII or UTF-8 character
+ *
+ * @return uint8_t Numeric value of the character, or `0`
 */
 template <typename T, typename std::enable_if_t<!std::is_integral<T>::value>* = nullptr>
-__device__ __forceinline__ char decodeAsciiDigit(char d, int base) {
-  return d - '0';
+__device__ __forceinline__ uint8_t decode_digit(char c) {
+  return c - '0';
 }
 
 /**---------------------------------------------------------------------------*
- * @brief Default function for extracting a data value from a character string.
- * Handles all arithmetic data types; other data types are handled in
- * specialized template functions.
+ * @brief Parses a character string and returns its numeric value.
 *
 * @param[in] data The character string for parse
 * @param[in] start The index within data to start parsing from
 * @param[in] end The end index within data to end parsing
- * @param[in] opts The various parsing behavior options and settings
+ * @param[in] opts The global parsing behavior options
+ * @param[in] base Base (radix) to use for conversion
 *
 * @return The parsed and converted value
 *---------------------------------------------------------------------------**/
 template <typename T>
-__inline__ __device__ T convertStrToValue(const char* data, long start, long end,
-                                          const ParseOptions& opts) {
+__inline__ __device__ T parse_numeric(const char* data, long start, long end,
+                                      const ParseOptions& opts, int base = 10) {
   T value = 0;
 
   // Handle negative values if necessary
@@ -243,19 +238,16 @@ __inline__ __device__ T convertStrToValue(const char* data, long start, long end
     start++;
   }
 
-  const int base = determineBase<T>(data, &start, end);
-
   // Handle the whole part of the number
   long index = start;
   while (index <= end) {
     if (data[index] == opts.decimal) {
       ++index;
       break;
-    } else if (base == 10 &&
-               (data[index] == 'e' || data[index] == 'E')) {
+    } else if (base == 10 && (data[index] == 'e' || data[index] == 'E')) {
       break;
     } else if (data[index] != opts.thousands && data[index] != '+') {
-      value = (value * base) + decodeAsciiDigit(data[index], base);
+      value = (value * base) + decode_digit(data[index]);
     }
     ++index;
   }
 
@@ -269,7 +261,7 @@ __inline__ __device__ T convertStrToValue(const char* data, long start, long end
         break;
       } else if (data[index] != opts.thousands && data[index] != '+') {
         divisor /= base;
-        value += decodeAsciiDigit(data[index], base) * divisor;
+        value += decode_digit(data[index]) * divisor;
       }
       ++index;
     }
 
@@ -295,6 +287,18 @@ __inline__ __device__ T convertStrToValue(const char* data, long start, long end
   return value * sign;
 }
 
+template <typename T, int base>
+__inline__ __device__ T convertStrToValue(const char* data, long start,
+                                          long end, const ParseOptions& opts) {
+  return parse_numeric<T>(data, start, end, opts, base);
+}
+
+template <typename T>
+__inline__ __device__ T convertStrToValue(const char* data, long start,
+                                          long end, const ParseOptions& opts) {
+  return parse_numeric<T>(data, start, end, opts);
+}
+
 template <>
 __inline__ __device__ cudf::date32 convertStrToValue<cudf::date32>(
     const char* data, long start, long end, const ParseOptions& opts) {
@@ -317,7 +321,7 @@ __inline__ __device__ cudf::category convertStrToValue<cudf::category>(
 template <>
 __inline__ __device__ cudf::timestamp convertStrToValue<cudf::timestamp>(
     const char* data, long start, long end, const ParseOptions& opts) {
-  return cudf::timestamp{convertStrToValue<int64_t>(data, start, end, opts)};
+  return cudf::timestamp{parse_numeric<int64_t>(data, start, end, opts)};
 }
 
 //The purpose of this is merely to allow compilation
@@ -342,9 +346,8 @@ __inline__ __device__ cudf::bool8 convertStrToValue<cudf::bool8>(
                                  end - start + 1)) {
     return_value = cudf::false_v;
   } else {
-    // Expect 'false_v' or 'true_v' in data, but clamp any non-zero value to 1
-    // in case
-    if (convertStrToValue(
+    // Expect 'false_v' or 'true_v', but clamp any non-false value to true
+    if (parse_numeric(
            data, start, end, opts) != cudf::detail::unwrap(cudf::false_v)) {
      return_value = cudf::true_v;
     } else {
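With `determineBase` gone, `parse_numeric` no longer strips a leading `0x` itself. A hexadecimal field still converts correctly because `decode_digit` maps any non-digit character (including `x`) to `0`, so a prefix only contributes leading zeros, and unprefixed values such as the `9512c20b` row added to the Python test parse the same way. A host-only sketch of just the whole-number path, mirroring the integral `decode_digit` and the accumulation loop under those assumptions:

```cpp
#include <cassert>
#include <cstdint>
#include <string>

// Simplified host-side copy of the integral decode_digit: hexadecimal digits
// are handled in both cases, anything else decodes to 0.
uint8_t decode_digit(char c) {
  if (c >= '0' && c <= '9') return c - '0';
  if (c >= 'a' && c <= 'f') return c - 'a' + 10;
  if (c >= 'A' && c <= 'F') return c - 'A' + 10;
  return 0;
}

// Whole-number part of parse_numeric, without the decimal/exponent/thousands
// handling: sign first, then accumulate digits in the requested base.
int64_t parse_integer(const std::string &s, int base = 10) {
  int64_t sign = 1;
  size_t i = 0;
  if (!s.empty() && s[0] == '-') { sign = -1; ++i; }
  int64_t value = 0;
  for (; i < s.size(); ++i) {
    value = value * base + decode_digit(s[i]);
  }
  return value * sign;
}

int main() {
  assert(parse_integer("1234") == 1234);
  // A "0x" prefix survives base-16 parsing because '0' and 'x' both decode
  // to 0, so they only add leading zeros: 0x1A2B == 1A2B == 6699.
  assert(parse_integer("0x1A2B", 16) == 6699);
  assert(parse_integer("1a2b", 16) == 6699);
  assert(parse_integer("-0x1000", 16) == -4096);
  return 0;
}
```

Note that dtype *inference* no longer recognizes hexadecimal fields at all — the `dataTypeDetection` changes above drop the `maybe_hex` heuristic — so such columns come back as strings unless the user explicitly requests the `hex` dtype, which is exactly what the updated Python test below asserts.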
diff --git a/cpp/src/io/utilities/parsing_utils.cuh b/cpp/src/io/utilities/parsing_utils.cuh
index 73a8b8f5e48..970ee81d246 100644
--- a/cpp/src/io/utilities/parsing_utils.cuh
+++ b/cpp/src/io/utilities/parsing_utils.cuh
@@ -68,7 +68,7 @@ __device__ __inline__ void setBitmapBit(gdf_valid_type *bitmap, long bit_idx) {
 * @brief Returns true is the input character is a valid digit.
 * Supports both decimal and hexadecimal digits (uppercase and lowercase).
 */
-__device__ __inline__ bool isDigit(char c, bool is_hex) {
+__device__ __inline__ bool isDigit(char c, bool is_hex = false) {
   if (c >= '0' && c <= '9') return true;
 
   if (is_hex) {
diff --git a/python/cudf/tests/test_csv.py b/python/cudf/tests/test_csv.py
index 4c18974904d..80b7b19345c 100644
--- a/python/cudf/tests/test_csv.py
+++ b/python/cudf/tests/test_csv.py
@@ -958,21 +958,26 @@ def test_csv_reader_aligned_byte_range(tmpdir):
     assert(np.count_nonzero(df['zeros']) == 0)
 
 
-def test_csv_reader_hex_ints(tmpdir):
-    lines = ['0x0', '-0x1000', '0xfedcba', '0xABCDEF', '0xaBcDeF']
+@pytest.mark.parametrize('pdf_dtype, gdf_dtype', [(None, None),
+                                                  ('int', 'hex'),
+                                                  ('int32', 'hex32'),
+                                                  ('int64', 'hex64')])
+def test_csv_reader_hexadecimals(pdf_dtype, gdf_dtype):
+    lines = ['0x0', '-0x1000', '0xfedcba', '0xABCDEF', '0xaBcDeF', '9512c20b']
     values = [int(hex_int, 16) for hex_int in lines]
     buffer = '\n'.join(lines)
 
-    # with explicit data types
-    df = read_csv(StringIO(buffer),
-                  dtype=['int32'], names=['hex_int'])
-    np.testing.assert_array_equal(values, df['hex_int'])
-
-    # with data type inference
-    df = read_csv(StringIO(buffer),
-                  names=['hex_int'])
-    np.testing.assert_array_equal(values, df['hex_int'])
+    if gdf_dtype is not None:
+        # require explicit `hex` dtype to parse hexadecimals
+        pdf = pd.DataFrame(data=values, dtype=pdf_dtype, columns=['hex_int'])
+        gdf = read_csv(StringIO(buffer), dtype=[gdf_dtype], names=['hex_int'])
+        np.testing.assert_array_equal(pdf['hex_int'], gdf['hex_int'])
+    else:
+        # otherwise, dtype inference returns as object (string)
+        pdf = pd.read_csv(StringIO(buffer), names=['hex_int'])
+        gdf = read_csv(StringIO(buffer), names=['hex_int'])
+        assert_eq(pdf, gdf)
 
 
 @pytest.mark.parametrize('quoting', [0, 1, 2, 3])