diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index a7c34ca489c..cf605fc78dd 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -658,6 +658,21 @@ set_source_files_properties( PROPERTIES COMPILE_DEFINITIONS "_FILE_OFFSET_BITS=64" ) +set_source_files_properties( + src/io/csv/csv_gpu.cu src/io/csv/durations.cu src/io/csv/reader_impl.cu src/io/csv/writer_impl.cu +) + +set_source_files_properties( + src/binaryop/binaryop.cpp + src/jit/cache.cpp + src/rolling/detail/rolling_fixed_window.cu + src/rolling/detail/rolling_variable_window.cu + src/rolling/grouped_rolling.cu + src/rolling/rolling.cu + src/transform/transform.cpp + PROPERTIES COMPILE_DEFINITIONS "_FILE_OFFSET_BITS=64" +) + set_target_properties( cudf PROPERTIES BUILD_RPATH "\$ORIGIN" diff --git a/cpp/src/io/csv/csv_gpu.cu b/cpp/src/io/csv/csv_gpu.cu index 248e17669bc..ca7ff243dac 100644 --- a/cpp/src/io/csv/csv_gpu.cu +++ b/cpp/src/io/csv/csv_gpu.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -179,115 +179,117 @@ __global__ void __launch_bounds__(csvparse_block_dim) // ThreadIds range per block, so also need the blockId // This is entry into the fields; threadId is an element within `num_records` - auto const rec_id = grid_1d::global_thread_id(); - auto const rec_id_next = rec_id + 1; - - // we can have more threads than data, make sure we are not past the end of the data - if (rec_id_next >= row_offsets.size()) { return; } - - auto field_start = raw_csv + row_offsets[rec_id]; - auto const row_end = raw_csv + row_offsets[rec_id_next]; - - auto next_field = field_start; - int col = 0; - int actual_col = 0; - // Going through all the columns of a given record - while (col < column_flags.size() && field_start < row_end) { - auto next_delimiter = cudf::io::gpu::seek_field_end(field_start, row_end, opts); - - // Checking if this is a column that the user wants --- user can filter columns - if (column_flags[col] & column_parse::inferred) { - // points to last character in the field - auto const field_len = static_cast(next_delimiter - field_start); - if (serialized_trie_contains(opts.trie_na, {field_start, field_len})) { - atomicAdd(&d_column_data[actual_col].null_count, 1); - } else if (serialized_trie_contains(opts.trie_true, {field_start, field_len}) || - serialized_trie_contains(opts.trie_false, {field_start, field_len})) { - atomicAdd(&d_column_data[actual_col].bool_count, 1); - } else if (cudf::io::is_infinity(field_start, next_delimiter)) { - atomicAdd(&d_column_data[actual_col].float_count, 1); - } else { - long count_number = 0; - long count_decimal = 0; - long count_thousands = 0; - long count_slash = 0; - long count_dash = 0; - long count_plus = 0; - long count_colon = 0; - long count_string = 0; - long count_exponent = 0; - - // Modify field_start & end to ignore whitespace and quotechars - // This could possibly result in additional empty fields - auto const trimmed_field_range = trim_whitespaces_quotes(field_start, next_delimiter); - auto const trimmed_field_len = trimmed_field_range.second - trimmed_field_range.first; - - for (auto cur = trimmed_field_range.first; cur < trimmed_field_range.second; ++cur) { - if (is_digit(*cur)) { - count_number++; - continue; - } - if (*cur == opts.decimal) { - count_decimal++; - continue; - } - if (*cur == opts.thousands) { - count_thousands++; - continue; - } - // Looking for unique characters that will help identify column types. - switch (*cur) { - case '-': count_dash++; break; - case '+': count_plus++; break; - case '/': count_slash++; break; - case ':': count_colon++; break; - case 'e': - case 'E': - if (cur > trimmed_field_range.first && cur < trimmed_field_range.second - 1) - count_exponent++; - break; - default: count_string++; break; + + for (auto rec_id = grid_1d::global_thread_id(); rec_id < row_offsets.size(); + rec_id += grid_1d::grid_stride()) { + // we can have more threads than data, make sure we are not past the end of the data + auto const rec_id_next = rec_id + 1; + + if (rec_id_next >= row_offsets.size()) { return; } + + auto field_start = raw_csv + row_offsets[rec_id]; + auto const row_end = raw_csv + row_offsets[rec_id_next]; + + auto next_field = field_start; + int col = 0; + int actual_col = 0; + while (col < column_flags.size() && field_start < row_end) { + auto next_delimiter = cudf::io::gpu::seek_field_end(field_start, row_end, opts); + + // Checking if this is a column that the user wants --- user can filter columns + if (column_flags[col] & column_parse::inferred) { + // points to last character in the field + auto const field_len = static_cast(next_delimiter - field_start); + if (serialized_trie_contains(opts.trie_na, {field_start, field_len})) { + atomicAdd(&d_column_data[actual_col].null_count, 1); + } else if (serialized_trie_contains(opts.trie_true, {field_start, field_len}) || + serialized_trie_contains(opts.trie_false, {field_start, field_len})) { + atomicAdd(&d_column_data[actual_col].bool_count, 1); + } else if (cudf::io::is_infinity(field_start, next_delimiter)) { + atomicAdd(&d_column_data[actual_col].float_count, 1); + } else { + long count_number = 0; + long count_decimal = 0; + long count_thousands = 0; + long count_slash = 0; + long count_dash = 0; + long count_plus = 0; + long count_colon = 0; + long count_string = 0; + long count_exponent = 0; + + // Modify field_start & end to ignore whitespace and quotechars + // This could possibly result in additional empty fields + auto const trimmed_field_range = trim_whitespaces_quotes(field_start, next_delimiter); + auto const trimmed_field_len = trimmed_field_range.second - trimmed_field_range.first; + + for (auto cur = trimmed_field_range.first; cur < trimmed_field_range.second; ++cur) { + if (is_digit(*cur)) { + count_number++; + continue; + } + if (*cur == opts.decimal) { + count_decimal++; + continue; + } + if (*cur == opts.thousands) { + count_thousands++; + continue; + } + // Looking for unique characters that will help identify column types. + switch (*cur) { + case '-': count_dash++; break; + case '+': count_plus++; break; + case '/': count_slash++; break; + case ':': count_colon++; break; + case 'e': + case 'E': + if (cur > trimmed_field_range.first && cur < trimmed_field_range.second - 1) + count_exponent++; + break; + default: count_string++; break; + } } - } - // Integers have to have the length of the string - // Off by one if they start with a minus sign - auto const int_req_number_cnt = - trimmed_field_len - count_thousands - - ((*trimmed_field_range.first == '-' || *trimmed_field_range.first == '+') && - trimmed_field_len > 1); - - if (column_flags[col] & column_parse::as_datetime) { - // PANDAS uses `object` dtype if the date is unparseable - if (is_datetime(count_string, count_decimal, count_colon, count_dash, count_slash)) { - atomicAdd(&d_column_data[actual_col].datetime_count, 1); + // Integers have to have the length of the string + // Off by one if they start with a minus sign + auto const int_req_number_cnt = + trimmed_field_len - count_thousands - + ((*trimmed_field_range.first == '-' || *trimmed_field_range.first == '+') && + trimmed_field_len > 1); + + if (column_flags[col] & column_parse::as_datetime) { + // PANDAS uses `object` dtype if the date is unparseable + if (is_datetime(count_string, count_decimal, count_colon, count_dash, count_slash)) { + atomicAdd(&d_column_data[actual_col].datetime_count, 1); + } else { + atomicAdd(&d_column_data[actual_col].string_count, 1); + } + } else if (count_number == int_req_number_cnt) { + auto const is_negative = (*trimmed_field_range.first == '-'); + auto const data_begin = + trimmed_field_range.first + (is_negative || (*trimmed_field_range.first == '+')); + cudf::size_type* ptr = cudf::io::gpu::infer_integral_field_counter( + data_begin, data_begin + count_number, is_negative, d_column_data[actual_col]); + atomicAdd(ptr, 1); + } else if (is_floatingpoint(trimmed_field_len, + count_number, + count_decimal, + count_thousands, + count_dash + count_plus, + count_exponent)) { + atomicAdd(&d_column_data[actual_col].float_count, 1); } else { atomicAdd(&d_column_data[actual_col].string_count, 1); } - } else if (count_number == int_req_number_cnt) { - auto const is_negative = (*trimmed_field_range.first == '-'); - auto const data_begin = - trimmed_field_range.first + (is_negative || (*trimmed_field_range.first == '+')); - cudf::size_type* ptr = cudf::io::gpu::infer_integral_field_counter( - data_begin, data_begin + count_number, is_negative, d_column_data[actual_col]); - atomicAdd(ptr, 1); - } else if (is_floatingpoint(trimmed_field_len, - count_number, - count_decimal, - count_thousands, - count_dash + count_plus, - count_exponent)) { - atomicAdd(&d_column_data[actual_col].float_count, 1); - } else { - atomicAdd(&d_column_data[actual_col].string_count, 1); } + actual_col++; } - actual_col++; + next_field = next_delimiter + 1; + field_start = next_field; + col++; } - next_field = next_delimiter + 1; - field_start = next_field; - col++; } }