Skip to content

Commit

Permalink
Remove nvstrdesc_s from cuio (#7841)
Browse files Browse the repository at this point in the history
Fixes #5682.

- Structure `nvstrdesc_s` was replaced with `thrust::pair<const char*, size_type>`.
- `nvstrdesc_s`-related logical functions, such as `nvstr_is_lesser` and `nvstr_is_greater`, were removed.
- Include directives for headers that reside in the same directory as the including source file were made relative, as per the developer guide.
- The `make_column` function related to `column_buffer` was moved from a header file to an implementation file.

Authors:
  - Kumar Aatish (https://github.com/kaatish)

Approvers:
  - Vukasin Milovanovic (https://github.com/vuule)
  - https://github.com/nvdbaranec
  - Devavret Makkar (https://github.com/devavret)
  - Keith Kraus (https://github.com/kkraus14)

URL: #7841
  • Loading branch information
kaatish authored Apr 17, 2021
1 parent 4da38a6 commit 1d03186
Show file tree
Hide file tree
Showing 42 changed files with 255 additions and 327 deletions.
1 change: 1 addition & 0 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -247,6 +247,7 @@ add_library(cudf
src/io/parquet/reader_impl.cu
src/io/parquet/writer_impl.cu
src/io/statistics/column_stats.cu
src/io/utilities/column_buffer.cpp
src/io/utilities/data_sink.cpp
src/io/utilities/datasource.cpp
src/io/utilities/file_io_utilities.cpp
Expand Down
3 changes: 3 additions & 0 deletions cpp/src/io/avro/avro_common.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

#include <stdint.h>
#include <stdio.h>
#include <io/utilities/column_buffer.hpp>

namespace cudf {
namespace io {
Expand Down Expand Up @@ -56,6 +57,8 @@ enum type_kind_e {
type_array,
};

using cudf::io::detail::string_index_pair;

} // namespace avro
} // namespace io
} // namespace cudf
14 changes: 7 additions & 7 deletions cpp/src/io/avro/avro_gpu.cu
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ static const uint8_t *__device__ avro_decode_row(const schemadesc_s *schema,
size_t max_rows,
const uint8_t *cur,
const uint8_t *end,
device_span<nvstrdesc_s> global_dictionary)
device_span<string_index_pair> global_dictionary)
{
uint32_t array_start = 0, array_repeat_count = 0;
int array_children = 0;
Expand Down Expand Up @@ -123,17 +123,17 @@ static const uint8_t *__device__ avro_decode_row(const schemadesc_s *schema,
if (kind == type_enum) { // dictionary
size_t idx = schema[i].count + v;
if (idx < global_dictionary.size()) {
ptr = global_dictionary[idx].ptr;
count = global_dictionary[idx].count;
ptr = global_dictionary[idx].first;
count = global_dictionary[idx].second;
}
} else if (v >= 0 && cur + v <= end) { // string
ptr = reinterpret_cast<const char *>(cur);
count = (size_t)v;
cur += count;
}
if (dataptr != nullptr && row < max_rows) {
static_cast<nvstrdesc_s *>(dataptr)[row].ptr = ptr;
static_cast<nvstrdesc_s *>(dataptr)[row].count = count;
static_cast<string_index_pair *>(dataptr)[row].first = ptr;
static_cast<string_index_pair *>(dataptr)[row].second = count;
}
}
} break;
Expand Down Expand Up @@ -230,7 +230,7 @@ static const uint8_t *__device__ avro_decode_row(const schemadesc_s *schema,
extern "C" __global__ void __launch_bounds__(num_warps * 32, 2)
gpuDecodeAvroColumnData(block_desc_s *blocks,
schemadesc_s *schema_g,
device_span<nvstrdesc_s> global_dictionary,
device_span<string_index_pair> global_dictionary,
const uint8_t *avro_data,
uint32_t num_blocks,
uint32_t schema_len,
Expand Down Expand Up @@ -313,7 +313,7 @@ extern "C" __global__ void __launch_bounds__(num_warps * 32, 2)
*/
void DecodeAvroColumnData(block_desc_s *blocks,
schemadesc_s *schema,
device_span<nvstrdesc_s> global_dictionary,
device_span<string_index_pair> global_dictionary,
const uint8_t *avro_data,
uint32_t num_blocks,
uint32_t schema_len,
Expand Down
9 changes: 1 addition & 8 deletions cpp/src/io/avro/avro_gpu.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,13 +25,6 @@ namespace cudf {
namespace io {
namespace avro {
namespace gpu {
/**
* @brief Struct to describe the output of a string datatype
*/
struct nvstrdesc_s {
const char *ptr;
size_t count;
};

/**
* @brief Struct to describe the avro schema
Expand Down Expand Up @@ -59,7 +52,7 @@ struct schemadesc_s {
*/
void DecodeAvroColumnData(block_desc_s *blocks,
schemadesc_s *schema,
cudf::device_span<nvstrdesc_s> global_dictionary,
cudf::device_span<string_index_pair> global_dictionary,
const uint8_t *avro_data,
uint32_t num_blocks,
uint32_t schema_len,
Expand Down
16 changes: 8 additions & 8 deletions cpp/src/io/avro/reader_impl.cu
Original file line number Diff line number Diff line change
Expand Up @@ -235,7 +235,7 @@ rmm::device_buffer reader::impl::decompress_data(const rmm::device_buffer &comp_

void reader::impl::decode_data(const rmm::device_buffer &block_data,
const std::vector<std::pair<uint32_t, uint32_t>> &dict,
device_span<gpu::nvstrdesc_s> global_dictionary,
device_span<string_index_pair> global_dictionary,
size_t num_rows,
std::vector<std::pair<int, std::string>> selection,
std::vector<column_buffer> &out_buffers,
Expand Down Expand Up @@ -393,10 +393,10 @@ table_with_metadata reader::impl::read(avro_reader_options const &options,
for (const auto &sym : col_schema.symbols) { dictionary_data_size += sym.length(); }
}

rmm::device_uvector<gpu::nvstrdesc_s> d_global_dict(total_dictionary_entries, stream);
rmm::device_uvector<string_index_pair> d_global_dict(total_dictionary_entries, stream);
rmm::device_uvector<char> d_global_dict_data(dictionary_data_size, stream);
if (total_dictionary_entries > 0) {
std::vector<gpu::nvstrdesc_s> h_global_dict(total_dictionary_entries);
std::vector<string_index_pair> h_global_dict(total_dictionary_entries);
std::vector<char> h_global_dict_data(dictionary_data_size);
size_t dict_pos = 0;
for (size_t i = 0; i < column_types.size(); ++i) {
Expand All @@ -406,10 +406,10 @@ table_with_metadata reader::impl::read(avro_reader_options const &options,
for (size_t j = 0; j < dict[i].second; j++) {
auto const &symbols = col_schema.symbols[j];

auto const data_dst = h_global_dict_data.data() + dict_pos;
auto const len = symbols.length();
col_dict_entries[j].ptr = data_dst;
col_dict_entries[j].count = len;
auto const data_dst = h_global_dict_data.data() + dict_pos;
auto const len = symbols.length();
col_dict_entries[j].first = data_dst;
col_dict_entries[j].second = len;

std::copy(symbols.c_str(), symbols.c_str() + len, data_dst);
dict_pos += len;
Expand All @@ -418,7 +418,7 @@ table_with_metadata reader::impl::read(avro_reader_options const &options,

CUDA_TRY(cudaMemcpyAsync(d_global_dict.data(),
h_global_dict.data(),
h_global_dict.size() * sizeof(gpu::nvstrdesc_s),
h_global_dict.size() * sizeof(string_index_pair),
cudaMemcpyDefault,
stream.value()));
CUDA_TRY(cudaMemcpyAsync(d_global_dict_data.data(),
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/io/avro/reader_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ class reader::impl {
*/
void decode_data(const rmm::device_buffer &block_data,
const std::vector<std::pair<uint32_t, uint32_t>> &dict,
cudf::device_span<gpu::nvstrdesc_s> global_dictionary,
cudf::device_span<string_index_pair> global_dictionary,
size_t num_rows,
std::vector<std::pair<int, std::string>> columns,
std::vector<column_buffer> &out_buffers,
Expand Down
19 changes: 0 additions & 19 deletions cpp/src/io/csv/csv.h

This file was deleted.

4 changes: 2 additions & 2 deletions cpp/src/io/csv/datetime.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@

#pragma once

#include "thrust/reduce.h"
#include <thrust/reduce.h>

#include <cudf/wrappers/durations.hpp>
#include <io/utilities/parsing_utils.cuh>
Expand Down Expand Up @@ -435,4 +435,4 @@ __inline__ __device__ int64_t to_time_delta(char const* begin, char const* end)
}

} // namespace io
} // namespace cudf
} // namespace cudf
2 changes: 1 addition & 1 deletion cpp/src/io/csv/reader_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@

#pragma once

#include "csv.h"
#include "csv_common.h"
#include "csv_gpu.h"

#include <cudf/detail/utilities/trie.cuh>
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/io/csv/writer_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@

#pragma once

#include "csv.h"
#include "csv_common.h"
#include "csv_gpu.h"

#include <cudf/strings/strings_column_view.hpp>
Expand Down
19 changes: 0 additions & 19 deletions cpp/src/io/json/json.h

This file was deleted.

2 changes: 2 additions & 0 deletions cpp/src/io/json/json_common.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@
#pragma once

#include <cudf/types.hpp>
#include <io/utilities/column_buffer.hpp>
#include <io/utilities/column_type_histogram.hpp>

class SerialTrieNode;
using cudf::io::detail::string_index_pair;
6 changes: 2 additions & 4 deletions cpp/src/io/json/json_gpu.cu
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,6 @@ namespace json {
namespace gpu {
using namespace ::cudf;

using string_pair = std::pair<const char *, size_t>;

namespace {
/**
* @brief CUDA Kernel that adjusts the row range to exclude the character outside of the top level
Expand Down Expand Up @@ -516,7 +514,7 @@ __global__ void convert_data_to_columns_kernel(parse_options_view opts,
if (!serialized_trie_contains(opts.trie_na, {desc.value_begin, value_len})) {
// Type dispatcher does not handle strings
if (column_types[desc.column].id() == type_id::STRING) {
auto str_list = static_cast<string_pair *>(output_columns[desc.column]);
auto str_list = static_cast<string_index_pair *>(output_columns[desc.column]);
str_list[rec_id].first = desc.value_begin;
str_list[rec_id].second = value_len;

Expand All @@ -537,7 +535,7 @@ __global__ void convert_data_to_columns_kernel(parse_options_view opts,
}
}
} else if (column_types[desc.column].id() == type_id::STRING) {
auto str_list = static_cast<string_pair *>(output_columns[desc.column]);
auto str_list = static_cast<string_index_pair *>(output_columns[desc.column]);
str_list[rec_id].first = nullptr;
str_list[rec_id].second = 0;
}
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/io/json/json_gpu.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@

#pragma once

#include <io/json/json_common.h>
#include <io/utilities/parsing_utils.cuh>
#include "json_common.h"

#include <hash/concurrent_unordered_map.cuh>

Expand Down
2 changes: 1 addition & 1 deletion cpp/src/io/json/reader_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@

#pragma once

#include "json.h"
#include "json_common.h"
#include "json_gpu.h"

#include <io/utilities/column_buffer.hpp>
Expand Down
6 changes: 3 additions & 3 deletions cpp/src/io/orc/orc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,10 @@
* limitations under the License.
*/

#include <io/orc/orc.h>
#include <io/orc/orc_field_reader.hpp>
#include <io/orc/orc_field_writer.hpp>
#include "orc.h"
#include <string>
#include "orc_field_reader.hpp"
#include "orc_field_writer.hpp"

namespace cudf {
namespace io {
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/io/orc/orc_field_reader.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@
*/
#pragma once

#include <io/orc/orc.h>
#include <string>
#include "orc.h"

/**
* @file orc_field_reader.hpp
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/io/orc/orc_field_writer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,9 @@
*/
#pragma once

#include <io/orc/orc.h>
#include <numeric>
#include <string>
#include "orc.h"

/**
* @file orc_field_writer.hpp
Expand Down
Loading

0 comments on commit 1d03186

Please sign in to comment.