Skip to content

Commit

Permalink
Merge branch 'branch-0.20' into parquet-writer-spans
Browse files Browse the repository at this point in the history
  • Loading branch information
devavret committed Apr 19, 2021
2 parents 920ba7b + 4893259 commit 3c050bb
Show file tree
Hide file tree
Showing 66 changed files with 465 additions and 1,071 deletions.
4 changes: 2 additions & 2 deletions ci/cpu/prebuild.sh
Original file line number Diff line number Diff line change
Expand Up @@ -14,14 +14,14 @@ else
fi

# upload cudf_kafka for all versions of Python
if [[ "$CUDA" == "10.1" ]]; then
if [[ "$CUDA" == "11.0" ]]; then
export UPLOAD_CUDF_KAFKA=1
else
export UPLOAD_CUDF_KAFKA=0
fi

# We only want to upload libcudf_kafka once, from a single Python/CUDA combination
if [[ "$PYTHON" == "3.7" ]] && [[ "$CUDA" == "10.1" ]]; then
if [[ "$PYTHON" == "3.7" ]] && [[ "$CUDA" == "11.0" ]]; then
export UPLOAD_LIBCUDF_KAFKA=1
else
export UPLOAD_LIBCUDF_KAFKA=0
Expand Down
2 changes: 1 addition & 1 deletion conda/environments/cudf_dev_cuda11.0.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ dependencies:
- python>=3.7,<3.9
- numba>=0.49,!=0.51.0
- numpy
- pandas>=1.0,<=1.2.4
- pandas>=1.0,<1.3.0dev0
- pyarrow=1.0.1
- fastavro>=0.22.9
- notebook>=0.5.0
Expand Down
2 changes: 1 addition & 1 deletion conda/environments/cudf_dev_cuda11.1.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ dependencies:
- python>=3.7,<3.9
- numba>=0.49,!=0.51.0
- numpy
- pandas>=1.0,<=1.2.4
- pandas>=1.0,<1.3.0dev0
- pyarrow=1.0.1
- fastavro>=0.22.9
- notebook>=0.5.0
Expand Down
2 changes: 1 addition & 1 deletion conda/environments/cudf_dev_cuda11.2.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ dependencies:
- python>=3.7,<3.9
- numba>=0.49,!=0.51.0
- numpy
- pandas>=1.0,<=1.2.4
- pandas>=1.0,<1.3.0dev0
- pyarrow=1.0.1
- fastavro>=0.22.9
- notebook>=0.5.0
Expand Down
2 changes: 1 addition & 1 deletion conda/recipes/cudf/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ requirements:
- protobuf
- python
- typing_extensions
- pandas >=1.0,<=1.2.4
- pandas >=1.0,<1.3.0dev0
- cupy >7.1.0,<9.0.0a0
- numba >=0.49.0
- numpy
Expand Down
1 change: 1 addition & 0 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -247,6 +247,7 @@ add_library(cudf
src/io/parquet/reader_impl.cu
src/io/parquet/writer_impl.cu
src/io/statistics/column_stats.cu
src/io/utilities/column_buffer.cpp
src/io/utilities/data_sink.cpp
src/io/utilities/datasource.cpp
src/io/utilities/file_io_utilities.cpp
Expand Down
2 changes: 1 addition & 1 deletion cpp/cmake/thirdparty/CUDF_GetCPM.cmake
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
set(CPM_DOWNLOAD_VERSION 4fad2eac0a3741df3d9c44b791f9163b74aa7b07) # 0.32.0
set(CPM_DOWNLOAD_VERSION 7644c3a40fc7889f8dee53ce21e85dc390b883dc) # v0.32.1

if(CPM_SOURCE_CACHE)
# Expand relative path. This is important if the provided path contains a tilde (~)
Expand Down
3 changes: 1 addition & 2 deletions cpp/include/cudf/io/parquet.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -389,10 +389,9 @@ class column_in_metadata {
bool _use_int96_timestamp = false;
// bool _output_as_binary = false;
thrust::optional<uint8_t> _decimal_precision;

public:
std::vector<column_in_metadata> children;

public:
/**
* @brief Set the name of this column
*
Expand Down
18 changes: 2 additions & 16 deletions cpp/include/cudf/strings/string_view.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -92,20 +92,6 @@ __device__ inline size_type string_view::length() const
{
if (_length == UNKNOWN_STRING_LENGTH)
_length = strings::detail::characters_in_string(_data, _bytes);
if (_length && (_char_width == UNKNOWN_CHAR_WIDTH)) {
uint8_t const* ptr = reinterpret_cast<uint8_t const*>(data());
auto const first = strings::detail::bytes_in_utf8_byte(*ptr);
// see if they are all the same width
_char_width = (thrust::find_if(thrust::seq,
ptr,
ptr + size_bytes(),
[first](auto ch) {
auto width = strings::detail::bytes_in_utf8_byte(ch);
return (width != 0) && (width != first);
})) == (ptr + size_bytes())
? first
: VARIABLE_CHAR_WIDTH;
}
return _length;
}

Expand Down Expand Up @@ -251,7 +237,7 @@ __device__ inline size_type string_view::byte_offset(size_type pos) const
size_type offset = 0;
const char* sptr = _data;
const char* eptr = sptr + _bytes;
if (_char_width > 0) return pos * _char_width;
if (length() == size_bytes()) return pos;
while ((pos > 0) && (sptr < eptr)) {
size_type charbytes = strings::detail::bytes_in_utf8_byte(static_cast<uint8_t>(*sptr++));
if (charbytes) --pos;
Expand Down Expand Up @@ -408,7 +394,7 @@ __device__ inline string_view string_view::substr(size_type pos, size_type lengt

__device__ inline size_type string_view::character_offset(size_type bytepos) const
{
if (_char_width > 0) return bytepos / _char_width;
if (length() == size_bytes()) return bytepos;
return strings::detail::characters_in_string(data(), bytepos);
}

Expand Down
18 changes: 5 additions & 13 deletions cpp/include/cudf/strings/string_view.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,13 +36,6 @@ using char_utf8 = uint32_t; ///< UTF-8 characters are 1-4 bytes
*/
constexpr cudf::size_type UNKNOWN_STRING_LENGTH{-1};

/**
* @brief The char width is initialized to this value as a place-holder.
*
* The byte-width of the characters in a string is computed on-demand.
*/
constexpr int8_t UNKNOWN_CHAR_WIDTH{-1};

/**
* @brief This value is assigned to the _char_width member if the string
* contains characters of different widths.
Expand Down Expand Up @@ -314,7 +307,7 @@ class string_view {
/**
* @brief Default constructor represents an empty string.
*/
CUDA_HOST_DEVICE_CALLABLE string_view() : _data(""), _bytes(0), _length(0), _char_width(0) {}
CUDA_HOST_DEVICE_CALLABLE string_view() : _data(""), _bytes(0), _length(0) {}

/**
* @brief Create instance from existing device char array.
Expand All @@ -323,7 +316,7 @@ class string_view {
* @param bytes Number of bytes in data array.
*/
CUDA_HOST_DEVICE_CALLABLE string_view(const char* data, size_type bytes)
: _data(data), _bytes(bytes), _length(UNKNOWN_STRING_LENGTH), _char_width(UNKNOWN_CHAR_WIDTH)
: _data(data), _bytes(bytes), _length(UNKNOWN_STRING_LENGTH)
{
}

Expand All @@ -334,10 +327,9 @@ class string_view {
string_view& operator=(string_view&&) = default;

private:
const char* _data{}; ///< Pointer to device memory containing the char array for this string
size_type _bytes{}; ///< Number of bytes in _data for this string
mutable size_type _length{}; ///< Number of characters in this string (computed)
mutable int8_t _char_width{}; ///< Number of bytes per character if uniform width (computed)
const char* _data{}; ///< Pointer to device memory containing the char array for this string
size_type _bytes{}; ///< Number of bytes in _data for this string
mutable size_type _length{}; ///< Number of characters in this string (computed)

/**
* @brief Return the character position of the given byte offset.
Expand Down
3 changes: 3 additions & 0 deletions cpp/src/io/avro/avro_common.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

#include <stdint.h>
#include <stdio.h>
#include <io/utilities/column_buffer.hpp>

namespace cudf {
namespace io {
Expand Down Expand Up @@ -56,6 +57,8 @@ enum type_kind_e {
type_array,
};

using cudf::io::detail::string_index_pair;

} // namespace avro
} // namespace io
} // namespace cudf
14 changes: 7 additions & 7 deletions cpp/src/io/avro/avro_gpu.cu
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ static const uint8_t *__device__ avro_decode_row(const schemadesc_s *schema,
size_t max_rows,
const uint8_t *cur,
const uint8_t *end,
device_span<nvstrdesc_s> global_dictionary)
device_span<string_index_pair> global_dictionary)
{
uint32_t array_start = 0, array_repeat_count = 0;
int array_children = 0;
Expand Down Expand Up @@ -123,17 +123,17 @@ static const uint8_t *__device__ avro_decode_row(const schemadesc_s *schema,
if (kind == type_enum) { // dictionary
size_t idx = schema[i].count + v;
if (idx < global_dictionary.size()) {
ptr = global_dictionary[idx].ptr;
count = global_dictionary[idx].count;
ptr = global_dictionary[idx].first;
count = global_dictionary[idx].second;
}
} else if (v >= 0 && cur + v <= end) { // string
ptr = reinterpret_cast<const char *>(cur);
count = (size_t)v;
cur += count;
}
if (dataptr != nullptr && row < max_rows) {
static_cast<nvstrdesc_s *>(dataptr)[row].ptr = ptr;
static_cast<nvstrdesc_s *>(dataptr)[row].count = count;
static_cast<string_index_pair *>(dataptr)[row].first = ptr;
static_cast<string_index_pair *>(dataptr)[row].second = count;
}
}
} break;
Expand Down Expand Up @@ -230,7 +230,7 @@ static const uint8_t *__device__ avro_decode_row(const schemadesc_s *schema,
extern "C" __global__ void __launch_bounds__(num_warps * 32, 2)
gpuDecodeAvroColumnData(block_desc_s *blocks,
schemadesc_s *schema_g,
device_span<nvstrdesc_s> global_dictionary,
device_span<string_index_pair> global_dictionary,
const uint8_t *avro_data,
uint32_t num_blocks,
uint32_t schema_len,
Expand Down Expand Up @@ -313,7 +313,7 @@ extern "C" __global__ void __launch_bounds__(num_warps * 32, 2)
*/
void DecodeAvroColumnData(block_desc_s *blocks,
schemadesc_s *schema,
device_span<nvstrdesc_s> global_dictionary,
device_span<string_index_pair> global_dictionary,
const uint8_t *avro_data,
uint32_t num_blocks,
uint32_t schema_len,
Expand Down
9 changes: 1 addition & 8 deletions cpp/src/io/avro/avro_gpu.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,13 +25,6 @@ namespace cudf {
namespace io {
namespace avro {
namespace gpu {
/**
* @brief Struct to describe the output of a string datatype
*/
struct nvstrdesc_s {
const char *ptr;
size_t count;
};

/**
* @brief Struct to describe the avro schema
Expand Down Expand Up @@ -59,7 +52,7 @@ struct schemadesc_s {
*/
void DecodeAvroColumnData(block_desc_s *blocks,
schemadesc_s *schema,
cudf::device_span<nvstrdesc_s> global_dictionary,
cudf::device_span<string_index_pair> global_dictionary,
const uint8_t *avro_data,
uint32_t num_blocks,
uint32_t schema_len,
Expand Down
16 changes: 8 additions & 8 deletions cpp/src/io/avro/reader_impl.cu
Original file line number Diff line number Diff line change
Expand Up @@ -235,7 +235,7 @@ rmm::device_buffer reader::impl::decompress_data(const rmm::device_buffer &comp_

void reader::impl::decode_data(const rmm::device_buffer &block_data,
const std::vector<std::pair<uint32_t, uint32_t>> &dict,
device_span<gpu::nvstrdesc_s> global_dictionary,
device_span<string_index_pair> global_dictionary,
size_t num_rows,
std::vector<std::pair<int, std::string>> selection,
std::vector<column_buffer> &out_buffers,
Expand Down Expand Up @@ -393,10 +393,10 @@ table_with_metadata reader::impl::read(avro_reader_options const &options,
for (const auto &sym : col_schema.symbols) { dictionary_data_size += sym.length(); }
}

rmm::device_uvector<gpu::nvstrdesc_s> d_global_dict(total_dictionary_entries, stream);
rmm::device_uvector<string_index_pair> d_global_dict(total_dictionary_entries, stream);
rmm::device_uvector<char> d_global_dict_data(dictionary_data_size, stream);
if (total_dictionary_entries > 0) {
std::vector<gpu::nvstrdesc_s> h_global_dict(total_dictionary_entries);
std::vector<string_index_pair> h_global_dict(total_dictionary_entries);
std::vector<char> h_global_dict_data(dictionary_data_size);
size_t dict_pos = 0;
for (size_t i = 0; i < column_types.size(); ++i) {
Expand All @@ -406,10 +406,10 @@ table_with_metadata reader::impl::read(avro_reader_options const &options,
for (size_t j = 0; j < dict[i].second; j++) {
auto const &symbols = col_schema.symbols[j];

auto const data_dst = h_global_dict_data.data() + dict_pos;
auto const len = symbols.length();
col_dict_entries[j].ptr = data_dst;
col_dict_entries[j].count = len;
auto const data_dst = h_global_dict_data.data() + dict_pos;
auto const len = symbols.length();
col_dict_entries[j].first = data_dst;
col_dict_entries[j].second = len;

std::copy(symbols.c_str(), symbols.c_str() + len, data_dst);
dict_pos += len;
Expand All @@ -418,7 +418,7 @@ table_with_metadata reader::impl::read(avro_reader_options const &options,

CUDA_TRY(cudaMemcpyAsync(d_global_dict.data(),
h_global_dict.data(),
h_global_dict.size() * sizeof(gpu::nvstrdesc_s),
h_global_dict.size() * sizeof(string_index_pair),
cudaMemcpyDefault,
stream.value()));
CUDA_TRY(cudaMemcpyAsync(d_global_dict_data.data(),
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/io/avro/reader_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ class reader::impl {
*/
void decode_data(const rmm::device_buffer &block_data,
const std::vector<std::pair<uint32_t, uint32_t>> &dict,
cudf::device_span<gpu::nvstrdesc_s> global_dictionary,
cudf::device_span<string_index_pair> global_dictionary,
size_t num_rows,
std::vector<std::pair<int, std::string>> columns,
std::vector<column_buffer> &out_buffers,
Expand Down
19 changes: 0 additions & 19 deletions cpp/src/io/csv/csv.h

This file was deleted.

4 changes: 2 additions & 2 deletions cpp/src/io/csv/datetime.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@

#pragma once

#include "thrust/reduce.h"
#include <thrust/reduce.h>

#include <cudf/wrappers/durations.hpp>
#include <io/utilities/parsing_utils.cuh>
Expand Down Expand Up @@ -435,4 +435,4 @@ __inline__ __device__ int64_t to_time_delta(char const* begin, char const* end)
}

} // namespace io
} // namespace cudf
} // namespace cudf
2 changes: 1 addition & 1 deletion cpp/src/io/csv/reader_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@

#pragma once

#include "csv.h"
#include "csv_common.h"
#include "csv_gpu.h"

#include <cudf/detail/utilities/trie.cuh>
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/io/csv/writer_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@

#pragma once

#include "csv.h"
#include "csv_common.h"
#include "csv_gpu.h"

#include <cudf/strings/strings_column_view.hpp>
Expand Down
19 changes: 0 additions & 19 deletions cpp/src/io/json/json.h

This file was deleted.

2 changes: 2 additions & 0 deletions cpp/src/io/json/json_common.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@
#pragma once

#include <cudf/types.hpp>
#include <io/utilities/column_buffer.hpp>
#include <io/utilities/column_type_histogram.hpp>

class SerialTrieNode;
using cudf::io::detail::string_index_pair;
Loading

0 comments on commit 3c050bb

Please sign in to comment.