Merge branch 'branch-0.18' of github.com:rapidsai/cudf into avro-tests
cwharris committed Jan 27, 2021
2 parents 2e47499 + fc40c52 commit 8f1f842
Showing 42 changed files with 1,712 additions and 904 deletions.
9 changes: 9 additions & 0 deletions .pre-commit-config.yaml
@@ -32,6 +32,15 @@ repos:
        language: system
        files: \.(cu|cuh|h|hpp|cpp|inl)$
        args: ['-fallback-style=none']
  - repo: local
    hooks:
      - id: mypy
        name: mypy
        description: mypy
        pass_filenames: false
        entry: mypy --config-file=python/cudf/setup.cfg python/cudf/cudf
        language: system
        types: [python]

default_language_version:
  python: python3
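
The new hook runs the same command as the CI check added in ci/checks/style.sh below (mypy --config-file=python/cudf/setup.cfg python/cudf/cudf). As a minimal, hypothetical sketch of what this static check catches, not code from the cudf tree, mypy rejects a function whose annotated return type does not cover what it can return:

from typing import Optional


def bytes_per_row(row_bytes: Optional[int]) -> int:
    # mypy reports, approximately:
    # error: Incompatible return value type (got "Optional[int]", expected "int")
    return row_bytes
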
14 changes: 13 additions & 1 deletion ci/checks/style.sh
@@ -29,6 +29,10 @@ FLAKE_RETVAL=$?
FLAKE_CYTHON=`flake8 --config=python/.flake8.cython`
FLAKE_CYTHON_RETVAL=$?

# Run mypy and get results/return code
MYPY_CUDF=`mypy --config=python/cudf/setup.cfg python/cudf/cudf`
MYPY_CUDF_RETVAL=$?

# Run clang-format and check for a consistent code format
CLANG_FORMAT=`python cpp/scripts/run-clang-format.py 2>&1`
CLANG_FORMAT_RETVAL=$?
@@ -66,6 +70,14 @@ else
echo -e "\n\n>>>> PASSED: flake8-cython style check\n\n"
fi

if [ "$MYPY_CUDF_RETVAL" != "0" ]; then
echo -e "\n\n>>>> FAILED: mypy style check; begin output\n\n"
echo -e "$MYPY_CUDF"
echo -e "\n\n>>>> FAILED: mypy style check; end output\n\n"
else
echo -e "\n\n>>>> PASSED: mypy style check\n\n"
fi

if [ "$CLANG_FORMAT_RETVAL" != "0" ]; then
echo -e "\n\n>>>> FAILED: clang format check; begin output\n\n"
echo -e "$CLANG_FORMAT"
@@ -79,7 +91,7 @@ HEADER_META=`ci/checks/headers_test.sh`
HEADER_META_RETVAL=$?
echo -e "$HEADER_META"

RETVALS=($ISORT_RETVAL $BLACK_RETVAL $FLAKE_RETVAL $FLAKE_CYTHON_RETVAL $CLANG_FORMAT_RETVAL $HEADER_META_RETVAL)
RETVALS=($ISORT_RETVAL $BLACK_RETVAL $FLAKE_RETVAL $FLAKE_CYTHON_RETVAL $CLANG_FORMAT_RETVAL $HEADER_META_RETVAL $MYPY_CUDF_RETVAL)
IFS=$'\n'
RETVAL=`echo "${RETVALS[*]}" | sort -nr | head -n1`

2 changes: 2 additions & 0 deletions conda/environments/cudf_dev_cuda10.1.yml
@@ -40,6 +40,8 @@ dependencies:
- flake8=3.8.3
- black=19.10
- isort=5.0.7
- mypy=0.782
- typing_extensions
- pre_commit
- dask>=2.22.0
- distributed>=2.22.0
2 changes: 2 additions & 0 deletions conda/environments/cudf_dev_cuda10.2.yml
@@ -40,6 +40,8 @@ dependencies:
- flake8=3.8.3
- black=19.10
- isort=5.0.7
- mypy=0.782
- typing_extensions
- pre_commit
- dask>=2.22.0
- distributed>=2.22.0
2 changes: 2 additions & 0 deletions conda/environments/cudf_dev_cuda11.0.yml
@@ -40,6 +40,8 @@ dependencies:
- flake8=3.8.3
- black=19.10
- isort=5.0.7
- mypy=0.782
- typing_extensions
- pre_commit
- dask>=2.22.0
- distributed>=2.22.0
1 change: 1 addition & 0 deletions conda/recipes/cudf/meta.yaml
@@ -34,6 +34,7 @@ requirements:
run:
- protobuf
- python
- typing_extensions
- pandas >=1.0,<1.2.0dev0
- cupy >7.1.0,<9.0.0a0
- numba >=0.49.0
9 changes: 4 additions & 5 deletions cpp/benchmarks/io/parquet/parquet_writer_benchmark.cpp
@@ -63,9 +63,9 @@ void BM_parq_write_varying_inout(benchmark::State& state)

void BM_parq_write_varying_options(benchmark::State& state)
{
auto const compression = static_cast<cudf::io::compression_type>(state.range(0));
auto const enable_stats = static_cast<cudf::io::statistics_freq>(state.range(1));
auto const output_metadata = state.range(2) != 0;
auto const compression = static_cast<cudf::io::compression_type>(state.range(0));
auto const enable_stats = static_cast<cudf::io::statistics_freq>(state.range(1));
auto const file_path = state.range(2) != 0 ? "unused_path.parquet" : "";

auto const data_types = get_type_or_group({int32_t(type_group_id::INTEGRAL_SIGNED),
int32_t(type_group_id::FLOATING_POINT),
@@ -82,8 +82,7 @@ void BM_parq_write_varying_options(benchmark::State& state)
cudf_io::parquet_writer_options::builder(source_sink.make_sink_info(), view)
.compression(compression)
.stats_level(enable_stats)
.return_filemetadata(output_metadata)
.column_chunks_file_path("dummy_path.parquet");
.column_chunks_file_path(file_path);
cudf_io::write_parquet(options);
}

25 changes: 18 additions & 7 deletions cpp/src/sort/sort_column.cu
@@ -24,6 +24,17 @@ namespace {
* @brief Type-dispatched functor for sorting a single column.
*/
struct column_sorted_order_fn {
/**
* @brief Compile time check for allowing radix sort for column type.
*
* Floating point is removed here for special handling of NaNs.
*/
template <typename T>
static constexpr bool is_radix_sort_supported()
{
return cudf::is_fixed_width<T>() && !cudf::is_floating_point<T>();
}

/**
* @brief Sorts fixed-width columns using faster thrust sort.
*
@@ -32,15 +43,15 @@ struct column_sorted_order_fn {
* @param ascending True if sort order is ascending
* @param stream CUDA stream used for device memory operations and kernel launches
*/
template <typename T, typename std::enable_if_t<cudf::is_fixed_width<T>()>* = nullptr>
template <typename T, typename std::enable_if_t<is_radix_sort_supported<T>()>* = nullptr>
void radix_sort(column_view const& input,
mutable_column_view& indices,
bool ascending,
rmm::cuda_stream_view stream)
{
// A non-stable sort on a fixed-width column with no nulls will use a radix sort
// if using only the thrust::less or thrust::greater comparators but also
// requires making a copy of the input data.
// A non-stable sort on a column of arithmetic type with no nulls will use a radix sort
// if specifying only the `thrust::less` or `thrust::greater` comparators.
// But this also requires making a copy of the input data.
auto temp_col = column(input, stream);
auto d_col = temp_col.mutable_view();
using DeviceT = device_storage_type_t<T>;
@@ -58,7 +69,7 @@ struct column_sorted_order_fn {
thrust::greater<DeviceT>());
}
}
template <typename T, typename std::enable_if_t<!cudf::is_fixed_width<T>()>* = nullptr>
template <typename T, typename std::enable_if_t<!is_radix_sort_supported<T>()>* = nullptr>
void radix_sort(column_view const&, mutable_column_view&, bool, rmm::cuda_stream_view)
{
CUDF_FAIL("Only fixed-width types are suitable for faster sorting");
@@ -83,8 +94,8 @@ struct column_sorted_order_fn {
null_order null_precedence,
rmm::cuda_stream_view stream)
{
// column with nulls or non-fixed-width column will also use a comparator
if (input.has_nulls() || !cudf::is_fixed_width<T>()) {
// column with nulls or non-supported types will also use a comparator
if (input.has_nulls() || !is_radix_sort_supported<T>()) {
auto keys = column_device_view::create(input, stream);
thrust::sort(rmm::exec_policy(stream),
indices.begin<size_type>(),
22 changes: 22 additions & 0 deletions cpp/tests/table/row_operators_tests.cpp
@@ -65,3 +65,25 @@ TEST_F(RowOperatorTestForNAN, NANSorting)

CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected2, got2->view());
}

TEST_F(RowOperatorTestForNAN, NANSortingNonNull)
{
cudf::test::fixed_width_column_wrapper<double> input{
{0.,
double(NAN),
-1.,
7.,
std::numeric_limits<double>::infinity(),
1.,
-1 * std::numeric_limits<double>::infinity()}};

cudf::table_view input_table{{input}};

auto result = cudf::sorted_order(input_table, {cudf::order::ASCENDING});
cudf::test::fixed_width_column_wrapper<int32_t> expected_asc{{6, 2, 0, 5, 3, 4, 1}};
CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_asc, result->view());

result = cudf::sorted_order(input_table, {cudf::order::DESCENDING});
cudf::test::fixed_width_column_wrapper<int32_t> expected_desc{{1, 4, 3, 5, 0, 2, 6}};
CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_desc, result->view());
}
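
The ordering this new test expects, NaN after +infinity in ascending order and first in descending order, matches how NumPy places NaN in argsort, so the same permutations can be reproduced on the host. A minimal sketch, independent of libcudf:

import numpy as np

# Same values as the NANSortingNonNull test above.
values = np.array([0.0, np.nan, -1.0, 7.0, np.inf, 1.0, -np.inf])

asc = np.argsort(values, kind="stable")  # NumPy sorts NaN after +inf
desc = asc[::-1]  # values are distinct, so reversing gives a valid descending order

print(asc.tolist())   # [6, 2, 0, 5, 3, 4, 1]  -> matches expected_asc
print(desc.tolist())  # [1, 4, 3, 5, 0, 2, 6]  -> matches expected_desc
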
6 changes: 5 additions & 1 deletion python/cudf/cudf/_fuzz_testing/parquet.py
@@ -1,4 +1,4 @@
# Copyright (c) 2020, NVIDIA CORPORATION.
# Copyright (c) 2020-2021, NVIDIA CORPORATION.


import logging
@@ -96,6 +96,10 @@ def set_rand_params(self, params):
params_dict[param] = list(
np.unique(np.random.choice(self._df.columns, col_size))
)
elif param in ("skiprows", "num_rows"):
params_dict[param] = np.random.choice(
[None, self._rand(len(self._df))]
)
else:
params_dict[param] = np.random.choice(values)
self._current_params["test_kwargs"] = self.process_kwargs(params_dict)
15 changes: 13 additions & 2 deletions python/cudf/cudf/_fuzz_testing/tests/fuzz_test_parquet.py
@@ -1,4 +1,4 @@
# Copyright (c) 2020, NVIDIA CORPORATION.
# Copyright (c) 2020-2021, NVIDIA CORPORATION.

import sys

@@ -28,18 +28,29 @@ def parquet_reader_test(parquet_buffer):
params={
"columns": ALL_POSSIBLE_VALUES,
"use_pandas_metadata": [True, False],
"skiprows": ALL_POSSIBLE_VALUES,
"num_rows": ALL_POSSIBLE_VALUES,
},
)
def parquet_reader_columns(parquet_buffer, columns, use_pandas_metadata):
def parquet_reader_columns(
parquet_buffer, columns, use_pandas_metadata, skiprows, num_rows
):
pdf = pd.read_parquet(
parquet_buffer,
columns=columns,
use_pandas_metadata=use_pandas_metadata,
)

pdf = pdf.iloc[skiprows:]
if num_rows is not None:
pdf = pdf.head(num_rows)

gdf = cudf.read_parquet(
parquet_buffer,
columns=columns,
use_pandas_metadata=use_pandas_metadata,
skiprows=skiprows,
num_rows=num_rows,
)

compare_dataframe(gdf, pdf)
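
pandas' read_parquet has no skiprows or num_rows options, so the test emulates them with iloc and head after a full read, as shown above, before comparing against cudf's native reader options. A minimal, self-contained sketch of that comparison on hypothetical toy data (not part of the fuzz harness):

import io

import pandas as pd

import cudf

buf = io.BytesIO()
pd.DataFrame({"a": range(10)}).to_parquet(buf, index=False)

# pandas side: read everything, then emulate skiprows/num_rows by slicing.
pdf = pd.read_parquet(buf).iloc[2:].head(3)

# cudf side: pass the reader options directly.
gdf = cudf.read_parquet(buf, skiprows=2, num_rows=3)

# Both should contain rows 2, 3 and 4.
assert gdf["a"].to_pandas().tolist() == pdf["a"].tolist()
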
4 changes: 4 additions & 0 deletions python/cudf/cudf/_lib/__init__.py
@@ -10,13 +10,16 @@
datetime,
filling,
gpuarrow,
groupby,
hash,
interop,
join,
json,
merge,
null_mask,
nvtext,
orc,
parquet,
partitioning,
quantiles,
reduce,
Expand All @@ -27,6 +30,7 @@
search,
sort,
stream_compaction,
string_casting,
strings,
table,
transpose,
124 changes: 124 additions & 0 deletions python/cudf/cudf/_lib/column.pyi
@@ -0,0 +1,124 @@
# Copyright (c) 2021, NVIDIA CORPORATION.

from __future__ import annotations
from typing import Tuple, Union, TypeVar, Optional

from cudf._typing import DtypeObj, Dtype, ScalarLike
from cudf.core.buffer import Buffer
from cudf.core.column import ColumnBase


T = TypeVar("T")

class Column:
    _data: Optional[Buffer]
    _mask: Optional[Buffer]
    _base_data: Optional[Buffer]
    _base_mask: Optional[Buffer]
    _dtype: DtypeObj
    _offset: int
    _null_count: int
    _children: Tuple[ColumnBase, ...]
    _base_children: Tuple[ColumnBase, ...]

    def __init__(
        self,
        data: Optional[Buffer],
        dtype: Dtype,
        size: int = None,
        mask: Optional[Buffer] = None,
        offset: int = None,
        null_count: int = None,
        children: Tuple[ColumnBase, ...] = (),
    ) -> None:
        ...

    @property
    def base_size(self) -> int:
        ...

    @property
    def dtype(self) -> DtypeObj:
        ...

    @property
    def size(self) -> int:
        ...

    @property
    def base_data(self) -> Optional[Buffer]:
        ...

    @property
    def base_data_ptr(self) -> int:
        ...

    @property
    def data(self) -> Optional[Buffer]:
        ...

    @property
    def data_ptr(self) -> int:
        ...

    def set_base_data(self, value: Buffer) -> None:
        ...

    @property
    def nullable(self) -> bool:
        ...

    @property
    def has_nulls(self) -> bool:
        ...

    @property
    def base_mask(self) -> Optional[Buffer]:
        ...

    @property
    def base_mask_ptr(self) -> int:
        ...

    @property
    def mask(self) -> Optional[Buffer]:
        ...

    @property
    def mask_ptr(self) -> int:
        ...

    def set_base_mask(self, value: Optional[Buffer]) -> None:
        ...

    def set_mask(self: T, value: Optional[Buffer]) -> T:
        ...

    @property
    def null_count(self) -> int:
        ...

    @property
    def offset(self) -> int:
        ...

    @property
    def base_children(self) -> Tuple[ColumnBase, ...]:
        ...

    @property
    def children(self) -> Tuple[ColumnBase, ...]:
        ...

    def set_base_children(self, value: Tuple[ColumnBase, ...]) -> None:
        ...

    def _mimic_inplace(self, other_col: ColumnBase, inplace=False) -> Optional[ColumnBase]:
        ...

    @staticmethod
    def from_scalar(
        val: ScalarLike,
        size: int
    ) -> ColumnBase:  # TODO: This should be Scalar, not ScalarLike
        ...
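
Because the compiled Cython extension carries no annotations of its own, this stub is what lets mypy reason about Column attributes in pure-Python callers. A hypothetical snippet (not part of the commit) showing the Optional narrowing the stub enables; that Buffer.size holds the buffer's byte count is an assumption here:

from cudf._lib.column import Column


def mask_bytes(col: Column) -> int:
    mask = col.base_mask  # mypy infers Optional[Buffer] from the stub
    if mask is None:  # narrowing is required before the buffer is used
        return 0
    return mask.size  # OK: mask is now known to be a Buffer
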