Merge branch 'branch-0.18' of github.com:rapidsai/cudf into avro-tests
cwharris committed Jan 27, 2021
2 parents 2e47499 + fc40c52 commit 8f1f842
Showing 42 changed files with 1,712 additions and 904 deletions.
9 changes: 9 additions & 0 deletions .pre-commit-config.yaml
@@ -32,6 +32,15 @@ repos:
        language: system
        files: \.(cu|cuh|h|hpp|cpp|inl)$
        args: ['-fallback-style=none']
  - repo: local
    hooks:
      - id: mypy
        name: mypy
        description: mypy
        pass_filenames: false
        entry: mypy --config-file=python/cudf/setup.cfg python/cudf/cudf
        language: system
        types: [python]

default_language_version:
  python: python3
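
The new hook runs the same command as the CI check added in ci/checks/style.sh below (mypy --config-file=python/cudf/setup.cfg python/cudf/cudf). As a minimal, hypothetical sketch of what this static check catches, not code from the cudf tree, mypy rejects a function whose annotated return type does not cover what it can return:

from typing import Optional


def bytes_per_row(row_bytes: Optional[int]) -> int:
    # mypy reports, approximately:
    # error: Incompatible return value type (got "Optional[int]", expected "int")
    return row_bytes
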
14 changes: 13 additions & 1 deletion ci/checks/style.sh
@@ -29,6 +29,10 @@ FLAKE_RETVAL=$?
FLAKE_CYTHON=`flake8 --config=python/.flake8.cython`
FLAKE_CYTHON_RETVAL=$?

# Run mypy and get results/return code
MYPY_CUDF=`mypy --config=python/cudf/setup.cfg python/cudf/cudf`
MYPY_CUDF_RETVAL=$?

# Run clang-format and check for a consistent code format
CLANG_FORMAT=`python cpp/scripts/run-clang-format.py 2>&1`
CLANG_FORMAT_RETVAL=$?
@@ -66,6 +70,14 @@ else
echo -e "\n\n>>>> PASSED: flake8-cython style check\n\n"
fi

if [ "$MYPY_CUDF_RETVAL" != "0" ]; then
echo -e "\n\n>>>> FAILED: mypy style check; begin output\n\n"
echo -e "$MYPY_CUDF"
echo -e "\n\n>>>> FAILED: mypy style check; end output\n\n"
else
echo -e "\n\n>>>> PASSED: mypy style check\n\n"
fi

if [ "$CLANG_FORMAT_RETVAL" != "0" ]; then
echo -e "\n\n>>>> FAILED: clang format check; begin output\n\n"
echo -e "$CLANG_FORMAT"
@@ -79,7 +91,7 @@ HEADER_META=`ci/checks/headers_test.sh`
HEADER_META_RETVAL=$?
echo -e "$HEADER_META"

RETVALS=($ISORT_RETVAL $BLACK_RETVAL $FLAKE_RETVAL $FLAKE_CYTHON_RETVAL $CLANG_FORMAT_RETVAL $HEADER_META_RETVAL)
RETVALS=($ISORT_RETVAL $BLACK_RETVAL $FLAKE_RETVAL $FLAKE_CYTHON_RETVAL $CLANG_FORMAT_RETVAL $HEADER_META_RETVAL $MYPY_CUDF_RETVAL)
IFS=$'\n'
RETVAL=`echo "${RETVALS[*]}" | sort -nr | head -n1`

2 changes: 2 additions & 0 deletions conda/environments/cudf_dev_cuda10.1.yml
@@ -40,6 +40,8 @@ dependencies:
- flake8=3.8.3
- black=19.10
- isort=5.0.7
- mypy=0.782
- typing_extensions
- pre_commit
- dask>=2.22.0
- distributed>=2.22.0
2 changes: 2 additions & 0 deletions conda/environments/cudf_dev_cuda10.2.yml
@@ -40,6 +40,8 @@ dependencies:
- flake8=3.8.3
- black=19.10
- isort=5.0.7
- mypy=0.782
- typing_extensions
- pre_commit
- dask>=2.22.0
- distributed>=2.22.0
2 changes: 2 additions & 0 deletions conda/environments/cudf_dev_cuda11.0.yml
@@ -40,6 +40,8 @@ dependencies:
- flake8=3.8.3
- black=19.10
- isort=5.0.7
- mypy=0.782
- typing_extensions
- pre_commit
- dask>=2.22.0
- distributed>=2.22.0
1 change: 1 addition & 0 deletions conda/recipes/cudf/meta.yaml
@@ -34,6 +34,7 @@ requirements:
run:
- protobuf
- python
- typing_extensions
- pandas >=1.0,<1.2.0dev0
- cupy >7.1.0,<9.0.0a0
- numba >=0.49.0
9 changes: 4 additions & 5 deletions cpp/benchmarks/io/parquet/parquet_writer_benchmark.cpp
@@ -63,9 +63,9 @@ void BM_parq_write_varying_inout(benchmark::State& state)

void BM_parq_write_varying_options(benchmark::State& state)
{
auto const compression = static_cast<cudf::io::compression_type>(state.range(0));
auto const enable_stats = static_cast<cudf::io::statistics_freq>(state.range(1));
auto const output_metadata = state.range(2) != 0;
auto const compression = static_cast<cudf::io::compression_type>(state.range(0));
auto const enable_stats = static_cast<cudf::io::statistics_freq>(state.range(1));
auto const file_path = state.range(2) != 0 ? "unused_path.parquet" : "";

auto const data_types = get_type_or_group({int32_t(type_group_id::INTEGRAL_SIGNED),
int32_t(type_group_id::FLOATING_POINT),
@@ -82,8 +82,7 @@ void BM_parq_write_varying_options(benchmark::State& state)
cudf_io::parquet_writer_options::builder(source_sink.make_sink_info(), view)
.compression(compression)
.stats_level(enable_stats)
.return_filemetadata(output_metadata)
.column_chunks_file_path("dummy_path.parquet");
.column_chunks_file_path(file_path);
cudf_io::write_parquet(options);
}

25 changes: 18 additions & 7 deletions cpp/src/sort/sort_column.cu
@@ -24,6 +24,17 @@ namespace {
* @brief Type-dispatched functor for sorting a single column.
*/
struct column_sorted_order_fn {
/**
* @brief Compile time check for allowing radix sort for column type.
*
* Floating point is removed here for special handling of NaNs.
*/
template <typename T>
static constexpr bool is_radix_sort_supported()
{
return cudf::is_fixed_width<T>() && !cudf::is_floating_point<T>();
}

/**
* @brief Sorts fixed-width columns using faster thrust sort.
*
@@ -32,15 +43,15 @@ struct column_sorted_order_fn {
* @param ascending True if sort order is ascending
* @param stream CUDA stream used for device memory operations and kernel launches
*/
template <typename T, typename std::enable_if_t<cudf::is_fixed_width<T>()>* = nullptr>
template <typename T, typename std::enable_if_t<is_radix_sort_supported<T>()>* = nullptr>
void radix_sort(column_view const& input,
mutable_column_view& indices,
bool ascending,
rmm::cuda_stream_view stream)
{
// A non-stable sort on a fixed-width column with no nulls will use a radix sort
// if using only the thrust::less or thrust::greater comparators but also
// requires making a copy of the input data.
// A non-stable sort on a column of arithmetic type with no nulls will use a radix sort
// if specifying only the `thrust::less` or `thrust::greater` comparators.
// But this also requires making a copy of the input data.
auto temp_col = column(input, stream);
auto d_col = temp_col.mutable_view();
using DeviceT = device_storage_type_t<T>;
@@ -58,7 +69,7 @@ struct column_sorted_order_fn {
thrust::greater<DeviceT>());
}
}
template <typename T, typename std::enable_if_t<!cudf::is_fixed_width<T>()>* = nullptr>
template <typename T, typename std::enable_if_t<!is_radix_sort_supported<T>()>* = nullptr>
void radix_sort(column_view const&, mutable_column_view&, bool, rmm::cuda_stream_view)
{
CUDF_FAIL("Only fixed-width types are suitable for faster sorting");
@@ -83,8 +94,8 @@ struct column_sorted_order_fn {
null_order null_precedence,
rmm::cuda_stream_view stream)
{
// column with nulls or non-fixed-width column will also use a comparator
if (input.has_nulls() || !cudf::is_fixed_width<T>()) {
// column with nulls or non-supported types will also use a comparator
if (input.has_nulls() || !is_radix_sort_supported<T>()) {
auto keys = column_device_view::create(input, stream);
thrust::sort(rmm::exec_policy(stream),
indices.begin<size_type>(),
22 changes: 22 additions & 0 deletions cpp/tests/table/row_operators_tests.cpp
@@ -65,3 +65,25 @@ TEST_F(RowOperatorTestForNAN, NANSorting)

CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected2, got2->view());
}

TEST_F(RowOperatorTestForNAN, NANSortingNonNull)
{
cudf::test::fixed_width_column_wrapper<double> input{
{0.,
double(NAN),
-1.,
7.,
std::numeric_limits<double>::infinity(),
1.,
-1 * std::numeric_limits<double>::infinity()}};

cudf::table_view input_table{{input}};

auto result = cudf::sorted_order(input_table, {cudf::order::ASCENDING});
cudf::test::fixed_width_column_wrapper<int32_t> expected_asc{{6, 2, 0, 5, 3, 4, 1}};
CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_asc, result->view());

result = cudf::sorted_order(input_table, {cudf::order::DESCENDING});
cudf::test::fixed_width_column_wrapper<int32_t> expected_desc{{1, 4, 3, 5, 0, 2, 6}};
CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_desc, result->view());
}
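
The ordering this new test expects, NaN after +infinity in ascending order and first in descending order, matches how NumPy places NaN in argsort, so the same permutations can be reproduced on the host. A minimal sketch, independent of libcudf:

import numpy as np

# Same values as the NANSortingNonNull test above.
values = np.array([0.0, np.nan, -1.0, 7.0, np.inf, 1.0, -np.inf])

asc = np.argsort(values, kind="stable")  # NumPy sorts NaN after +inf
desc = asc[::-1]  # values are distinct, so reversing gives a valid descending order

print(asc.tolist())   # [6, 2, 0, 5, 3, 4, 1]  -> matches expected_asc
print(desc.tolist())  # [1, 4, 3, 5, 0, 2, 6]  -> matches expected_desc
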
6 changes: 5 additions & 1 deletion python/cudf/cudf/_fuzz_testing/parquet.py
@@ -1,4 +1,4 @@
# Copyright (c) 2020, NVIDIA CORPORATION.
# Copyright (c) 2020-2021, NVIDIA CORPORATION.


import logging
@@ -96,6 +96,10 @@ def set_rand_params(self, params):
params_dict[param] = list(
np.unique(np.random.choice(self._df.columns, col_size))
)
elif param in ("skiprows", "num_rows"):
params_dict[param] = np.random.choice(
[None, self._rand(len(self._df))]
)
else:
params_dict[param] = np.random.choice(values)
self._current_params["test_kwargs"] = self.process_kwargs(params_dict)
15 changes: 13 additions & 2 deletions python/cudf/cudf/_fuzz_testing/tests/fuzz_test_parquet.py
@@ -1,4 +1,4 @@
# Copyright (c) 2020, NVIDIA CORPORATION.
# Copyright (c) 2020-2021, NVIDIA CORPORATION.

import sys

@@ -28,18 +28,29 @@ def parquet_reader_test(parquet_buffer):
params={
"columns": ALL_POSSIBLE_VALUES,
"use_pandas_metadata": [True, False],
"skiprows": ALL_POSSIBLE_VALUES,
"num_rows": ALL_POSSIBLE_VALUES,
},
)
def parquet_reader_columns(parquet_buffer, columns, use_pandas_metadata):
def parquet_reader_columns(
parquet_buffer, columns, use_pandas_metadata, skiprows, num_rows
):
pdf = pd.read_parquet(
parquet_buffer,
columns=columns,
use_pandas_metadata=use_pandas_metadata,
)

pdf = pdf.iloc[skiprows:]
if num_rows is not None:
pdf = pdf.head(num_rows)

gdf = cudf.read_parquet(
parquet_buffer,
columns=columns,
use_pandas_metadata=use_pandas_metadata,
skiprows=skiprows,
num_rows=num_rows,
)

compare_dataframe(gdf, pdf)
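
pandas' read_parquet has no skiprows or num_rows options, so the test emulates them with iloc and head after a full read, as shown above, before comparing against cudf's native reader options. A minimal, self-contained sketch of that comparison on hypothetical toy data (not part of the fuzz harness):

import io

import pandas as pd

import cudf

buf = io.BytesIO()
pd.DataFrame({"a": range(10)}).to_parquet(buf, index=False)

# pandas side: read everything, then emulate skiprows/num_rows by slicing.
pdf = pd.read_parquet(buf).iloc[2:].head(3)

# cudf side: pass the reader options directly.
gdf = cudf.read_parquet(buf, skiprows=2, num_rows=3)

# Both should contain rows 2, 3 and 4.
assert gdf["a"].to_pandas().tolist() == pdf["a"].tolist()
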
4 changes: 4 additions & 0 deletions python/cudf/cudf/_lib/__init__.py
@@ -10,13 +10,16 @@
datetime,
filling,
gpuarrow,
groupby,
hash,
interop,
join,
json,
merge,
null_mask,
nvtext,
orc,
parquet,
partitioning,
quantiles,
reduce,
Expand All @@ -27,6 +30,7 @@
search,
sort,
stream_compaction,
string_casting,
strings,
table,
transpose,
124 changes: 124 additions & 0 deletions python/cudf/cudf/_lib/column.pyi
@@ -0,0 +1,124 @@
# Copyright (c) 2021, NVIDIA CORPORATION.

from __future__ import annotations
from typing import Tuple, Union, TypeVar, Optional

from cudf._typing import DtypeObj, Dtype, ScalarLike
from cudf.core.buffer import Buffer
from cudf.core.column import ColumnBase


T = TypeVar("T")

class Column:
    _data: Optional[Buffer]
    _mask: Optional[Buffer]
    _base_data: Optional[Buffer]
    _base_mask: Optional[Buffer]
    _dtype: DtypeObj
    _offset: int
    _null_count: int
    _children: Tuple[ColumnBase, ...]
    _base_children: Tuple[ColumnBase, ...]

    def __init__(
        self,
        data: Optional[Buffer],
        dtype: Dtype,
        size: int = None,
        mask: Optional[Buffer] = None,
        offset: int = None,
        null_count: int = None,
        children: Tuple[ColumnBase, ...] = (),
    ) -> None:
        ...

    @property
    def base_size(self) -> int:
        ...

    @property
    def dtype(self) -> DtypeObj:
        ...

    @property
    def size(self) -> int:
        ...

    @property
    def base_data(self) -> Optional[Buffer]:
        ...

    @property
    def base_data_ptr(self) -> int:
        ...

    @property
    def data(self) -> Optional[Buffer]:
        ...

    @property
    def data_ptr(self) -> int:
        ...

    def set_base_data(self, value: Buffer) -> None:
        ...

    @property
    def nullable(self) -> bool:
        ...

    @property
    def has_nulls(self) -> bool:
        ...

    @property
    def base_mask(self) -> Optional[Buffer]:
        ...

    @property
    def base_mask_ptr(self) -> int:
        ...

    @property
    def mask(self) -> Optional[Buffer]:
        ...

    @property
    def mask_ptr(self) -> int:
        ...

    def set_base_mask(self, value: Optional[Buffer]) -> None:
        ...

    def set_mask(self: T, value: Optional[Buffer]) -> T:
        ...

    @property
    def null_count(self) -> int:
        ...

    @property
    def offset(self) -> int:
        ...

    @property
    def base_children(self) -> Tuple[ColumnBase, ...]:
        ...

    @property
    def children(self) -> Tuple[ColumnBase, ...]:
        ...

    def set_base_children(self, value: Tuple[ColumnBase, ...]) -> None:
        ...

    def _mimic_inplace(self, other_col: ColumnBase, inplace=False) -> Optional[ColumnBase]:
        ...

    @staticmethod
    def from_scalar(
        val: ScalarLike,
        size: int
    ) -> ColumnBase:  # TODO: This should be Scalar, not ScalarLike
        ...
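
Because the compiled Cython extension carries no annotations of its own, this stub is what lets mypy reason about Column attributes in pure-Python callers. A hypothetical snippet (not part of the commit) showing the Optional narrowing the stub enables; that Buffer.size holds the buffer's byte count is an assumption here:

from cudf._lib.column import Column


def mask_bytes(col: Column) -> int:
    mask = col.base_mask  # mypy infers Optional[Buffer] from the stub
    if mask is None:  # narrowing is required before the buffer is used
        return 0
    return mask.size  # OK: mask is now known to be a Buffer
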