Merge pull request #8 from rapidsai/branch-0.10

Changes
rapidsai · Sep 4, 2019 · 5f78259 · 5f78259
2 parents 05aa735 + a6f891b
commit 5f78259
Show file tree

Hide file tree

Showing 20 changed files with 373 additions and 65 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -6,7 +6,9 @@
 - PR #2607 Add Java bindings for parsing JSON
 - PR #2629 Add dropna= parameter to groupby
 - PR #2585 ORC & Parquet Readers: Remove millisecond timestamp restriction
+- PR #2559 Add Series.tolist()
 - PR #2653 Add Java bindings for rolling window operations
+- PR #2674 Add __contains__ for Index/Series/Column
 
 ## Improvements
 
@@ -16,7 +18,8 @@
 - PR #2648 Cython/Python reorg
 - PR #2588 Update Series.append documentation
 - PR #2632 Replace dask-cudf set_index code with upstream
-
+- PR #2673 Add support for np.longlong type
+- PR #2703 move dask serialization dispatch into cudf
 
 ## Bug Fixes
 
@@ -37,6 +40,10 @@
 - PR #2669 AVRO reader: fix non-deterministic output
 - PR #2668 Update Java bindings to specify timestamp units for ORC and Parquet readers
 - PR #2679 AVRO reader: fix cuda errors when decoding compressed streams
+- PR #2651 Remove nvidia driver installation from ci/cpu/build.sh
+- PR #2697 Ensure csv reader sets datetime column time units
+- PR #2698 Return RangeIndex from contiguous slice of RangeIndex
+- PR #2672 Fix null and integer handling in round
 
 
 # cuDF 0.9.0 (Date TBD)

diff --git a/ci/cpu/build.sh b/ci/cpu/build.sh
@@ -43,21 +43,6 @@ conda list
 # FIX Added to deal with Anaconda SSL verification issues during conda builds
 conda config --set ssl_verify False
 
-################################################################################
-# INSTALL - Install NVIDIA driver
-################################################################################
-
-logger "Install NVIDIA driver for CUDA $CUDA..."
-apt-get update -q
-DRIVER_VER="396.44-1"
-LIBCUDA_VER="396"
-if [ "$CUDA" == "10.0" ]; then
-  DRIVER_VER="410.72-1"
-  LIBCUDA_VER="410"
-fi
-DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
-  cuda-drivers=${DRIVER_VER} libcuda1-${LIBCUDA_VER}
-
 ################################################################################
 # BUILD - Conda package builds (conda deps: libcudf <- libcudf_cffi <- cudf)
 ################################################################################

diff --git a/cpp/src/io/csv/csv_reader_impl.cu b/cpp/src/io/csv/csv_reader_impl.cu
@@ -630,8 +630,12 @@ table reader::Impl::read()
   std::vector<gdf_column_wrapper> columns;
   for (int col = 0, active_col = 0; col < num_actual_cols; ++col) {
     if (h_column_flags[col] & column_parse::enabled) {
+      auto time_unit = TIME_UNIT_NONE;
+      if (dtypes[active_col] == GDF_DATE64 || dtypes[active_col] == GDF_TIMESTAMP) {
+        time_unit = TIME_UNIT_ms;
+      }
       columns.emplace_back(num_records, dtypes[active_col],
-                           gdf_dtype_extra_info{TIME_UNIT_NONE},
+                           gdf_dtype_extra_info{time_unit},
                            col_names[col]);
       CUDF_EXPECTS(columns.back().allocate() == GDF_SUCCESS, "Cannot allocate columns");
       active_col++;

diff --git a/cpp/tests/io/csv/csv_test.cu b/cpp/tests/io/csv/csv_test.cu
@@ -15,6 +15,7 @@
  */
 
 #include <cudf/cudf.h>
+#include <cudf/unary.hpp>
 #include <nvstrings/NVStrings.h>
 
 #include <gtest/gtest.h>
@@ -306,6 +307,38 @@ TEST(gdf_csv_test, Dates)
     }
 }
 
+TEST(gdf_csv_test, Timestamps)
+{
+    const std::string fname = temp_env->get_temp_dir()+"CsvTimestamps.csv";
+
+    std::ofstream outfile(fname, std::ofstream::out);
+    outfile << "true,334.0,2014-02-01T12:30:23.000-06:00\n";
+    outfile.close();
+    ASSERT_TRUE( checkFile(fname) );
+
+    {
+        cudf::csv_read_arg args(cudf::source_info{fname});
+        args.names = { "A" };
+        args.dtype = { "timestamp" };
+        args.dayfirst = true;
+        args.header = -1;
+        const auto df = cudf::read_csv(args);
+
+        EXPECT_EQ( df.num_columns(), static_cast<int>(args.names.size()) );
+        ASSERT_EQ( df.get_column(0)->dtype, GDF_TIMESTAMP );
+        ASSERT_EQ( df.get_column(0)->dtype_info.time_unit, TIME_UNIT_ms );
+        auto ACol = gdf_host_column<uint64_t>(df.get_column(0));
+        std::cerr << "Time Unit= " << df.get_column(0)->dtype_info.time_unit;
+
+        gdf_column output;
+        gdf_dtype_extra_info info{};
+        info.time_unit = TIME_UNIT_us;
+        output = cudf::cast(*df.get_column(0), GDF_TIMESTAMP, info);
+        ASSERT_EQ( output.dtype, GDF_TIMESTAMP );
+        ASSERT_EQ( output.dtype_info.time_unit, TIME_UNIT_us );
+    }
+}
+
 TEST(gdf_csv_test, FloatingPoint)
 {
     const std::string fname = temp_env->get_temp_dir()+"CsvFloatingPoint.csv";

diff --git a/python/cudf/cudf/_lib/cudf.pyx b/python/cudf/cudf/_lib/cudf.pyx
@@ -27,6 +27,7 @@ dtypes = {
     np.float64: GDF_FLOAT64,
     np.float32: GDF_FLOAT32,
     np.int64: GDF_INT64,
+    np.longlong: GDF_INT64,
     np.int32: GDF_INT32,
     np.int16: GDF_INT16,
     np.int8: GDF_INT8,
@@ -221,7 +222,7 @@ cdef set_scalar_value(gdf_scalar *scalar, val):
         scalar.data.fp64 = val
     elif val.dtype.type == np.float32:
         scalar.data.fp32 = val
-    elif val.dtype.type == np.int64:
+    elif val.dtype.type == np.int64 or val.dtype.type == np.longlong:
         scalar.data.si64 = val
     elif val.dtype.type == np.int32:
         scalar.data.si32 = val

diff --git a/python/cudf/cudf/comm/serialize.py b/python/cudf/cudf/comm/serialize.py
@@ -1,23 +1,44 @@
-import functools
+import pickle
 
+import cudf
+import cudf.core.groupby.groupby
 
-def register_distributed_serializer(cls):
-    try:
-        from distributed.protocol.cuda import cuda_serialize, cuda_deserialize
-        from distributed.protocol import serialize, deserialize
+try:
+    from distributed.protocol.cuda import cuda_deserialize, cuda_serialize
+    from distributed.utils import log_errors
 
-        serialize_part = functools.partial(
-            serialize, serializers=["cuda", "dask", "pickle"]
-        )
-        deserialize_part = functools.partial(
-            deserialize, deserializers=["cuda", "dask", "pickle"]
+    # all (de-)serialization is attached to cudf Objects:
+    # Series/DataFrame/Index/Column/Buffer/etc
+    @cuda_serialize.register(
+        (
+            cudf.DataFrame,
+            cudf.Series,
+            cudf.core.series.Series,
+            cudf.core.groupby.groupby._Groupby,
+            cudf.core.column.column.Column,
         )
+    )
+    def serialize_cudf_dataframe(x):
+        with log_errors():
+            header, frames = x.serialize()
+            return header, frames
 
-        cuda_serialize.register(cls)(
-            functools.partial(cls.serialize, serialize=serialize_part)
-        )
-        cuda_deserialize.register(cls)(
-            functools.partial(cls.deserialize, deserialize_part)
+    @cuda_deserialize.register(
+        (
+            cudf.DataFrame,
+            cudf.Series,
+            cudf.core.series.Series,
+            cudf.core.groupby.groupby._Groupby,
+            cudf.core.column.column.Column,
         )
-    except ImportError:
-        pass
+    )
+    def deserialize_cudf_dataframe(header, frames):
+        with log_errors():
+            cudf_typ = pickle.loads(header["type"])
+            cudf_obj = cudf_typ.deserialize(header, frames)
+            return cudf_obj
+
+
+except ImportError:
+    # distributed is probably not installed on the system
+    pass
diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py
@@ -238,6 +238,9 @@ def __init__(self, **kwargs):
         self._categories = categories
         self._ordered = ordered
 
+    def __contains__(self, item):
+        return self._encode(item) in self.as_numerical
+
     def serialize(self):
         header, frames = super(CategoricalColumn, self).serialize()
         header["ordered"] = self._ordered

diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py
@@ -42,6 +42,14 @@ def __init__(self, **kwargs):
         assert self.dtype.type is np.datetime64
         self._time_unit, _ = np.datetime_data(self.dtype)
 
+    def __contains__(self, item):
+        # Handles improper item types
+        try:
+            item = np.datetime64(item, self._time_unit)
+        except Exception:
+            return False
+        return item.astype("int_") in self.as_numerical
+
     def serialize(self):
         header, frames = super(DatetimeColumn, self).serialize()
         header["type"] = pickle.dumps(type(self))

diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py
@@ -44,14 +44,25 @@ def __contains__(self, item):
         """
         Returns True if column contains item, else False.
         """
-        item_found = False
+        # Handles improper item types
+        # This fails if item is None, hence the exception handler.
         try:
-            if self.find_first_value(item):
-                item_found = True
-        except ValueError:
-            """This means value not found"""
-
-        return item_found
+            if np.can_cast(item, self.data.mem.dtype):
+                item = self.data.mem.dtype.type(item)
+            else:
+                return False
+        except Exception:
+            return False
+        # Issue with cudautils with bool array, always returns True.
+        if self.data.mem.dtype == np.bool:
+            return (
+                cudautils.find_first(
+                    self.data.mem.view("int8"), item.view("int8")
+                )
+                != -1
+            )
+        else:
+            return cudautils.find_first(self.data.mem, item) != -1
 
     def replace(self, **kwargs):
         if "data" in kwargs and "dtype" not in kwargs:
@@ -86,10 +97,10 @@ def binary_operator(self, binop, rhs, reflect=False):
         if isinstance(rhs, NumericalColumn) or np.isscalar(rhs):
             out_dtype = np.result_type(self.dtype, rhs.dtype)
             if binop in ["mod", "floordiv"]:
-                if (
+                if (tmp.dtype in int_dtypes) and (
                     (np.isscalar(tmp) and (0 == tmp))
                     or ((isinstance(tmp, NumericalColumn)) and (0.0 in tmp))
-                ) and (tmp.dtype in int_dtypes):
+                ):
                     out_dtype = np.dtype("float_")
             return _numeric_column_binop(
                 lhs=self,
@@ -254,6 +265,13 @@ def sum_of_squares(self, dtype=None):
         return libcudf.reduce.reduce("sum_of_squares", self, dtype=dtype)
 
     def round(self, decimals=0):
+        if decimals < 0:
+            msg = "Decimal values < 0 are not yet supported."
+            raise NotImplementedError(msg)
+
+        if np.issubdtype(self.dtype, np.integer):
+            return self
+
         data = Buffer(cudautils.apply_round(self.data.mem, decimals))
         return self.replace(data=data)
 

diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py
@@ -463,6 +463,9 @@ def __init__(self, data, null_count=None, name=None, **kwargs):
         self._nvcategory = None
         self._indices = None
 
+    def __contains__(self, item):
+        return True in self.str().contains(f"^{item}$")._column
+
     def __reduce__(self):
         cpumem = self.to_arrow()
         return column.as_column, (cpumem, False, np.dtype("object"))

diff --git a/python/cudf/cudf/core/groupby/legacy_groupby.py b/python/cudf/cudf/core/groupby/legacy_groupby.py
@@ -10,7 +10,6 @@
 
 import cudf
 import cudf._lib as libcudf
-from cudf.comm.serialize import register_distributed_serializer
 from cudf.core.series import Series
 
 
@@ -530,6 +529,3 @@ def rolling_avg(val, avg):
         df, segs = self.as_df()
         kwargs.update({"chunks": segs})
         return df.apply_chunks(function, **kwargs)
-
-
-register_distributed_serializer(Groupby)
diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py
@@ -45,6 +45,9 @@ def serialize(self):
         header["frame_count"] = len(frames)
         return header, frames
 
+    def __contains__(self, item):
+        return item in self._values
+
     @classmethod
     def deserialize(cls, header, frames):
         """
@@ -383,6 +386,12 @@ def __init__(self, start, stop=None, name=None):
         self.name = name
         self._cached_values = None
 
+    def __contains__(self, item):
+        if self._start <= item < self._stop:
+            return True
+        else:
+            return False
+
     def copy(self, deep=True):
         if deep:
             result = deepcopy(self)
@@ -418,6 +427,8 @@ def __getitem__(self, index):
             stop += self._start
             if sln == 0:
                 return RangeIndex(0)
+            elif step == 1:
+                return RangeIndex(start, stop)
             else:
                 return index_from_range(start, stop, step)
 
@@ -550,8 +561,15 @@ def is_monotonic_decreasing(self):
         return self._start >= self._stop
 
     def get_slice_bound(self, label, side, kind):
-        # TODO: Range-specific implementation here
-        raise (NotImplementedError)
+        if label < self._start:
+            return 0
+        elif label >= self._stop:
+            return len(self)
+        else:
+            if side == "left":
+                return label - self._start
+            elif side == "right":
+                return (label - self._start) + 1
 
     @property
     def __cuda_array_interface__(self):

diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py
@@ -226,7 +226,7 @@ def _compute_validity_mask(self, index, row_tuple, max_length):
             for idx, row in enumerate(row_tuple):
                 if row == slice(None):
                     continue
-                if row not in index.levels[idx]:
+                if row not in index.levels[idx]._column:
                     raise KeyError(row)
         return result
 

diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
@@ -128,6 +128,9 @@ def __init__(
         self._index = RangeIndex(len(data)) if index is None else index
         self._name = name
 
+    def __contains__(self, item):
+        return item in self._index
+
     @classmethod
     def from_pandas(cls, s, nan_as_null=True):
         return cls(s, nan_as_null=nan_as_null)
@@ -446,6 +449,16 @@ def values_to_string(self, nrows=None):
             out = ["" if v is None else str(v) for v in values]
         return out
 
+    def tolist(self):
+        """
+        Return a list type from series data.
+
+        Returns
+        -------
+        list
+        """
+        return self.to_arrow().to_pylist()
+
     def head(self, n=5):
         return self.iloc[:n]
 
@@ -1815,6 +1828,7 @@ def round(self, decimals=0):
             self._column.round(decimals=decimals),
             name=self.name,
             index=self.index,
+            dtype=self.dtype,
         )
 
     def isin(self, test):