Skip to content

Commit

Permalink
Merge pull request #8 from rapidsai/branch-0.10
Browse files Browse the repository at this point in the history
Changes
  • Loading branch information
rgsl888prabhu authored Sep 4, 2019
2 parents 05aa735 + a6f891b commit 5f78259
Show file tree
Hide file tree
Showing 20 changed files with 373 additions and 65 deletions.
9 changes: 8 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,9 @@
- PR #2607 Add Java bindings for parsing JSON
- PR #2629 Add dropna= parameter to groupby
- PR #2585 ORC & Parquet Readers: Remove millisecond timestamp restriction
- PR #2559 Add Series.tolist()
- PR #2653 Add Java bindings for rolling window operations
- PR #2674 Add __contains__ for Index/Series/Column

## Improvements

Expand All @@ -16,7 +18,8 @@
- PR #2648 Cython/Python reorg
- PR #2588 Update Series.append documentation
- PR #2632 Replace dask-cudf set_index code with upstream

- PR #2673 Add support for np.longlong type
- PR #2703 Move dask serialization dispatch into cudf

## Bug Fixes

Expand All @@ -37,6 +40,10 @@
- PR #2669 AVRO reader: fix non-deterministic output
- PR #2668 Update Java bindings to specify timestamp units for ORC and Parquet readers
- PR #2679 AVRO reader: fix cuda errors when decoding compressed streams
- PR #2651 Remove nvidia driver installation from ci/cpu/build.sh
- PR #2697 Ensure csv reader sets datetime column time units
- PR #2698 Return RangeIndex from contiguous slice of RangeIndex
- PR #2672 Fix null and integer handling in round


# cuDF 0.9.0 (Date TBD)
Expand Down
15 changes: 0 additions & 15 deletions ci/cpu/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -43,21 +43,6 @@ conda list
# FIX Added to deal with Anaconda SSL verification issues during conda builds
conda config --set ssl_verify False

################################################################################
# INSTALL - Install NVIDIA driver
################################################################################

logger "Install NVIDIA driver for CUDA $CUDA..."
apt-get update -q
DRIVER_VER="396.44-1"
LIBCUDA_VER="396"
if [ "$CUDA" == "10.0" ]; then
DRIVER_VER="410.72-1"
LIBCUDA_VER="410"
fi
DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
cuda-drivers=${DRIVER_VER} libcuda1-${LIBCUDA_VER}

################################################################################
# BUILD - Conda package builds (conda deps: libcudf <- libcudf_cffi <- cudf)
################################################################################
Expand Down
6 changes: 5 additions & 1 deletion cpp/src/io/csv/csv_reader_impl.cu
Original file line number Diff line number Diff line change
Expand Up @@ -630,8 +630,12 @@ table reader::Impl::read()
std::vector<gdf_column_wrapper> columns;
for (int col = 0, active_col = 0; col < num_actual_cols; ++col) {
if (h_column_flags[col] & column_parse::enabled) {
auto time_unit = TIME_UNIT_NONE;
if (dtypes[active_col] == GDF_DATE64 || dtypes[active_col] == GDF_TIMESTAMP) {
time_unit = TIME_UNIT_ms;
}
columns.emplace_back(num_records, dtypes[active_col],
gdf_dtype_extra_info{TIME_UNIT_NONE},
gdf_dtype_extra_info{time_unit},
col_names[col]);
CUDF_EXPECTS(columns.back().allocate() == GDF_SUCCESS, "Cannot allocate columns");
active_col++;
Expand Down
33 changes: 33 additions & 0 deletions cpp/tests/io/csv/csv_test.cu
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
*/

#include <cudf/cudf.h>
#include <cudf/unary.hpp>
#include <nvstrings/NVStrings.h>

#include <gtest/gtest.h>
Expand Down Expand Up @@ -306,6 +307,38 @@ TEST(gdf_csv_test, Dates)
}
}

// Checks that a CSV column read with dtype "timestamp" is reported as
// GDF_TIMESTAMP with a millisecond time unit, and that casting that column to
// a microsecond timestamp yields a column carrying the requested unit.
TEST(gdf_csv_test, Timestamps)
{
  const std::string fname = temp_env->get_temp_dir()+"CsvTimestamps.csv";

  // Write the single-row input file consumed by the reader below.
  std::ofstream outfile(fname, std::ofstream::out);
  outfile << "true,334.0,2014-02-01T12:30:23.000-06:00\n";
  outfile.close();
  ASSERT_TRUE( checkFile(fname) );

  {
    cudf::csv_read_arg args(cudf::source_info{fname});
    args.names = { "A" };
    args.dtype = { "timestamp" };
    args.dayfirst = true;
    args.header = -1;
    const auto df = cudf::read_csv(args);

    EXPECT_EQ( df.num_columns(), static_cast<int>(args.names.size()) );
    ASSERT_EQ( df.get_column(0)->dtype, GDF_TIMESTAMP );
    // The reader must tag timestamp columns with an explicit time unit.
    ASSERT_EQ( df.get_column(0)->dtype_info.time_unit, TIME_UNIT_ms );

    // Casting relies on the source time unit being set; request microseconds
    // and verify the result carries them.
    gdf_dtype_extra_info info{};
    info.time_unit = TIME_UNIT_us;
    // NOTE(review): `output` is returned by value and is not released in this
    // test - confirm whether its buffers should be freed here.
    gdf_column output = cudf::cast(*df.get_column(0), GDF_TIMESTAMP, info);
    ASSERT_EQ( output.dtype, GDF_TIMESTAMP );
    ASSERT_EQ( output.dtype_info.time_unit, TIME_UNIT_us );
  }
}

TEST(gdf_csv_test, FloatingPoint)
{
const std::string fname = temp_env->get_temp_dir()+"CsvFloatingPoint.csv";
Expand Down
3 changes: 2 additions & 1 deletion python/cudf/cudf/_lib/cudf.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ dtypes = {
np.float64: GDF_FLOAT64,
np.float32: GDF_FLOAT32,
np.int64: GDF_INT64,
np.longlong: GDF_INT64,
np.int32: GDF_INT32,
np.int16: GDF_INT16,
np.int8: GDF_INT8,
Expand Down Expand Up @@ -221,7 +222,7 @@ cdef set_scalar_value(gdf_scalar *scalar, val):
scalar.data.fp64 = val
elif val.dtype.type == np.float32:
scalar.data.fp32 = val
elif val.dtype.type == np.int64:
elif val.dtype.type == np.int64 or val.dtype.type == np.longlong:
scalar.data.si64 = val
elif val.dtype.type == np.int32:
scalar.data.si32 = val
Expand Down
55 changes: 38 additions & 17 deletions python/cudf/cudf/comm/serialize.py
Original file line number Diff line number Diff line change
@@ -1,23 +1,44 @@
import functools
import pickle

import cudf
import cudf.core.groupby.groupby

def register_distributed_serializer(cls):
try:
from distributed.protocol.cuda import cuda_serialize, cuda_deserialize
from distributed.protocol import serialize, deserialize
try:
from distributed.protocol.cuda import cuda_deserialize, cuda_serialize
from distributed.utils import log_errors

serialize_part = functools.partial(
serialize, serializers=["cuda", "dask", "pickle"]
)
deserialize_part = functools.partial(
deserialize, deserializers=["cuda", "dask", "pickle"]
# all (de-)serialization is attached to cudf objects:
# Series/DataFrame/Index/Column/Buffer/etc
@cuda_serialize.register(
(
cudf.DataFrame,
cudf.Series,
cudf.core.series.Series,
cudf.core.groupby.groupby._Groupby,
cudf.core.column.column.Column,
)
)
def serialize_cudf_dataframe(x):
with log_errors():
header, frames = x.serialize()
return header, frames

cuda_serialize.register(cls)(
functools.partial(cls.serialize, serialize=serialize_part)
)
cuda_deserialize.register(cls)(
functools.partial(cls.deserialize, deserialize_part)
@cuda_deserialize.register(
(
cudf.DataFrame,
cudf.Series,
cudf.core.series.Series,
cudf.core.groupby.groupby._Groupby,
cudf.core.column.column.Column,
)
except ImportError:
pass
)
def deserialize_cudf_dataframe(header, frames):
with log_errors():
cudf_typ = pickle.loads(header["type"])
cudf_obj = cudf_typ.deserialize(header, frames)
return cudf_obj


except ImportError:
# distributed is probably not installed on the system
pass
3 changes: 3 additions & 0 deletions python/cudf/cudf/core/column/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -238,6 +238,9 @@ def __init__(self, **kwargs):
self._categories = categories
self._ordered = ordered

def __contains__(self, item):
    """
    Returns True if column contains item, else False.

    The item is first translated with ``self._encode`` and the encoded
    value is then looked up in ``self.as_numerical``.
    """
    encoded_item = self._encode(item)
    numerical = self.as_numerical
    return encoded_item in numerical

def serialize(self):
header, frames = super(CategoricalColumn, self).serialize()
header["ordered"] = self._ordered
Expand Down
8 changes: 8 additions & 0 deletions python/cudf/cudf/core/column/datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,14 @@ def __init__(self, **kwargs):
assert self.dtype.type is np.datetime64
self._time_unit, _ = np.datetime_data(self.dtype)

def __contains__(self, item):
    """
    Returns True if column contains item, else False.

    Items that cannot be converted to ``np.datetime64`` in this column's
    time unit are reported as absent instead of raising.
    """
    try:
        as_datetime = np.datetime64(item, self._time_unit)
    except Exception:
        # Improper item types can never be present in the column
        return False
    numeric_item = as_datetime.astype("int_")
    return numeric_item in self.as_numerical

def serialize(self):
header, frames = super(DatetimeColumn, self).serialize()
header["type"] = pickle.dumps(type(self))
Expand Down
36 changes: 27 additions & 9 deletions python/cudf/cudf/core/column/numerical.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,14 +44,25 @@ def __contains__(self, item):
"""
Returns True if column contains item, else False.
"""
item_found = False
# Handles improper item types
# The cast check fails if item is None, hence the exception handler.
try:
if self.find_first_value(item):
item_found = True
except ValueError:
"""This means value not found"""

return item_found
if np.can_cast(item, self.data.mem.dtype):
item = self.data.mem.dtype.type(item)
else:
return False
except Exception:
return False
# Issue with cudautils with bool array, always returns True.
if self.data.mem.dtype == np.bool:
return (
cudautils.find_first(
self.data.mem.view("int8"), item.view("int8")
)
!= -1
)
else:
return cudautils.find_first(self.data.mem, item) != -1

def replace(self, **kwargs):
if "data" in kwargs and "dtype" not in kwargs:
Expand Down Expand Up @@ -86,10 +97,10 @@ def binary_operator(self, binop, rhs, reflect=False):
if isinstance(rhs, NumericalColumn) or np.isscalar(rhs):
out_dtype = np.result_type(self.dtype, rhs.dtype)
if binop in ["mod", "floordiv"]:
if (
if (tmp.dtype in int_dtypes) and (
(np.isscalar(tmp) and (0 == tmp))
or ((isinstance(tmp, NumericalColumn)) and (0.0 in tmp))
) and (tmp.dtype in int_dtypes):
):
out_dtype = np.dtype("float_")
return _numeric_column_binop(
lhs=self,
Expand Down Expand Up @@ -254,6 +265,13 @@ def sum_of_squares(self, dtype=None):
return libcudf.reduce.reduce("sum_of_squares", self, dtype=dtype)

def round(self, decimals=0):
    """
    Round the column to ``decimals`` decimal places.

    Parameters
    ----------
    decimals : int, default 0
        Number of decimal places; negative values are rejected.

    Returns
    -------
    The column itself when its dtype is an integer type, otherwise the
    result of ``self.replace`` with the rounded data.

    Raises
    ------
    NotImplementedError
        If ``decimals`` is negative.
    """
    if decimals < 0:
        raise NotImplementedError("Decimal values < 0 are not yet supported.")

    is_integer_column = np.issubdtype(self.dtype, np.integer)
    if is_integer_column:
        # Integer data is returned as-is, without calling apply_round
        return self

    rounded = cudautils.apply_round(self.data.mem, decimals)
    return self.replace(data=Buffer(rounded))

Expand Down
3 changes: 3 additions & 0 deletions python/cudf/cudf/core/column/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -463,6 +463,9 @@ def __init__(self, data, null_count=None, name=None, **kwargs):
self._nvcategory = None
self._indices = None

def __contains__(self, item):
    """
    Returns True if column contains item, else False.

    The lookup is an anchored regular-expression match, so the item is
    escaped first: regex metacharacters in the item must compare
    literally (e.g. "a.b" must not match "axb").
    """
    import re

    # Formatting a non-string into the pattern would match on its text
    # representation (1 would be found in ["1"]), so reject it up front.
    if not isinstance(item, str):
        return False
    pattern = f"^{re.escape(item)}$"
    return True in self.str().contains(pattern)._column

def __reduce__(self):
cpumem = self.to_arrow()
return column.as_column, (cpumem, False, np.dtype("object"))
Expand Down
4 changes: 0 additions & 4 deletions python/cudf/cudf/core/groupby/legacy_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@

import cudf
import cudf._lib as libcudf
from cudf.comm.serialize import register_distributed_serializer
from cudf.core.series import Series


Expand Down Expand Up @@ -530,6 +529,3 @@ def rolling_avg(val, avg):
df, segs = self.as_df()
kwargs.update({"chunks": segs})
return df.apply_chunks(function, **kwargs)


register_distributed_serializer(Groupby)
22 changes: 20 additions & 2 deletions python/cudf/cudf/core/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,9 @@ def serialize(self):
header["frame_count"] = len(frames)
return header, frames

def __contains__(self, item):
    """
    Returns True if item is found in ``self._values``, else False.
    """
    values = self._values
    return item in values

@classmethod
def deserialize(cls, header, frames):
"""
Expand Down Expand Up @@ -383,6 +386,12 @@ def __init__(self, start, stop=None, name=None):
self.name = name
self._cached_values = None

def __contains__(self, item):
    """
    Returns True if item is a member of the range, else False.

    An item is a member when it lies in ``[self._start, self._stop)`` and
    has an integral value. Items that cannot be ordered against the
    bounds (e.g. strings or None) are reported as absent instead of
    raising.
    """
    try:
        if not (self._start <= item < self._stop):
            return False
        # The range only holds integers: 1.5 lies within the bounds but
        # is not an element, whereas 1.0 compares equal to the element 1.
        return bool(item == int(item))
    except (TypeError, ValueError, OverflowError):
        # Non-numeric or otherwise unconvertible items cannot be members
        return False

def copy(self, deep=True):
if deep:
result = deepcopy(self)
Expand Down Expand Up @@ -418,6 +427,8 @@ def __getitem__(self, index):
stop += self._start
if sln == 0:
return RangeIndex(0)
elif step == 1:
return RangeIndex(start, stop)
else:
return index_from_range(start, stop, step)

Expand Down Expand Up @@ -550,8 +561,15 @@ def is_monotonic_decreasing(self):
return self._start >= self._stop

def get_slice_bound(self, label, side, kind):
    """
    Return the slice bound position that corresponds to ``label``.

    Parameters
    ----------
    label : value to locate, compared against the range bounds
    side : {"left", "right"}
        "left" yields the position of ``label`` itself, "right" the
        position just past it.
    kind : not used by this implementation

    Returns
    -------
    int
        0 when ``label`` is below the start of the range, ``len(self)``
        when it is at or beyond the stop, otherwise the position
        relative to ``self._start`` (plus one for the right side).

    Raises
    ------
    ValueError
        If ``side`` is neither "left" nor "right".
    """
    # Reject bad input explicitly rather than falling through to None
    if side not in ("left", "right"):
        raise ValueError(
            "Invalid value for side, must be 'left' or 'right': "
            "{!r}".format(side)
        )

    # Labels outside the range clamp to the ends
    if label < self._start:
        return 0
    if label >= self._stop:
        return len(self)

    offset = label - self._start
    return offset if side == "left" else offset + 1

@property
def __cuda_array_interface__(self):
Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/core/multiindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -226,7 +226,7 @@ def _compute_validity_mask(self, index, row_tuple, max_length):
for idx, row in enumerate(row_tuple):
if row == slice(None):
continue
if row not in index.levels[idx]:
if row not in index.levels[idx]._column:
raise KeyError(row)
return result

Expand Down
14 changes: 14 additions & 0 deletions python/cudf/cudf/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,9 @@ def __init__(
self._index = RangeIndex(len(data)) if index is None else index
self._name = name

def __contains__(self, item):
    """
    Returns True if item is found in this Series' index, else False.

    Membership is tested against ``self._index``, not against the data.
    """
    index = self._index
    return item in index

@classmethod
def from_pandas(cls, s, nan_as_null=True):
return cls(s, nan_as_null=nan_as_null)
Expand Down Expand Up @@ -446,6 +449,16 @@ def values_to_string(self, nrows=None):
out = ["" if v is None else str(v) for v in values]
return out

def tolist(self):
    """
    Return the series data as a Python list.

    The series is converted with ``to_arrow`` and the resulting object's
    ``to_pylist`` output is returned.

    Returns
    -------
    list
    """
    arrow_data = self.to_arrow()
    return arrow_data.to_pylist()

def head(self, n=5):
return self.iloc[:n]

Expand Down Expand Up @@ -1815,6 +1828,7 @@ def round(self, decimals=0):
self._column.round(decimals=decimals),
name=self.name,
index=self.index,
dtype=self.dtype,
)

def isin(self, test):
Expand Down
Loading

0 comments on commit 5f78259

Please sign in to comment.