Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP] cuDF integration into XGBoost #3997

Closed
wants to merge 36 commits into from
Closed
Show file tree
Hide file tree
Changes from 17 commits
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
2771d31
Added single-GPU GDF (GPU DataFrame) support.
canonizer Sep 27, 2018
6b70684
Fixed the case of 0 rows for DMatrix construction.
canonizer Sep 27, 2018
ea727a6
Reverted the changes to GPU predictor.
canonizer Sep 27, 2018
87bcb8c
Added extracting column names from GDF.
canonizer Oct 9, 2018
370cda9
USE_GDF cmake option.
canonizer Oct 9, 2018
a95419e
Updated GDF cmake module.
canonizer Oct 9, 2018
c198371
Initial commit for cuDF (former GDF) support.
canonizer Nov 15, 2018
32f6d32
Renamed GDF -> cuDF wherever it is currently possible.
canonizer Nov 16, 2018
da3098c
Updated function and type names in new files.
canonizer Nov 16, 2018
6891658
More GDF->cuDF.
canonizer Nov 23, 2018
b5d882e
Lightweight header-only dependency: cudf.h->cudf/types.h.
canonizer Nov 23, 2018
8a3d9d1
Removed cuDF-specific error handling.
canonizer Nov 27, 2018
ad7ff22
FIX refactor cudf branch to match latest cudf developments
mtjrider Dec 6, 2018
00817c4
Merge branch 'master' into cudf
mtjrider Dec 6, 2018
41d2142
updating pygdf to cudf in core and tests
mtjrider Dec 13, 2018
b034242
removing trailing references to gdf.h
mtjrider Dec 13, 2018
669e1e8
addressing merge conflicts with dmlc:master
mtjrider Dec 13, 2018
28bacd4
Merge remote-tracking branch 'origin' into cudf
mtjrider Jan 18, 2019
78b2ea0
removing commented code
mtjrider Jan 18, 2019
1bdd513
removing deprecated objective function
mtjrider Jan 18, 2019
2cef479
removing redundant file
mtjrider Jan 18, 2019
a01f54e
skip test_gpu_gdf.py if cudf fails on import
mtjrider Jan 18, 2019
f0a35e4
removing redundant cmake module
mtjrider Jan 18, 2019
3194a01
removing hard imports in core.py
mtjrider Jan 18, 2019
9fe5f05
updating to dmlc/master
mtjrider Jan 30, 2019
0c796ca
updating GDF references to CUDF
mtjrider Jan 30, 2019
05650ba
Merge branch 'master' into cudf
mtjrider Jan 31, 2019
6e186ff
cleaning up files, comments, and updating naming schema to match dmlc…
mtjrider Feb 24, 2019
d4ef9af
Merge branch 'master' into cudf
Feb 24, 2019
8a6eb6a
correcting code conflict with
mtjrider Feb 25, 2019
fe521d1
Remove redundant files
RAMitchell Feb 25, 2019
f6fb047
cudf interchange format
RAMitchell Feb 26, 2019
b6dcfa6
Lint
RAMitchell Feb 26, 2019
0eee6da
Add google tests
RAMitchell Feb 27, 2019
63c9f8b
Tidy Python code
RAMitchell Feb 28, 2019
9a918a6
Merge branch 'master' into cudf
mtjrider Mar 5, 2019
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ msvc_use_static_runtime()

# Options
option(USE_CUDA "Build with GPU acceleration")
option(USE_CUDF "Build with cuDF support. Requires USE_CUDA." OFF)
option(USE_NCCL "Build using NCCL for multi-GPU. Also requires USE_CUDA")
option(JVM_BINDINGS "Build JVM bindings" OFF)
option(GOOGLE_TEST "Build google tests" OFF)
option(R_LIB "Build shared library for R package" OFF)
Expand Down Expand Up @@ -140,6 +142,12 @@ if(USE_CUDA)
add_definitions(-DXGBOOST_USE_NCCL)
endif()

if(USE_CUDF)
find_package(CUDF REQUIRED)
include_directories(${CUDF_INCLUDE_DIR})
add_definitions(-DXGBOOST_USE_CUDF)
endif()

set(GENCODE_FLAGS "")
format_gencode_flags("${GPU_COMPUTE_VER}" GENCODE_FLAGS)
message("cuda architecture flags: ${GENCODE_FLAGS}")
Expand Down
31 changes: 31 additions & 0 deletions cmake/modules/FindCUDF.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# Tries to find the cuDF headers. Only an include directory is located;
# this module does not search for a cuDF library.
#
# Usage of this module as follows:
#
# find_package(CUDF)
#
# Variables used by this module, they can change the default behaviour and need
# to be set before calling find_package:
#
# CUDF_ROOT - When set, <CUDF_ROOT>/include is added to the locations that are
# searched for the cuDF headers.
# The environment variable CUDF_ROOT is listed ahead of the CMake variable
# of the same name in the search locations.
#
# This module defines
# CUDF_FOUND, whether cuDF has been found
# CUDF_INCLUDE_DIR, directory containing cudf.h or cudf/types.h
#
# This module assumes that the user has already called find_package(CUDA)
# (CUDA_INCLUDE_DIRS is used as one of the search locations).


# Accept either cudf.h or cudf/types.h as evidence of a cuDF installation.
find_path(CUDF_INCLUDE_DIR
NAMES cudf.h cudf/types.h
PATHS $ENV{CUDF_ROOT}/include ${CUDF_ROOT}/include ${CUDA_INCLUDE_DIRS} /usr/include)

# The package counts as found once CUDF_INCLUDE_DIR has been resolved.
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(CUDF DEFAULT_MSG
CUDF_INCLUDE_DIR)

mark_as_advanced(
CUDF_INCLUDE_DIR
)
45 changes: 45 additions & 0 deletions cmake/modules/FindGDF.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
# Tries to find GDF headers and libraries.
#
# Usage of this module as follows:
#
# find_package(GDF)
#
# Variables used by this module, they can change the default behaviour and need
# to be set before calling find_package:
#
# GDF_ROOT - When set, <GDF_ROOT>/include and <GDF_ROOT>/lib are added to the
# locations that are searched for the GDF header and library.
# The environment variable GDF_ROOT is listed ahead of the CMake variable
# of the same name in the search locations.
#
# This module defines
# GDF_FOUND, whether GDF has been found
# GDF_INCLUDE_DIR, directory containing header
# GDF_LIBRARY, directory containing the GDF library
# GDF_LIB_NAME, GDF library name
#
# This module assumes that the user has already called find_package(CUDA)
# (CUDA_INCLUDE_DIRS is used as one of the search locations).


# The GDF library is searched for under the name "cudf".
set(GDF_LIB_NAME cudf)

find_path(GDF_INCLUDE_DIR
NAMES cudf.h
PATHS $ENV{GDF_ROOT}/include ${GDF_ROOT}/include ${CUDA_INCLUDE_DIRS} /usr/include)

find_library(GDF_LIBRARY
NAMES ${GDF_LIB_NAME}
PATHS $ENV{GDF_ROOT}/lib ${GDF_ROOT}/lib ${CUDA_INCLUDE_DIRS}/../lib /usr/lib)

# Once both pieces are found, replace the library file path with the directory
# that contains it, so GDF_LIBRARY ends up holding a directory.
if (GDF_INCLUDE_DIR AND GDF_LIBRARY)
get_filename_component(GDF_LIBRARY ${GDF_LIBRARY} PATH)
endif ()

# The package counts as found only when both the header and the library resolved.
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(GDF DEFAULT_MSG
GDF_INCLUDE_DIR GDF_LIBRARY)

mark_as_advanced(
GDF_INCLUDE_DIR
GDF_LIBRARY
GDF_LIB_NAME
)
38 changes: 37 additions & 1 deletion include/xgboost/c_api.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,10 @@
#include <stdint.h>
#endif

#ifdef XGBOOST_USE_CUDF
#include <cudf/types.h>
Copy link
Member

@trivialfis trivialfis Jan 3, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just a suggestion: is it possible to use an opaque type in c_api.h? I have been trying to rewrite the CMake scripts and add an installation target, and bringing in an external header will cause some trouble.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure what you mean. Could you clarify and provide an example?

#endif

// XGBoost C API will include APIs in Rabit C API
#include <rabit/c_api.h>

Expand All @@ -28,7 +32,6 @@
// manually define unsigned long
typedef uint64_t bst_ulong; // NOLINT(*)


/*! \brief handle to DMatrix */
typedef void *DMatrixHandle; // NOLINT(*)
/*! \brief handle to Booster */
Expand Down Expand Up @@ -105,6 +108,20 @@ XGB_DLL const char *XGBGetLastError(void);
*/
XGB_DLL int XGBRegisterLogCallback(void (*callback)(const char*));

#ifdef XGBOOST_USE_CUDF

/*!
* \brief create a data matrix from a CUDA data frame (CUDF)
* \param cols array of CUDF columns
* \param n_cols number of CUDF columns
* \param[out] out handle for the DMatrix built
* \return 0 when success, -1 when failure happens
*/
XGB_DLL int XGDMatrixCreateFromCUDF
(gdf_column **cols, size_t n_cols, DMatrixHandle *out);

#endif

/*!
* \brief load a data matrix
* \param fname the name of the file
Expand Down Expand Up @@ -282,6 +299,25 @@ XGB_DLL int XGDMatrixSetFloatInfo(DMatrixHandle handle,
const char *field,
const float *array,
bst_ulong len);

#ifdef XGBOOST_USE_CUDF

/*!
* \brief set a meta info field of the data matrix from CUDF columns
* \param handle an instance of data matrix
* \param field field name, can be label, weight
* \param gdf array of CUDF columns holding the values for the field
* \param n_cols number of CUDF columns in gdf
* \return 0 when success, -1 when failure happens
*/

XGB_DLL int XGDMatrixSetInfoCUDF(DMatrixHandle handle,
const char *field,
gdf_column** gdf,
size_t n_cols);

#endif

/*!
* \brief set uint32 vector to a content in info
* \param handle a instance of data matrix
Expand Down
17 changes: 17 additions & 0 deletions include/xgboost/data.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,9 @@

#include <dmlc/base.h>
#include <dmlc/data.h>
#ifdef XGBOOST_USE_CUDF
#include <cudf/types.h>
#endif
#include <cstring>
#include <memory>
#include <numeric>
Expand All @@ -21,6 +24,10 @@

#include "../../src/common/host_device_vector.h"

#include "../../src/common/host_device_vector.h"

#include "../../src/common/host_device_vector.h"

namespace xgboost {
// forward declare learner.
class LearnerImpl;
Expand Down Expand Up @@ -121,6 +128,16 @@ class MetaInfo {
* \param num Number of elements in the source array.
*/
void SetInfo(const char* key, const void* dptr, DataType dtype, size_t num);

#ifdef XGBOOST_USE_CUDF
/*!
* \brief Set information in the meta info from CUDF columns.
*        Only declared when XGBOOST_USE_CUDF is defined at compile time.
* \param key The key of the information.
* \param cols Array of pointers to the CUDF columns used to set the info.
* \param n_cols The number of CUDF columns.
*/
void SetInfoCUDF(const char* key, gdf_column** cols, size_t n_cols);
#endif

private:
/*! \brief argsort of labels */
Expand Down
54 changes: 49 additions & 5 deletions python-package/xgboost/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,10 @@
import re
import sys

import cudf
from libgdf_cffi import ffi
import numpy as np
import cudf.dataframe as gdf
import scipy.sparse

from .compat import (STRING_TYPES, PY3, DataFrame, MultiIndex, py_str,
Expand Down Expand Up @@ -227,7 +230,7 @@ def c_array(ctype, values):
def _maybe_pandas_data(data, feature_names, feature_types):
""" Extract internal data from pd.DataFrame for DMatrix data """

if not isinstance(data, DataFrame):
if not isinstance(data, (DataFrame, cudf.DataFrame)):
return data, feature_names, feature_types

data_dtypes = data.dtypes
Expand All @@ -240,18 +243,24 @@ def _maybe_pandas_data(data, feature_names, feature_types):
raise ValueError(msg + ', '.join(bad_fields))

if feature_names is None:
if isinstance(data.columns, MultiIndex):
data_columns = data.columns
if isinstance(data_columns, MultiIndex):
feature_names = [
' '.join(map(str, i))
for i in data.columns
for i in data_columns
]
elif (isinstance(data_columns, (tuple, list)) and len(data_columns) > 0 and
isinstance(data_columns[0], str)):
feature_names = list(data_columns)
else:
feature_names = data.columns.format()
feature_names = data_columns.format()

if feature_types is None:
feature_types = [PANDAS_DTYPE_MAPPER[dtype.name] for dtype in data_dtypes]

data = data.values.astype('float')
# only convert pandas.DataFrame, CUDF conversion happens elsewhere
if isinstance(data, DataFrame):
data = data.values.astype('float')

return data, feature_names, feature_types

Expand Down Expand Up @@ -386,6 +395,8 @@ def __init__(self, data, label=None, missing=None,
_check_call(_LIB.XGDMatrixCreateFromFile(c_str(data),
ctypes.c_int(silent),
ctypes.byref(self.handle)))
elif isinstance(data, cudf.DataFrame):
self._init_from_cudf(data)
elif isinstance(data, scipy.sparse.csr_matrix):
self._init_from_csr(data)
elif isinstance(data, scipy.sparse.csc_matrix):
Expand All @@ -405,17 +416,37 @@ def __init__(self, data, label=None, missing=None,
if label is not None:
if isinstance(label, np.ndarray):
self.set_label_npy2d(label)
elif isinstance(label, cudf.dataframe.column.Column):
self.set_info_cudf('label', label)
elif isinstance(label, cudf.DataFrame):
self.set_info_cudf('label', label)
else:
self.set_label(label)
if weight is not None:
if isinstance(weight, np.ndarray):
self.set_weight_npy2d(weight)
elif isinstance(weight, cudf.dataframe.column.Column):
self.set_info_cudf('weight', weight)
elif isinstance(weight, cudf.DataFrame):
self.set_info_cudf('weight', weight)
else:
self.set_weight(weight)

self.feature_names = feature_names
self.feature_types = feature_types

def _init_from_cudf(self, df):
    """Create the DMatrix handle from a cuDF (GPU) data frame.

    Parameters
    ----------
    df : cudf.DataFrame
        Frame whose columns are handed to ``XGDMatrixCreateFromCUDF``,
        in the order given by ``df.columns``.
    """
    self.handle = ctypes.c_void_p()

    # Collect one cffi ``gdf_column`` view per data frame column.
    column_views = []
    for name in df.columns:
        column_views.append(df[name]._column.cffi_view)

    # Pack the views into a C array of gdf_column pointers; the local
    # reference keeps the array object around for the duration of the call.
    view_array = ffi.new('gdf_column*[]', column_views)

    # Hand the cffi array to ctypes as a raw address.
    array_address = ctypes.c_void_p(int(ffi.cast('uintptr_t', view_array)))
    n_columns = ctypes.c_size_t(len(df.columns))
    _check_call(_LIB.XGDMatrixCreateFromCUDF(array_address,
                                             n_columns,
                                             ctypes.byref(self.handle)))

def _init_from_csr(self, csr):
"""
Initialize data from a CSR matrix.
Expand Down Expand Up @@ -592,6 +623,19 @@ def set_float_info_npy2d(self, field, data):
c_data,
c_bst_ulong(len(data))))

def set_info_cudf(self, field, data):
    """Set a meta info field of the DMatrix from cuDF data.

    Parameters
    ----------
    field : str
        Name of the field to set; the constructor passes 'label' and
        'weight'.
    data : cudf.DataFrame or single cuDF column
        A data frame contributes one column view per column; anything else
        is treated as a single column exposing ``cffi_view``.
    """
    if isinstance(data, cudf.DataFrame):
        column_views = [data[name]._column.cffi_view for name in data.columns]
    else:
        # data is a single CUDF column
        column_views = [data.cffi_view]

    # Pack the views into a C array of gdf_column pointers; the local
    # reference keeps the array object around for the duration of the call.
    view_array = ffi.new('gdf_column*[]', column_views)

    # Hand the cffi array to ctypes as a raw address.
    array_address = ctypes.c_void_p(int(ffi.cast('uintptr_t', view_array)))
    n_columns = ctypes.c_size_t(len(column_views))
    _check_call(_LIB.XGDMatrixSetInfoCUDF(self.handle,
                                          c_str(field),
                                          array_address,
                                          n_columns))

def set_uint_info(self, field, data):
"""Set uint type property into the DMatrix.

Expand Down
28 changes: 28 additions & 0 deletions src/c_api/c_api.cc
Original file line number Diff line number Diff line change
Expand Up @@ -260,6 +260,19 @@ int XGDMatrixCreateFromDataIter(
API_END();
}

#ifdef XGBOOST_USE_CUDF

/*
 * Builds a DMatrix from an array of CUDF columns and stores a newly allocated
 * std::shared_ptr<DMatrix> in *out as the opaque handle.
 * The body is wrapped by API_BEGIN()/API_END(); presumably these turn failures
 * into the documented -1 return code -- confirm against their definition.
 */
int XGDMatrixCreateFromCUDF
(gdf_column **cols, size_t n_cols, DMatrixHandle *out) {
  API_BEGIN();
  // Fill a fresh CSR source from the CUDF columns.
  std::unique_ptr<data::SimpleCSRSource> csr_source{new data::SimpleCSRSource()};
  csr_source->InitFromCUDF(cols, n_cols);
  // The source is moved into DMatrix::Create; the result is wrapped in a
  // heap-allocated shared_ptr that becomes the handle.
  *out = new std::shared_ptr<DMatrix>(DMatrix::Create(std::move(csr_source)));
  API_END();
}

#endif

XGB_DLL int XGDMatrixCreateFromCSREx(const size_t* indptr,
const unsigned* indices,
const bst_float* data,
Expand Down Expand Up @@ -778,6 +791,21 @@ XGB_DLL int XGDMatrixSetFloatInfo(DMatrixHandle handle,
API_END();
}

#ifdef XGBOOST_USE_CUDF

/*
 * Sets the meta info field named by `field` on the DMatrix behind `handle`,
 * taking the values from `n_cols` CUDF columns in `cols`.
 */
XGB_DLL int XGDMatrixSetInfoCUDF(DMatrixHandle handle,
                                 const char *field,
                                 gdf_column **cols,
                                 size_t n_cols) {
  API_BEGIN();
  CHECK_HANDLE();
  // The opaque handle points at a std::shared_ptr<DMatrix>.
  auto *p_dmat = static_cast<std::shared_ptr<DMatrix>*>(handle);
  (*p_dmat)->Info().SetInfoCUDF(field, cols, n_cols);
  API_END();
}

#endif

XGB_DLL int XGDMatrixSetUIntInfo(DMatrixHandle handle,
const char* field,
const unsigned* info,
Expand Down
Loading