import numpy as np
import cudf
from cudf import DataFrame
import xgboost
from sklearn.model_selection import train_test_split

# ### Prepare synthetic training data
num_samples = 1000
gdf = DataFrame()
binary_choice = [1, 0]
gdf['AGE'] = np.random.randint(1, 100, size=(num_samples,))
gdf['DISTANCE'] = np.random.randint(1, 50, size=(num_samples,))

def generate_labels(AGE, DISTANCE, LABEL, kwarg1):
    # apply_rows kernel: a row is labelled 1 only when both the age and the
    # distance condition contribute their 0.3 (0.6 > 0.5), otherwise 0.
    for i, (age, distance) in enumerate(zip(AGE, DISTANCE)):
        prob = 0
        if (age > 18) and (age < 30):
            prob += 0.3
        if distance > 10:
            prob += 0.3
        if prob > 0.5:
            prob = 1
        else:
            prob = 0
        LABEL[i] = prob

gdf = gdf.apply_rows(generate_labels,
                     incols=['AGE', 'DISTANCE'],
                     outcols=dict(LABEL=np.int32),  # np.int is removed in recent NumPy
                     kwargs=dict(kwarg1=1))
print(gdf)

# ### Prepare DMatrix directly from the GPU dataframe
features = ['AGE', 'DISTANCE']
X_train = gdf[features]
y_train = gdf[['LABEL']]
# y_train = gdf['LABEL'] doesn't work either
dtrain = xgboost.DMatrix(data=X_train, label=y_train)

# ### Error #2: However, training XGBoost using this DMatrix throws an error
gpu_params = {
    'objective': 'binary:logistic',
    'n_gpus': -1,
    'booster': 'gbtree',
    'nround': 10,
    'max_depth': 3,
    'alpha': 0.9,
    'eta': 0.1,
    'gamma': 0.1,
    'learning_rate': 0.5,
    'subsample': 1,
    'reg_lambda': 1,
    'scale_pos_weight': 2,
    'min_child_weight': 30,
    'tree_method': 'gpu_hist',
    'loss': 'ls',
    'max_features': 'auto',
    'criterion': 'friedman_mse',
    'grow_policy': 'lossguide',
    'verbose': True
}
clf = xgboost.train(gpu_params, dtrain=dtrain)

# ### Older approach of converting GDF -> pandas -> DMatrix still works
df = gdf.to_pandas()
X_df = df[['AGE', 'DISTANCE']]
y_df = df['LABEL']
X_train, X_test, y_train, y_test = train_test_split(X_df, y_df,
                                                    test_size=0.20,
                                                    random_state=42)
dtrain = xgboost.DMatrix(data=X_train, label=y_train)
clf = xgboost.train(gpu_params, dtrain=dtrain)
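
# ### Sketch: the same direct cuDF -> DMatrix path on a newer stack
# Assumption (not verified in this environment): more recent XGBoost releases
# accept cuDF DataFrames/Series in DMatrix directly, so the pandas round-trip
# above should become unnecessary. The version behaviour and the cleaned-up
# parameter set below are assumptions for illustration, not a confirmed fix.
gdf2 = cudf.DataFrame()
gdf2['AGE'] = np.random.randint(1, 100, size=num_samples)
gdf2['DISTANCE'] = np.random.randint(1, 50, size=num_samples)
# Same label rule as generate_labels above, expressed with column operations.
gdf2['LABEL'] = (((gdf2['AGE'] > 18) & (gdf2['AGE'] < 30)) &
                 (gdf2['DISTANCE'] > 10)).astype('int32')

dtrain_direct = xgboost.DMatrix(gdf2[['AGE', 'DISTANCE']], label=gdf2['LABEL'])
bst = xgboost.train({'objective': 'binary:logistic',
                     'tree_method': 'gpu_hist',
                     'max_depth': 3},
                    dtrain_direct,
                    num_boost_round=10)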