"""Reproduce an XGBoost ``DMatrix`` construction issue with cuDF uint8 columns.

The script builds a small synthetic cuDF DataFrame (AGE, DISTANCE and
one-hot encoded APPT_WEEKDAY columns stored as uint8), derives a binary LABEL
column with a row-wise kernel, and then builds an ``xgboost.DMatrix`` twice:
once with every feature column and once with only AGE and DISTANCE.
"""
import numpy as np
import cudf
from cudf import DataFrame
import xgboost
from sklearn.model_selection import train_test_split

# ### Prepare synthetic training data
num_samples = 1000
gdf = DataFrame()
binary_choice = [1, 0]  # not referenced anywhere in this script section

# NOTE: the order of the random draws (AGE, DISTANCE, APPT_WEEKDAY) is kept
# as-is so a seeded run yields the same data as before.
gdf['AGE'] = np.random.randint(1, 100, size=(num_samples,))
gdf['DISTANCE'] = np.random.randint(1, 50, size=(num_samples,))

weekday_dict = {
    1: 'MON',
    2: 'TUE',
    3: 'WED',
    4: 'THU',
    5: 'FRI',
}
weekdays = list(weekday_dict)
gdf['APPT_WEEKDAY'] = np.random.choice(weekdays, size=num_samples)

# One-hot encode the weekday as uint8 indicator columns, then drop the raw
# categorical column.
# NOTE(review): one_hot_encoding / drop_column look like legacy cuDF APIs --
# confirm they exist in the cuDF version this script targets.
gdf = gdf.one_hot_encoding(
    'APPT_WEEKDAY', 'APPT_WEEKDAY', weekdays, dtype='uint8'
)
gdf.drop_column('APPT_WEEKDAY')


def generate_labels(AGE, DISTANCE, LABEL, kwarg1):
    """Row-wise kernel for ``apply_rows``: fill LABEL with a 0/1 value.

    A row is labelled 1 only when both conditions hold (18 < age < 30 and
    distance > 10): each contributes 0.3 to a score and the label is 1 when
    the score exceeds 0.5.

    Args:
        AGE: input column of ages.
        DISTANCE: input column of distances.
        LABEL: output column, written in place.
        kwarg1: extra argument supplied through ``apply_rows(kwargs=...)``;
            it is not used by the computation.
    """
    for i, (age, distance) in enumerate(zip(AGE, DISTANCE)):
        # Start from a float so the accumulator keeps a single type.
        prob = 0.0
        if (age > 18) and (age < 30):
            prob += 0.3
        if distance > 10:
            prob += 0.3
        LABEL[i] = 1 if prob > 0.5 else 0


# np.int64 replaces the `np.int` alias, which was deprecated in NumPy 1.20
# and removed in NumPy 1.24 (AttributeError on current releases).
gdf = gdf.apply_rows(
    generate_labels,
    incols=['AGE', 'DISTANCE'],
    outcols=dict(LABEL=np.int64),
    kwargs=dict(kwarg1=1),
)

print(gdf)
print(gdf.dtypes)

# First attempt: every column except LABEL is a feature, which includes the
# uint8 one-hot columns.
features = gdf.columns.tolist()
features.remove('LABEL')
X_train = gdf[features]
y_train = gdf[['LABEL']]
dtrain = xgboost.DMatrix(data=X_train, label=y_train)

# ### Removing the uint8 features resolves the issue.
features = ['AGE', 'DISTANCE']
X_train = gdf[features]
y_train = gdf[['LABEL']]
# NOTE: y_train = gdf['LABEL'] doesn't work either
dtrain = xgboost.DMatrix(data=X_train, label=y_train)