import numpy as np
import cudf
from cudf import DataFrame
import xgboost
from sklearn.model_selection import train_test_split

# ### Prepare synthetic training data
num_samples = 1000
gdf = DataFrame()
binary_choice = [1, 0]
gdf['AGE'] = np.random.randint(1, 100, size=(num_samples,))
gdf['DISTANCE'] = np.random.randint(1, 50, size=(num_samples,))

def generate_labels(AGE, DISTANCE, LABEL, kwarg1):
    # apply_rows kernel: a row is labelled 1 only when both the age and the
    # distance condition contribute their 0.3 (0.6 > 0.5), otherwise 0.
    for i, (age, distance) in enumerate(zip(AGE, DISTANCE)):
        prob = 0
        if (age > 18) and (age < 30):
            prob += 0.3
        if distance > 10:
            prob += 0.3
        if prob > 0.5:
            prob = 1
        else:
            prob = 0
        LABEL[i] = prob

gdf = gdf.apply_rows(generate_labels,
                     incols=['AGE', 'DISTANCE'],
                     outcols=dict(LABEL=np.int32),  # np.int is removed in recent NumPy
                     kwargs=dict(kwarg1=1))
print(gdf)

# ### Prepare DMatrix directly from the GPU dataframe
features = ['AGE', 'DISTANCE']
X_train = gdf[features]
y_train = gdf[['LABEL']]
# y_train = gdf['LABEL'] doesn't work either
dtrain = xgboost.DMatrix(data=X_train, label=y_train)

# ### Error #2: However, training XGBoost using this DMatrix throws an error
gpu_params = {
    'objective': 'binary:logistic',
    'n_gpus': -1,
    'booster': 'gbtree',
    'nround': 10,
    'max_depth': 3,
    'alpha': 0.9,
    'eta': 0.1,
    'gamma': 0.1,
    'learning_rate': 0.5,
    'subsample': 1,
    'reg_lambda': 1,
    'scale_pos_weight': 2,
    'min_child_weight': 30,
    'tree_method': 'gpu_hist',
    'loss': 'ls',
    'max_features': 'auto',
    'criterion': 'friedman_mse',
    'grow_policy': 'lossguide',
    'verbose': True
}
clf = xgboost.train(gpu_params, dtrain=dtrain)

# ### Older approach of converting GDF -> pandas -> DMatrix still works
df = gdf.to_pandas()
X_df = df[['AGE', 'DISTANCE']]
y_df = df['LABEL']
X_train, X_test, y_train, y_test = train_test_split(X_df, y_df,
                                                    test_size=0.20,
                                                    random_state=42)
dtrain = xgboost.DMatrix(data=X_train, label=y_train)
clf = xgboost.train(gpu_params, dtrain=dtrain)
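
# ### Sketch: the same direct cuDF -> DMatrix path on a newer stack
# Assumption (not verified in this environment): more recent XGBoost releases
# accept cuDF DataFrames/Series in DMatrix directly, so the pandas round-trip
# above should become unnecessary. The version behaviour and the cleaned-up
# parameter set below are assumptions for illustration, not a confirmed fix.
gdf2 = cudf.DataFrame()
gdf2['AGE'] = np.random.randint(1, 100, size=num_samples)
gdf2['DISTANCE'] = np.random.randint(1, 50, size=num_samples)
# Same label rule as generate_labels above, expressed with column operations.
gdf2['LABEL'] = (((gdf2['AGE'] > 18) & (gdf2['AGE'] < 30)) &
                 (gdf2['DISTANCE'] > 10)).astype('int32')

dtrain_direct = xgboost.DMatrix(gdf2[['AGE', 'DISTANCE']], label=gdf2['LABEL'])
bst = xgboost.train({'objective': 'binary:logistic',
                     'tree_method': 'gpu_hist',
                     'max_depth': 3},
                    dtrain_direct,
                    num_boost_round=10)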