Skip to content

whyjust/wgcpy

Folders and files

NameName
Last commit message
Last commit date

Latest commit

 

History

54 Commits
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 

Repository files navigation

wgcpy 包名

Data analysis and PMML model framework

version 1.0.0

The package provides the following modules, each of which can be used independently:

  • data detection
  • variable EDA
  • variable binning methods
  • IV and PSI calculation
  • automatic feature selection
  • PMML model generation

Require

  • python 3.5 or newer
  • Java 1.8 or newer. The java executable must be available on the system PATH.

Install

GitHub安装

pip install --upgrade git+https://github.com/whyjust/wgcpy.git

Structure

wgcpy package tree structure:

WGCPY
D:\GITHUB\WGCPY
│  .gitignore
│  info.log
│  LICENSE
│  main.py
│  MANIFEST.in
│  README.md
│  requirements.txt
│  setup.py
│
├─data
│
├─pic
│
├─result
│
└─wgcpy
    │  config.py
    │  __init__.py
    │
    ├─bins
    │      chi_merge.py
    │      cut_bins.py
    │      __init__.py
    │
    ├─featureSelector
    │      cal_iv_psi.py
    │      cal_iv_psi_special.py
    │      selector.py
    │      __init__.py
    │
    ├─model
    │      dz_eval.py
    │      gen_model.py
    │      gen_pmml_model.py
    │      __init__.py
    │
    ├─preprocessing
    │      baggingPU.py
    │      data_detection.py
    │      eda.py
    │      __init__.py
    │
    └─utils
            ext_fn.py
            __init__.py

Usage

1 main.py运行
python main.py
2 数据EDA模块
plot_feature_boxplot(credit_data, numeric_feats)
plot_feature_distribution(credit_data, numeric_feats,
                          label="flag", sub_col=3)
plot_category_countplot(credit_data, category_feats, label="flag",
                        sub_col=5, figsize=(20,12))
plot_corr(credit_data, numeric_feats+['flag'], mask=True)
3 数据探查
# 数据分布
with timer('detect dataframe'):
    dec = DetectDF(credit_data)
    df_des = dec.detect(special_value_dict={-999:np.nan},
                        output=os.path.join(base_dir, "result"))
PU bagging
with timer('pu bagging'):
    estimator = LGBMClassifier(n_estimators=200, max_depth=2, learning_rate=0.1)
    bc = BaggingClassifierPU(base_estimator=estimator, 
                            n_estimators = 30, 
                            n_jobs = -1, 
                            max_samples = len(credit_data[credit_data['flag']==1]))
    bc.fit(credit_data[numeric_feats], credit_data['flag'])
    score_arr = bc.oob_decision_function_[:,1]
    credit_data['score_pb'] = score_arr
    credit_data = credit_data[(credit_data['score_pb'].isna()) | (credit_data['score_pb']<0.9)]
    print('PUbagging-shape:', credit_data.shape)
4 计算IV
with timer("cal iv"):
    iv_details = cal_total_var_iv(credit_data,
                                  numeric_feats=numeric_feats,
                                  category_feats=category_feats,
                                  target='flag',
                                  max_interval=10,
                                  method='tree')
    fig = plot_bin_woe(binx=iv_details[iv_details['variable'] == 'credit.amount'],
                       title=None,
                       display_iv=True)
    iv_details.to_csv(os.path.join(base_dir,r'result\iv_details.csv'), index=False)
5 计算PSI
with timer('cal psi'):
    except_array = credit_data['credit.amount'][:500]
    accept_arry = credit_data['credit.amount'][500:]
    psi_df = numeric_var_cal_psi(except_array,
                                 accept_arry,
                                 bins=10,
                                 bucket_type='bins',
                                 detail=True)
    psi_df.to_csv(os.path.join(base_dir, r'result\psi.csv'))
6 特征初筛与细筛
with timer("cal cv score"):
    groups = credit_data['status.of.existing.checking.account']
    config = {
        "na_threshold": 0.95,
        "correlation_threshold": 0.6,
        "importance_cumsum_threshold": 0.95,
        "params": {
            "n_estimators": 200,
            "max_depth": 2,
            "learning_rate": 0.1,
            "boosting_type": "gbdt",
            "importance_type": "gain",
            "n_jobs": -1
        },
        "kfold": "StratifiedKFold",
        "groups": None,
        "categorical_feature": category_feats,
        "n_splits": 5,
        "incre_params": None,
        "total_iter": 20,
        "step": 1,
        "auc_interval": None
    }
    fs = FeatureSelector(data=credit_data,
                         target='flag',
                         base_features=numeric_feats+category_feats)
    fs.identify_all(config=config)
    fs.plot_feature_importance()
    fs.result_save(output=os.path.join(base_dir, r".\result\feats_seletor_result.xlsx"))
7 PMML建模与评估
with timer("PMML model build"):
        trn_x, tes_x, y_trn, y_tes = train_test_split(credit_data,
                                                      credit_data['flag'],
                                                      test_size=0.2)
        base_feature = numeric_feats+category_feats
        cat_indices = [index for index,v in enumerate(base_feature) if v in category_feats]
        fit_params = {"classifier__categorical_feature" : cat_indices}
        print('cat_indices:', cat_indices)

        pmml_model = genPMMLModel(data=trn_x,
                                  target="flag",
                                  base_features=base_feature)

        pmml_model.make_pipeline_model(numeric_feature=numeric_feats,
                                       category_feature=category_feats,
                                       model_type='lgb',
                                       param_dict=config['params'],
                                       fit_params=fit_params)

        predict = pmml_model.evaluate(data=tes_x,
                                      target="flag")
        pmml_model.persist(base_dir="result",
                           model_name="credit")

Let's get started! Stars are welcome!

About

Data analysis and PMML model framework

Topics

Resources

License

Stars

Watchers

Forks

Releases

No releases published

Packages

No packages published

Languages