forked from aws/amazon-sagemaker-examples
Merge branch 'dt-lgb' of github.com:Neo9061/amazon-sagemaker-examples into dt-lgb
Showing 22 changed files with 2,736 additions and 0 deletions.
33 changes: 33 additions & 0 deletions
..._applying_machine_learning/fraud_detection_using_graph_neural_networks/baselines/utils.py
@@ -0,0 +1,33 @@
import os
import pandas as pd


def get_data():
    data_prefix = "preprocessed-data/"

    if not os.path.exists(data_prefix):
        print("""Expected the folder {} to contain the preprocessed data.
        Run the data processing step in the main notebook before running the baseline comparisons.""".format(data_prefix))
        return

    features = pd.read_csv(data_prefix + "features_xgboost.csv", header=None)
    labels = pd.read_csv(data_prefix + "tags.csv").set_index('TransactionID')
    valid_users = pd.read_csv(data_prefix + "validation.csv", header=None)
    test_users = pd.read_csv(data_prefix + "test.csv", header=None)

    valid_X = features.merge(valid_users, on=[0], how='inner')
    test_X = features.merge(test_users, on=[0], how='inner')

    train_index = ~(features[0].isin(test_users[0].values) | features[0].isin(valid_users[0].values))
    train_X = features[train_index]
    valid_y = labels.loc[valid_X[0]]
    test_y = labels.loc[test_X[0]]
    train_y = labels.loc[train_X[0]]

    train_X.set_index([0], inplace=True)
    valid_X.set_index([0], inplace=True)
    test_X.set_index([0], inplace=True)

    train_data = train_y.join(train_X)  # first column is the label 'isFraud'
    valid_data = valid_y.join(valid_X)
    test_data = test_y.join(test_X)
    return train_data, valid_data, test_data
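For context, a minimal sketch of how get_data() might feed an XGBoost baseline, assuming the preprocessed CSVs exist under preprocessed-data/ and the xgboost package is installed; the training parameters below are illustrative, not taken from the notebook:

import xgboost as xgb

# load the joined label+feature frames produced by get_data()
train_data, valid_data, test_data = get_data()

# the first column is the label 'isFraud'; the remaining columns are features
dtrain = xgb.DMatrix(train_data.iloc[:, 1:], label=train_data['isFraud'])
dvalid = xgb.DMatrix(valid_data.iloc[:, 1:], label=valid_data['isFraud'])

params = {'objective': 'binary:logistic', 'eval_metric': 'auc'}  # illustrative settings
booster = xgb.train(params, dtrain, num_boost_round=100, evals=[(dvalid, 'validation')])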
88 changes: 88 additions & 0 deletions
...ine_learning/fraud_detection_using_graph_neural_networks/data-generation/generate_data.py
@@ -0,0 +1,88 @@
from faker import Faker
import datetime
import itertools
import numpy as np
import pandas as pd

Faker.seed(0)
np.random.seed(0)

NUM_UNIQUE_CCS = 40*10**3
START_TRANS_DATE = datetime.datetime(2012, 1, 15)
END_TRANS_DATE = datetime.datetime(2012, 3, 15)


def gen_fraud_data(num_unique_ccs=NUM_UNIQUE_CCS, start_trans_date=START_TRANS_DATE, end_trans_date=END_TRANS_DATE):
    fake = Faker()
    cc_nums = [fake.credit_card_number() for _ in range(num_unique_ccs)]
    cc_types = [fake.credit_card_provider() for _ in range(num_unique_ccs)]
    num_trans_per_cc = np.ceil(np.random.exponential(scale=3, size=num_unique_ccs)).astype(np.int32)
    cc_ipv4 = [fake.ipv4() for _ in range(num_unique_ccs)]
    cc_phone_number = [fake.phone_number() for _ in range(num_unique_ccs)]
    cc_device_id = [fake.msisdn() for _ in range(num_unique_ccs)]

    data = {
        'TransactionID': [fake.uuid4() for _ in range(sum(num_trans_per_cc))],
        'TransactionDT': [fake.date_time_between_dates(datetime_start=start_trans_date, datetime_end=end_trans_date)
                          for _ in range(sum(num_trans_per_cc))],
        'card_no': list(itertools.chain.from_iterable([[cc_num]*num_trans for cc_num, num_trans in zip(cc_nums, num_trans_per_cc)])),
        'card_type': list(itertools.chain.from_iterable([[card]*num_trans for card, num_trans in zip(cc_types, num_trans_per_cc)])),
        'email_domain': [fake.ascii_email().split("@")[1] for _ in range(sum(num_trans_per_cc))],
        'ProductCD': np.random.choice(['45', 'AB', 'L', 'Y', 'T'], size=sum(num_trans_per_cc)),
        'TransactionAmt': np.abs(np.ceil(np.random.exponential(scale=10, size=sum(num_trans_per_cc))*100)).astype(np.int32),
    }
    transactions = pd.DataFrame(data).sort_values(by=['TransactionDT'])

    # To make the identity table smaller than the transactions table, which may be
    # more realistic in a practical scenario, reduce the size argument below.
    identity_transactions_idx = np.random.choice(transactions.shape[0], size=int(transactions.shape[0]*1.0), replace=False)
    id_data = {
        'IpAddress': list(itertools.chain.from_iterable([[ipv4]*num_trans for ipv4, num_trans in zip(cc_ipv4, num_trans_per_cc)])),
        'PhoneNo': list(itertools.chain.from_iterable([[phone_num]*num_trans for phone_num, num_trans in zip(cc_phone_number, num_trans_per_cc)])),
        'DeviceID': list(itertools.chain.from_iterable([[device_id]*num_trans for device_id, num_trans in zip(cc_device_id, num_trans_per_cc)])),
    }
    identity = pd.DataFrame(id_data)
    identity["TransactionID"] = transactions.TransactionID
    assert identity.shape[0] == transactions.shape[0]

    identity = identity.loc[identity_transactions_idx]
    identity.reset_index(drop=True, inplace=True)
    identity = identity[["TransactionID", "IpAddress", "PhoneNo", "DeviceID"]]

    # join the two tables for the convenience of generating the label column 'isFraud'
    full_two_df = transactions[["TransactionID", "card_no", "card_type", "email_domain", "ProductCD", "TransactionAmt"]].merge(identity, on='TransactionID', how='left')

    is_fraud = []
    for idx, row in full_two_df.iterrows():
        card_no, card_type, email, product_type, transaction_amount, ip_address, phone_no, device_id = str(row["card_no"]), row["card_type"], row["email_domain"], row["ProductCD"], row["TransactionAmt"], str(row["IpAddress"]), str(row["PhoneNo"]), str(row["DeviceID"])

        if email in ["hotmail.com", "gmail.com", "yahoo.com"]:
            if product_type in ["45"]:
                is_fraud.append(int(np.random.uniform() < 0.9))
            else:
                if (device_id != "nan") and (device_id.endswith("16") or device_id.endswith("78") or device_id.endswith("23")):
                    is_fraud.append(int(np.random.uniform() < 0.1))
                else:
                    is_fraud.append(int(np.random.uniform() < 0.05))
        else:
            if transaction_amount > 3000:
                is_fraud.append(int(np.random.uniform() < 0.8))
            else:
                if card_type in ["Diners Club / Carte Blanche", "JCB 15 digit", "Maestro"]:  # roughly 35,000 observations fall into these categories
                    if (card_no.endswith("001") or card_no.endswith("002") or card_no.endswith("003") or card_no.endswith("004") or card_no.endswith("005") or card_no.endswith("007") or card_no.endswith("008") or card_no.endswith("009")) or ((phone_no != "nan") and (phone_no.endswith(".227") or phone_no.endswith(".104") or phone_no.endswith(".251") or phone_no.endswith(".181"))):
                        is_fraud.append(int(np.random.uniform() < 0.3))
                    else:
                        if (ip_address != "nan") and (ip_address.endswith(".227") or ip_address.endswith(".104") or ip_address.endswith(".251") or ip_address.endswith(".181")):
                            is_fraud.append(int(np.random.uniform() < 0.2))
                        else:
                            is_fraud.append(int(np.random.uniform() < 0.1))
                else:
                    is_fraud.append(int(np.random.uniform() < 0.0001))
    print("fraud ratio", sum(is_fraud) / len(is_fraud))

    transactions['isFraud'] = is_fraud
    return transactions, identity


if __name__ == '__main__':
    # assumes the raw_data/ directory already exists
    transaction, identity = gen_fraud_data()
    transaction.to_csv('raw_data/transaction.csv', index=False)
    identity.to_csv('raw_data/identity.csv', index=False)
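As a quick sanity check, the generator can be run with a much smaller card pool so it finishes in seconds; the values below are illustrative and are not the defaults used elsewhere in the example:

import datetime
from generate_data import gen_fraud_data  # assumes the functions above are importable as generate_data

# small in-memory run; no files are written
transactions_small, identity_small = gen_fraud_data(
    num_unique_ccs=500,
    start_trans_date=datetime.datetime(2012, 1, 15),
    end_trans_date=datetime.datetime(2012, 1, 31),
)
print(transactions_small.shape, identity_small.shape)
print("observed fraud ratio:", transactions_small['isFraud'].mean())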
144 changes: 144 additions & 0 deletions
...fraud_detection_using_graph_neural_networks/data-preprocessing/graph_data_preprocessor.py
@@ -0,0 +1,144 @@
import argparse
import logging
import os

import pandas as pd
import numpy as np
from itertools import combinations


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--data-dir', type=str, default='/opt/ml/processing/input')
    parser.add_argument('--output-dir', type=str, default='/opt/ml/processing/output')
    parser.add_argument('--transactions', type=str, default='transaction.csv', help='name of file with transactions')
    parser.add_argument('--identity', type=str, default='identity.csv', help='name of file with identity info')
    parser.add_argument('--id-cols', type=str, default='', help='comma separated id cols in transactions table')
    parser.add_argument('--cat-cols', type=str, default='', help='comma separated categorical cols in transactions')
    parser.add_argument('--cat-cols-xgboost', type=str, default='', help='comma separated categorical cols that can be used as features for xgboost in transactions')
    parser.add_argument('--train-data-ratio', type=float, default=0.7, help='fraction of data to use in training set')
    parser.add_argument('--valid-data-ratio', type=float, default=0.2, help='fraction of data to use in validation set')
    parser.add_argument('--construct-homogeneous', action="store_true", default=False,
                        help='use bipartite graph edgelists to construct a homogeneous graph edgelist')
    return parser.parse_args()


def get_logger(name):
    logger = logging.getLogger(name)
    log_format = '%(asctime)s %(levelname)s %(name)s: %(message)s'
    logging.basicConfig(format=log_format, level=logging.INFO)
    logger.setLevel(logging.INFO)
    return logger


def load_data(data_dir, transaction_data, identity_data, train_data_ratio, valid_data_ratio, output_dir):
    transaction_df = pd.read_csv(os.path.join(data_dir, transaction_data))
    logging.info("Shape of transaction data is {}".format(transaction_df.shape))
    logging.info("# Tagged transactions: {}".format(len(transaction_df) - transaction_df.isFraud.isnull().sum()))

    identity_df = pd.read_csv(os.path.join(data_dir, identity_data))
    logging.info("Shape of identity data is {}".format(identity_df.shape))

    # extract transactions for the train, validation, and test sets
    logging.info("Training, validation, and test data fractions are {}, {}, and {}, respectively".format(train_data_ratio, valid_data_ratio, 1-train_data_ratio-valid_data_ratio))
    assert train_data_ratio + valid_data_ratio < 1, "The sum of the training and validation ratios must be less than 1."
    n_train = int(transaction_df.shape[0]*train_data_ratio)
    n_valid = int(transaction_df.shape[0]*(train_data_ratio+valid_data_ratio))
    valid_ids = transaction_df.TransactionID.values[n_train:n_valid]
    test_ids = transaction_df.TransactionID.values[n_valid:]

    get_fraud_frac = lambda series: 100 * sum(series)/len(series)
    logging.info("Percentage of fraud transactions for train data: {}".format(get_fraud_frac(transaction_df.isFraud[:n_train])))
    logging.info("Percentage of fraud transactions for validation data: {}".format(get_fraud_frac(transaction_df.isFraud[n_train:n_valid])))
    logging.info("Percentage of fraud transactions for test data: {}".format(get_fraud_frac(transaction_df.isFraud[n_valid:])))
    logging.info("Percentage of fraud transactions for all data: {}".format(get_fraud_frac(transaction_df.isFraud)))

    with open(os.path.join(output_dir, 'validation.csv'), 'w') as f:
        f.writelines(map(lambda x: str(x) + "\n", valid_ids))
    logging.info("Wrote validation data to file: {}".format(os.path.join(output_dir, 'validation.csv')))

    with open(os.path.join(output_dir, 'test.csv'), 'w') as f:
        f.writelines(map(lambda x: str(x) + "\n", test_ids))
    logging.info("Wrote test data to file: {}".format(os.path.join(output_dir, 'test.csv')))

    return transaction_df, identity_df, valid_ids, test_ids


def get_features_and_labels(transactions_df, transactions_id_cols, transactions_cat_cols, transactions_cat_cols_xgboost, output_dir):
    # Get features
    non_feature_cols = ['isFraud', 'TransactionDT'] + transactions_id_cols.split(",")
    feature_cols = [col for col in transactions_df.columns if col not in non_feature_cols]
    logging.info("Categorical columns: {}".format(transactions_cat_cols.split(",")))
    features = pd.get_dummies(transactions_df[feature_cols], columns=transactions_cat_cols.split(",")).fillna(0)
    features['TransactionAmt'] = features['TransactionAmt'].apply(np.log10)
    logging.info("Transformed feature columns: {}".format(list(features.columns)))
    logging.info("Shape of features: {}".format(features.shape))
    features.to_csv(os.path.join(output_dir, 'features.csv'), index=False, header=False)
    logging.info("Wrote features to file: {}".format(os.path.join(output_dir, 'features.csv')))

    logging.info("Processing feature columns for XGBoost.")
    cat_cols_xgb = transactions_cat_cols_xgboost.split(",")
    logging.info("Categorical feature columns for XGBoost: {}".format(cat_cols_xgb))
    logging.info("Numerical feature column for XGBoost: 'TransactionAmt'")
    features_xgb = pd.get_dummies(transactions_df[['TransactionID']+cat_cols_xgb], columns=cat_cols_xgb).fillna(0)
    features_xgb['TransactionAmt'] = features['TransactionAmt']
    features_xgb.to_csv(os.path.join(output_dir, 'features_xgboost.csv'), index=False, header=False)
    logging.info("Wrote features to file: {}".format(os.path.join(output_dir, 'features_xgboost.csv')))

    # Get labels
    transactions_df[['TransactionID', 'isFraud']].to_csv(os.path.join(output_dir, 'tags.csv'), index=False)
    logging.info("Wrote labels to file: {}".format(os.path.join(output_dir, 'tags.csv')))


def get_relations_and_edgelist(transactions_df, identity_df, transactions_id_cols, output_dir):
    # Get relations
    edge_types = transactions_id_cols.split(",") + list(identity_df.columns)
    logging.info("Found the following distinct relation types: {}".format(edge_types))
    id_cols = ['TransactionID'] + transactions_id_cols.split(",")
    full_identity_df = transactions_df[id_cols].merge(identity_df, on='TransactionID', how='left')
    logging.info("Shape of identity columns: {}".format(full_identity_df.shape))

    # extract edges
    edges = {}
    for etype in edge_types:
        edgelist = full_identity_df[['TransactionID', etype]].dropna()
        edgelist.to_csv(os.path.join(output_dir, 'relation_{}_edgelist.csv').format(etype), index=False, header=True)
        logging.info("Wrote edgelist to: {}".format(os.path.join(output_dir, 'relation_{}_edgelist.csv').format(etype)))
        edges[etype] = edgelist
    return edges


def create_homogeneous_edgelist(edges, output_dir):
    homogeneous_edges = []
    for etype, relations in edges.items():
        for edge_relation, frame in relations.groupby(etype):
            new_edges = [(a, b) for (a, b) in combinations(frame.TransactionID.values, 2)
                         if (a, b) not in homogeneous_edges and (b, a) not in homogeneous_edges]
            homogeneous_edges.extend(new_edges)

    with open(os.path.join(output_dir, 'homogeneous_edgelist.csv'), 'w') as f:
        f.writelines(map(lambda x: "{}, {}\n".format(x[0], x[1]), homogeneous_edges))
    logging.info("Wrote homogeneous edgelist to file: {}".format(os.path.join(output_dir, 'homogeneous_edgelist.csv')))


if __name__ == '__main__':
    logging = get_logger(__name__)

    args = parse_args()

    transactions, identity, _, _ = load_data(args.data_dir,
                                             args.transactions,
                                             args.identity,
                                             args.train_data_ratio,
                                             args.valid_data_ratio,
                                             args.output_dir)

    get_features_and_labels(transactions, args.id_cols, args.cat_cols, args.cat_cols_xgboost, args.output_dir)
    relational_edges = get_relations_and_edgelist(transactions, identity, args.id_cols, args.output_dir)

    if args.construct_homogeneous:
        create_homogeneous_edgelist(relational_edges, args.output_dir)
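The script defaults to the /opt/ml/processing input and output paths, so it is presumably meant to run as a SageMaker Processing job. A rough sketch of how it might be launched with the SageMaker Python SDK is shown below; the S3 prefixes and the column lists passed via --id-cols, --cat-cols, and --cat-cols-xgboost are placeholders, not the exact values used by the main notebook:

import sagemaker
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.sklearn.processing import SKLearnProcessor

role = sagemaker.get_execution_role()

processor = SKLearnProcessor(framework_version='0.23-1', role=role,
                             instance_type='ml.m5.xlarge', instance_count=1)

processor.run(
    code='data-preprocessing/graph_data_preprocessor.py',
    inputs=[ProcessingInput(source='s3://<bucket>/raw_data/',
                            destination='/opt/ml/processing/input')],
    outputs=[ProcessingOutput(source='/opt/ml/processing/output',
                              destination='s3://<bucket>/preprocessed-data/')],
    arguments=['--id-cols', 'card_no,card_type,email_domain',   # placeholder column lists
               '--cat-cols', 'ProductCD',
               '--cat-cols-xgboost', 'card_type,ProductCD'])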