
Commit

Merge branch 'dt-lgb' of github.com:Neo9061/amazon-sagemaker-examples into dt-lgb
Neo9061 committed Jan 19, 2023
2 parents b66ca49 + 27a1f1f commit 1038f9a
Showing 22 changed files with 2,736 additions and 0 deletions.
1 change: 1 addition & 0 deletions README.md
@@ -62,6 +62,7 @@ These examples provide a gentle introduction to machine learning concepts as the
- [Traffic violations forecasting using DeepAR](introduction_to_applying_machine_learning/deepar_chicago_traffic_violations) is an example that uses daily traffic violation data to predict patterns and seasonality with the Amazon DeepAR algorithm.
- [Visual Inspection Automation with Pre-trained Amazon SageMaker Models](introduction_to_applying_machine_learning/visual_object_detection) is an example of fine-tuning pre-trained Amazon SageMaker models on a target dataset.
- [Create SageMaker Models Using the PyTorch Model Zoo](introduction_to_applying_machine_learning/sagemaker_pytorch_model_zoo) contains an example notebook to create a SageMaker model leveraging the PyTorch Model Zoo and visualize the results.
- [Fraud Detection Using Graph Neural Networks](introduction_to_applying_machine_learning/fraud_detection_using_graph_neural_networks) is an example of identifying fraudulent transactions from transaction and user identity datasets.
- [Identify key insights from textual document](introduction_to_applying_machine_learning/identify_key_insights_from_textual_document) contains comprehensive notebooks for five natural language processing tasks: Document Summarization, Text Classification, Question Answering, Named Entity Recognition, and Semantic Relation Extraction.
- [Synthetic Churn Prediction with Text](introduction_to_applying_machine_learning/synthetic_churn_prediction_with_text) contains an example notebook to train, deploy and use a churn prediction model that processes numerical, categorical, and textual features to make its prediction.

1 change: 1 addition & 0 deletions introduction_to_applying_machine_learning/README.md
@@ -17,4 +17,5 @@ These examples provide a gentle introduction to machine learning concepts as the
- [Traffic violations forecasting using DeepAR](deepar_chicago_traffic_violations) is an example that uses daily traffic violation data to predict patterns and seasonality with the Amazon DeepAR algorithm.
- [Visual Inspection Automation with Pre-trained Amazon SageMaker Models](visual_object_detection) is an example of fine-tuning pre-trained Amazon SageMaker models on a target dataset.
- [Create SageMaker Models Using the PyTorch Model Zoo](sagemaker_pytorch_model_zoo) contains an example notebook to create a SageMaker model leveraging the PyTorch Model Zoo and visualize the results.
- [Fraud Detection Using Graph Neural Networks](fraud_detection_using_graph_neural_networks) is an example of identifying fraudulent transactions from transaction and user identity datasets.
- [Identify key insights from textual document](identify_key_insights_from_textual_document) contains comprehensive notebooks for five natural language processing tasks: Document Summarization, Text Classification, Question Answering, Named Entity Recognition, and Semantic Relation Extraction.
@@ -0,0 +1,33 @@
import os

import pandas as pd


def get_data():
    data_prefix = "preprocessed-data/"

    if not os.path.exists(data_prefix):
        print("""Expected the following folder {} to contain the preprocessed data.
        Run data processing first in main notebook before running baselines comparisons""".format(data_prefix))
        return

    # Features and ID lists are written without headers; column 0 is the TransactionID.
    features = pd.read_csv(data_prefix + "features_xgboost.csv", header=None)
    labels = pd.read_csv(data_prefix + "tags.csv").set_index('TransactionID')
    valid_users = pd.read_csv(data_prefix + "validation.csv", header=None)
    test_users = pd.read_csv(data_prefix + "test.csv", header=None)

    # Inner joins on column 0 select the validation and test rows.
    valid_X = features.merge(valid_users, on=[0], how='inner')
    test_X = features.merge(test_users, on=[0], how='inner')

    # Training rows are all transactions not held out for validation or testing.
    train_index = ~(features[0].isin(test_users[0].values) | features[0].isin(valid_users[0].values))
    train_X = features[train_index]
    valid_y = labels.loc[valid_X[0]]
    test_y = labels.loc[test_X[0]]
    train_y = labels.loc[train_X[0]]

    train_X.set_index([0], inplace=True)
    valid_X.set_index([0], inplace=True)
    test_X.set_index([0], inplace=True)

    train_data = train_y.join(train_X)  # first column is the label 'isFraud'
    valid_data = valid_y.join(valid_X)
    test_data = test_y.join(test_X)
    return train_data, valid_data, test_data
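
A minimal usage sketch for the loader above (an illustrative addition, not part of this commit). It assumes the preprocessing step has already written preprocessed-data/, that a recent xgboost (1.6+) is installed, and that the file above is importable; the module name data_utils is a hypothetical stand-in, since file names are not shown in this view.

import xgboost as xgb

from data_utils import get_data  # hypothetical module name for the loader above

train_data, valid_data, test_data = get_data()

# The first column is the label 'isFraud'; the remaining columns are features.
train_y, train_X = train_data['isFraud'], train_data.drop(columns=['isFraud'])
valid_y, valid_X = valid_data['isFraud'], valid_data.drop(columns=['isFraud'])

# A small baseline; the hyperparameters are illustrative, not tuned values.
model = xgb.XGBClassifier(n_estimators=100, eval_metric='auc')
model.fit(train_X, train_y, eval_set=[(valid_X, valid_y)], verbose=False)
print('validation AUC:', model.evals_result()['validation_0']['auc'][-1])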
@@ -0,0 +1,88 @@
from faker import Faker
import datetime
import itertools
import numpy as np
import pandas as pd

# Fix both seeds so the synthetic dataset is reproducible.
Faker.seed(0)
np.random.seed(0)

NUM_UNIQUE_CCS = 40 * 10**3
START_TRANS_DATE = datetime.datetime(2012, 1, 15)
END_TRANS_DATE = datetime.datetime(2012, 3, 15)


def gen_fraud_data(num_unique_ccs=NUM_UNIQUE_CCS, start_trans_date=START_TRANS_DATE, end_trans_date=END_TRANS_DATE):
    fake = Faker()
    cc_nums = [fake.credit_card_number() for _ in range(num_unique_ccs)]
    cc_types = [fake.credit_card_provider() for _ in range(num_unique_ccs)]
    # Draw a skewed number of transactions per card from an exponential distribution.
    num_trans_per_cc = np.ceil(np.random.exponential(scale=3, size=num_unique_ccs)).astype(np.int32)
    cc_ipv4 = [fake.ipv4() for _ in range(num_unique_ccs)]
    cc_phone_number = [fake.phone_number() for _ in range(num_unique_ccs)]
    cc_device_id = [fake.msisdn() for _ in range(num_unique_ccs)]

    data = {
        'TransactionID': [fake.uuid4() for _ in range(sum(num_trans_per_cc))],
        'TransactionDT': [fake.date_time_between_dates(datetime_start=start_trans_date, datetime_end=end_trans_date)
                          for _ in range(sum(num_trans_per_cc))],
        'card_no': list(itertools.chain.from_iterable([[cc_num] * num_trans for cc_num, num_trans in zip(cc_nums, num_trans_per_cc)])),
        'card_type': list(itertools.chain.from_iterable([[card] * num_trans for card, num_trans in zip(cc_types, num_trans_per_cc)])),
        'email_domain': [fake.ascii_email().split("@")[1] for _ in range(sum(num_trans_per_cc))],
        'ProductCD': np.random.choice(['45', 'AB', 'L', 'Y', 'T'], size=sum(num_trans_per_cc)),
        'TransactionAmt': np.abs(np.ceil(np.random.exponential(scale=10, size=sum(num_trans_per_cc)) * 100)).astype(np.int32),
    }
    transactions = pd.DataFrame(data).sort_values(by=['TransactionDT'])

    # To make the number of observations in the identity table smaller than in the
    # transactions table, which may be more realistic in a practical scenario,
    # change the size argument below.
    identity_transactions_idx = np.random.choice(transactions.shape[0], size=int(transactions.shape[0] * 1.0), replace=False)
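    # For example, a hypothetical 80% sample (an illustrative value, not in this
    # commit) would leave some transactions without identity records:
    # identity_transactions_idx = np.random.choice(
    #     transactions.shape[0], size=int(transactions.shape[0] * 0.8), replace=False)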
    id_data = {
        'IpAddress': list(itertools.chain.from_iterable([[ipv4] * num_trans for ipv4, num_trans in zip(cc_ipv4, num_trans_per_cc)])),
        'PhoneNo': list(itertools.chain.from_iterable([[phone_num] * num_trans for phone_num, num_trans in zip(cc_phone_number, num_trans_per_cc)])),
        'DeviceID': list(itertools.chain.from_iterable([[device_id] * num_trans for device_id, num_trans in zip(cc_device_id, num_trans_per_cc)])),
    }
    identity = pd.DataFrame(id_data)
    identity["TransactionID"] = transactions.TransactionID
    assert identity.shape[0] == transactions.shape[0]

    # Subsample the identity table and keep TransactionID as the join key.
    identity = identity.loc[identity_transactions_idx]
    identity.reset_index(drop=True, inplace=True)
    identity = identity[["TransactionID", "IpAddress", "PhoneNo", "DeviceID"]]


    # Join the two tables for convenience when generating the label column 'isFraud'.
    full_two_df = transactions[["TransactionID", "card_no", "card_type", "email_domain", "ProductCD", "TransactionAmt"]].merge(identity, on='TransactionID', how='left')

    is_fraud = []
    for idx, row in full_two_df.iterrows():
        card_no, card_type, email = str(row["card_no"]), row["card_type"], row["email_domain"]
        product_type, transaction_amount = row["ProductCD"], row["TransactionAmt"]
        # Missing identity values become the string "nan" after str().
        ip_address, phone_no, device_id = str(row["IpAddress"]), str(row["PhoneNo"]), str(row["DeviceID"])

        if email in ["hotmail.com", "gmail.com", "yahoo.com"]:
            if product_type in ["45"]:
                is_fraud.append(int(np.random.uniform() < 0.9))
            else:
                if (device_id != "nan") and device_id.endswith(("16", "78", "23")):
                    is_fraud.append(int(np.random.uniform() < 0.1))
                else:
                    is_fraud.append(int(np.random.uniform() < 0.05))
        else:
            if transaction_amount > 3000:
                is_fraud.append(int(np.random.uniform() < 0.8))
            else:
                if card_type in ["Diners Club / Carte Blanche", "JCB 15 digit", "Maestro"]:  # about 35,000 observations fall in these categories
                    if card_no.endswith(("001", "002", "003", "004", "005", "007", "008", "009")) or ((phone_no != "nan") and phone_no.endswith((".227", ".104", ".251", ".181"))):
                        is_fraud.append(int(np.random.uniform() < 0.3))
                    else:
                        if (ip_address != "nan") and ip_address.endswith((".227", ".104", ".251", ".181")):
                            is_fraud.append(int(np.random.uniform() < 0.2))
                        else:
                            is_fraud.append(int(np.random.uniform() < 0.1))
                else:
                    is_fraud.append(int(np.random.uniform() < 0.0001))
    print("fraud ratio", sum(is_fraud) / len(is_fraud))

    transactions['isFraud'] = is_fraud
    return transactions, identity

if __name__ == '__main__':
    transaction, identity = gen_fraud_data()
    # Assumes the raw_data/ directory already exists.
    transaction.to_csv('raw_data/transaction.csv', index=False)
    identity.to_csv('raw_data/identity.csv', index=False)
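
For a quick local check, the generator can also be called in-process with a smaller card count and a shorter date range; a sketch with illustrative values (not taken from this commit), assuming gen_fraud_data above is in scope:

import datetime

transactions, identity = gen_fraud_data(
    num_unique_ccs=1000,
    start_trans_date=datetime.datetime(2012, 1, 15),
    end_trans_date=datetime.datetime(2012, 1, 31),
)
print(transactions.shape, identity.shape, transactions['isFraud'].mean())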
@@ -0,0 +1,144 @@
import argparse
import logging
import os

import pandas as pd
import numpy as np
from itertools import combinations


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--data-dir', type=str, default='/opt/ml/processing/input')
    parser.add_argument('--output-dir', type=str, default='/opt/ml/processing/output')
    parser.add_argument('--transactions', type=str, default='transaction.csv', help='name of file with transactions')
    parser.add_argument('--identity', type=str, default='identity.csv', help='name of file with identity info')
    parser.add_argument('--id-cols', type=str, default='', help='comma separated id cols in transactions table')
    parser.add_argument('--cat-cols', type=str, default='', help='comma separated categorical cols in transactions')
    parser.add_argument('--cat-cols-xgboost', type=str, default='', help='comma separated categorical cols that can be used as features for xgboost in transactions')
    parser.add_argument('--train-data-ratio', type=float, default=0.7, help='fraction of data to use in training set')
    parser.add_argument('--valid-data-ratio', type=float, default=0.2, help='fraction of data to use in validation set')
    parser.add_argument('--construct-homogeneous', action="store_true", default=False,
                        help='use bipartite graph edgelists to construct a homogeneous graph edgelist')
    return parser.parse_args()


def get_logger(name):
    logger = logging.getLogger(name)
    log_format = '%(asctime)s %(levelname)s %(name)s: %(message)s'
    logging.basicConfig(format=log_format, level=logging.INFO)
    logger.setLevel(logging.INFO)
    return logger


def load_data(data_dir, transaction_data, identity_data, train_data_ratio, valid_data_ratio, output_dir):
    transaction_df = pd.read_csv(os.path.join(data_dir, transaction_data))
    logging.info("Shape of transaction data is {}".format(transaction_df.shape))
    logging.info("# Tagged transactions: {}".format(len(transaction_df) - transaction_df.isFraud.isnull().sum()))

    identity_df = pd.read_csv(os.path.join(data_dir, identity_data))
    logging.info("Shape of identity data is {}".format(identity_df.shape))

    # Extract transactions for the train, validation, and test sets (split in time order).
    logging.info("Training, validation, and test data fractions are {}, {}, and {}, respectively".format(train_data_ratio, valid_data_ratio, 1 - train_data_ratio - valid_data_ratio))
    assert train_data_ratio + valid_data_ratio < 1, "The sum of the training and validation ratios must be less than 1."
    n_train = int(transaction_df.shape[0] * train_data_ratio)
    n_valid = int(transaction_df.shape[0] * (train_data_ratio + valid_data_ratio))
    valid_ids = transaction_df.TransactionID.values[n_train:n_valid]
    test_ids = transaction_df.TransactionID.values[n_valid:]

    get_fraud_frac = lambda series: 100 * sum(series) / len(series)
    logging.info("Percentage of fraud transactions for train data: {}".format(get_fraud_frac(transaction_df.isFraud[:n_train])))
    logging.info("Percentage of fraud transactions for validation data: {}".format(get_fraud_frac(transaction_df.isFraud[n_train:n_valid])))
    logging.info("Percentage of fraud transactions for test data: {}".format(get_fraud_frac(transaction_df.isFraud[n_valid:])))
    logging.info("Percentage of fraud transactions for all data: {}".format(get_fraud_frac(transaction_df.isFraud)))

    with open(os.path.join(output_dir, 'validation.csv'), 'w') as f:
        f.writelines(map(lambda x: str(x) + "\n", valid_ids))
    logging.info("Wrote validation data to file: {}".format(os.path.join(output_dir, 'validation.csv')))

    with open(os.path.join(output_dir, 'test.csv'), 'w') as f:
        f.writelines(map(lambda x: str(x) + "\n", test_ids))
    logging.info("Wrote test data to file: {}".format(os.path.join(output_dir, 'test.csv')))

    return transaction_df, identity_df, valid_ids, test_ids


def get_features_and_labels(transactions_df, transactions_id_cols, transactions_cat_cols, transactions_cat_cols_xgboost, output_dir):
    # Get features
    non_feature_cols = ['isFraud', 'TransactionDT'] + transactions_id_cols.split(",")
    feature_cols = [col for col in transactions_df.columns if col not in non_feature_cols]
    logging.info("Categorical columns: {}".format(transactions_cat_cols.split(",")))
    features = pd.get_dummies(transactions_df[feature_cols], columns=transactions_cat_cols.split(",")).fillna(0)
    # Log-transform the transaction amount to reduce skew.
    features['TransactionAmt'] = features['TransactionAmt'].apply(np.log10)
    logging.info("Transformed feature columns: {}".format(list(features.columns)))
    logging.info("Shape of features: {}".format(features.shape))
    features.to_csv(os.path.join(output_dir, 'features.csv'), index=False, header=False)
    logging.info("Wrote features to file: {}".format(os.path.join(output_dir, 'features.csv')))

    logging.info("Processing feature columns for XGBoost.")
    cat_cols_xgb = transactions_cat_cols_xgboost.split(",")
    logging.info("Categorical feature columns for XGBoost: {}".format(cat_cols_xgb))
    logging.info("Numerical feature column for XGBoost: 'TransactionAmt'")
    features_xgb = pd.get_dummies(transactions_df[['TransactionID'] + cat_cols_xgb], columns=cat_cols_xgb).fillna(0)
    features_xgb['TransactionAmt'] = features['TransactionAmt']
    features_xgb.to_csv(os.path.join(output_dir, 'features_xgboost.csv'), index=False, header=False)
    logging.info("Wrote features to file: {}".format(os.path.join(output_dir, 'features_xgboost.csv')))

    # Get labels
    transactions_df[['TransactionID', 'isFraud']].to_csv(os.path.join(output_dir, 'tags.csv'), index=False)
    logging.info("Wrote labels to file: {}".format(os.path.join(output_dir, 'tags.csv')))


def get_relations_and_edgelist(transactions_df, identity_df, transactions_id_cols, output_dir):
    # Get relations
    edge_types = transactions_id_cols.split(",") + list(identity_df.columns)
    logging.info("Found the following distinct relation types: {}".format(edge_types))
    id_cols = ['TransactionID'] + transactions_id_cols.split(",")
    full_identity_df = transactions_df[id_cols].merge(identity_df, on='TransactionID', how='left')
    logging.info("Shape of identity columns: {}".format(full_identity_df.shape))

    # Extract edges
    edges = {}
    for etype in edge_types:
        edgelist = full_identity_df[['TransactionID', etype]].dropna()
        edgelist.to_csv(os.path.join(output_dir, 'relation_{}_edgelist.csv').format(etype), index=False, header=True)
        logging.info("Wrote edgelist to: {}".format(os.path.join(output_dir, 'relation_{}_edgelist.csv').format(etype)))
        edges[etype] = edgelist
    return edges


def create_homogeneous_edgelist(edges, output_dir):
    homogeneous_edges = []
    for etype, relations in edges.items():
        # Connect every pair of transactions that share the same attribute value.
        for edge_relation, frame in relations.groupby(etype):
            new_edges = [(a, b) for (a, b) in combinations(frame.TransactionID.values, 2)
                         if (a, b) not in homogeneous_edges and (b, a) not in homogeneous_edges]
            homogeneous_edges.extend(new_edges)

    with open(os.path.join(output_dir, 'homogeneous_edgelist.csv'), 'w') as f:
        f.writelines(map(lambda x: "{}, {}\n".format(x[0], x[1]), homogeneous_edges))
    logging.info("Wrote homogeneous edgelist to file: {}".format(os.path.join(output_dir, 'homogeneous_edgelist.csv')))


if __name__ == '__main__':
    # Intentionally shadows the logging module with a Logger instance, so the
    # logging.info(...) calls in the functions above go through this logger.
    logging = get_logger(__name__)

    args = parse_args()

    transactions, identity, _, _ = load_data(args.data_dir,
                                             args.transactions,
                                             args.identity,
                                             args.train_data_ratio,
                                             args.valid_data_ratio,
                                             args.output_dir)

    get_features_and_labels(transactions, args.id_cols, args.cat_cols, args.cat_cols_xgboost, args.output_dir)
    relational_edges = get_relations_and_edgelist(transactions, identity, args.id_cols, args.output_dir)

    if args.construct_homogeneous:
        create_homogeneous_edgelist(relational_edges, args.output_dir)
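
The defaults for --data-dir and --output-dir target SageMaker Processing container paths (/opt/ml/processing/...), so a local run should override them. A sketch of a local invocation follows; the script file name and the column choices are illustrative assumptions based on the synthetic dataset above, not values taken from this commit:

python graph_data_preprocessor.py \
    --data-dir raw_data \
    --output-dir preprocessed-data \
    --id-cols card_no,card_type,email_domain \
    --cat-cols ProductCD \
    --cat-cols-xgboost card_type,ProductCD \
    --construct-homogeneous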


