Skip to content

Commit

Permalink
First submission.
Browse files Browse the repository at this point in the history
  • Loading branch information
steenrotsman committed Apr 22, 2024
1 parent b900c0c commit d81e5f9
Show file tree
Hide file tree
Showing 3 changed files with 109 additions and 29 deletions.
Binary file modified model.joblib
Binary file not shown.
78 changes: 62 additions & 16 deletions submission.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,16 @@
run.py can be used to test your submission.
"""

from os.path import join

import joblib
import pandas as pd

# Location of the PreFer codebook CSV, relative to the working directory.
CODEBOOK_PATH = join("PreFer", "codebooks", "PreFer_codebook.csv")

# Keyword arguments passed to pd.read_csv when loading the PreFer CSV files.
KWARGS = dict(encoding="latin-1", encoding_errors="replace", low_memory=False)

# Two-letter column-name prefixes whose question columns are pivoted to long.
SURVEYS = "cf ca cd ci ch cp gr cv he ma cr cs cw".split()

# Wave codes "08a" through "20m": a zero-padded number followed by a letter.
WAVES = [
    f"{number:02}{letter}"
    for number, letter in zip(range(8, 21), "abcdefghijklm")
]


def clean_df(df, background_df=None):
    """
    Reshape the raw survey data to long format: one row per respondent and wave.

    Only variables that the codebook at CODEBOOK_PATH types as "categorical"
    or "numeric" are kept. The "cf{wave}_m" columns are melted into long rows
    (rows with a missing value are dropped); their value is stored as the
    integer column "wave" and, together with "nomem_encr", is the key on which
    the background data is left-joined. Every wave-specific question column
    is then melted and left-joined on "nomem_encr" and "wavecode".

    Parameters:
    df (pd.DataFrame): The input dataframe containing the raw survey data.
    background_df (pd.DataFrame): The background data. The default of None is
        kept for signature compatibility, but this implementation needs the
        background data and raises if it is missing.

    Returns:
    pd.DataFrame: The cleaned dataframe with only the necessary columns and processed variables.

    Raises:
    TypeError: If background_df is None.
    """
    if background_df is None:
        # Subscripting None below would raise the same exception type, but
        # with an opaque "'NoneType' object is not subscriptable" message.
        raise TypeError("clean_df requires background_df, got None")

    codebook = pd.read_csv(CODEBOOK_PATH, **KWARGS)

    # Load metadata
    meta = codebook[codebook["dataset"] == "PreFer_train_data.csv"]
    meta_bg = codebook[
        (codebook["dataset"] == "PreFer_train_background_data.csv")
        | (codebook["var_name"] == "nomem_encr")
    ]
    categorical_data = meta[meta["type_var"] == "categorical"]["var_name"].tolist()
    categorical_bg = meta_bg[meta_bg["type_var"] == "categorical"]["var_name"].tolist()
    numeric_data = meta[meta["type_var"] == "numeric"]["var_name"].tolist()
    numeric_bg = meta_bg[meta_bg["type_var"] == "numeric"]["var_name"].tolist()

    # Keep only categorical and numeric columns
    df = df[categorical_data + numeric_data]
    background_df = background_df[categorical_bg + numeric_bg]

    # Pivot waves wide to long
    df_long = df.melt(
        "nomem_encr", [f"cf{wave}_m" for wave in WAVES], "wavecode", "wave"
    )
    df_long = df_long.dropna()
    df_long["wavecode"] = df_long["wavecode"].str.extract(r"(\d\d\w)")
    df_long["wave"] = df_long["wave"].astype("int")

    # Add background info
    df_long = pd.merge(df_long, background_df, how="left", on=["nomem_encr", "wave"])

    # Group the wave-specific columns once, by survey prefix and three-digit
    # question code, instead of rescanning every column for each possible
    # (prefix, code) pair. Dicts keep insertion order, so each group lists its
    # columns in the same order as df.columns.
    question_cols = {code: {} for code in SURVEYS}
    for col in df.columns:
        prefix, suffix = col[:2], col[-3:]
        if (
            prefix in question_cols
            and len(suffix) == 3
            and suffix.isascii()
            and suffix.isdecimal()
        ):
            question_cols[prefix].setdefault(suffix, []).append(col)

    # Pivot wave-specific variables wide to long
    for code in SURVEYS:
        # Zero-padded codes sort lexicographically in numeric order, so the
        # merged columns appear in the order 000, 001, ..., 999 per survey.
        for suffix in sorted(question_cols[code]):
            tmp = df.melt(
                "nomem_encr", question_cols[code][suffix], "wavecode", f"{code}{suffix}"
            )
            tmp["wavecode"] = tmp["wavecode"].str.extract(r"(\d\d\w)")
            df_long = pd.merge(
                df_long, tmp, how="left", on=["nomem_encr", "wavecode"]
            )

    # Impute NAs with -1. fillna returns a new frame, so the result has to be
    # assigned; the bare call left every missing value in place.
    # NOTE(review): a model fitted on the un-imputed output should be retrained.
    df_long = df_long.fillna(-1)

    return df_long


def predict_outcomes(df, background_df=None, model_path="model.joblib"):
Expand Down Expand Up @@ -83,15 +118,26 @@ def predict_outcomes(df, background_df=None, model_path="model.joblib"):
df = clean_df(df, background_df)

# Exclude the variable nomem_encr if this variable is NOT in your model
vars_without_id = df.columns[df.columns != 'nomem_encr']
vars_without_id = df.drop(
["nomem_encr", "wavecode", "wave", "nohouse_encr"], axis=1
)

# Generate predictions from model, should be 0 (no child) or 1 (had child)
predictions = model.predict(df[vars_without_id])
# Generate probability predictions that individual had a child
predictions = model.predict_proba(vars_without_id)[:, 1]

# Output file should be DataFrame with two columns, nomem_encr and predictions
df_predict = pd.DataFrame(
{"nomem_encr": df["nomem_encr"], "prediction": predictions}
)

# Combine predictions for individual
df_predict = (
df_predict.groupby("nomem_encr")["prediction"]
.prod()
.round()
.astype(int)
.reset_index()
)

# Return only dataset with predictions and identifier
return df_predict
60 changes: 47 additions & 13 deletions training.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,30 @@
number of folds, model, et cetera
"""

import random
import argparse

import joblib
import pandas as pd
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

from submission import KWARGS, clean_df

def _build_parser():
    """Create the command-line parser for the three input CSV paths."""
    parser = argparse.ArgumentParser(description="Train model.")
    parser.add_argument("data_path", help="Path to data CSV file.")
    parser.add_argument(
        "background_data_path", help="Path to background data CSV file."
    )
    parser.add_argument(
        "ground_truth_path", help="Path to ground truth data CSV file."
    )
    return parser


def main(argv=None):
    """
    Load the input CSV files, clean the data and train and save the model.

    Arguments are parsed here rather than at module level, so importing this
    module (e.g. to reuse train_save_model) neither reads sys.argv nor exits
    the interpreter when the positional arguments are absent.

    Parameters:
    argv (list[str] | None): Command-line arguments without the program name.
        None (the default) makes argparse read sys.argv[1:].
    """
    args = _build_parser().parse_args(argv)

    df = pd.read_csv(args.data_path, **KWARGS)
    bg = pd.read_csv(args.background_data_path, **KWARGS)
    # The ground truth file is read with pandas' default settings.
    gt = pd.read_csv(args.ground_truth_path)

    data = clean_df(df, bg)
    train_save_model(data, gt)


def train_save_model(cleaned_df, outcome_df):
    """
    Train a LightGBM classifier on the cleaned data and save it as "model.joblib".

    After the model is saved, a two-fold estimate of the F1 score is printed:
    the respondents are split into two halves, a separate classifier is fitted
    on each half and scored on the other.

    Parameters:
    cleaned_df (pd.DataFrame): The cleaned data from clean_df function to be used for training the model.
    outcome_df (pd.DataFrame): The data with the outcome variable (e.g., from PreFer_train_outcome.csv or PreFer_fake_outcome.csv).
    """
    # Combine cleaned_df and outcome_df
    model_df = pd.merge(cleaned_df, outcome_df, on="nomem_encr")

    # Filter cases for whom the outcome is not available
    model_df = model_df[model_df["new_child"].notna()]

    # Split into X and y
    X = model_df.drop(
        ["nomem_encr", "new_child", "wavecode", "wave", "nohouse_encr"], axis=1
    )
    y = model_df["new_child"]

    # Classifier model, fitted on all rows and saved before scoring so the
    # artifact exists even if the score estimate below fails.
    model = LGBMClassifier(verbose=-1, random_seed=123)
    model.fit(X, y)
    joblib.dump(model, "model.joblib")

    # Get estimate of score.
    # cleaned_df holds one row per respondent and wave, so a row-level split
    # puts the same respondent in both halves and inflates the score. Split
    # the respondent ids instead and assign all of a respondent's rows to the
    # same half.
    # NOTE(review): assumes new_child is constant per nomem_encr; the first
    # value per respondent is used for stratification.
    persons = model_df[["nomem_encr", "new_child"]].drop_duplicates("nomem_encr")
    ids1, _ = train_test_split(
        persons["nomem_encr"],
        test_size=0.5,
        stratify=persons["new_child"],
        random_state=123,
    )
    in_first = model_df["nomem_encr"].isin(ids1)
    X1, y1 = X[in_first], y[in_first]
    X2, y2 = X[~in_first], y[~in_first]

    # A separate estimator keeps `model` identical to what was written to disk.
    scorer = LGBMClassifier(verbose=-1, random_seed=123)

    scorer.fit(X1, y1)
    print(f1_score(y2, scorer.predict(X2)))

    scorer.fit(X2, y2)
    print(f1_score(y1, scorer.predict(X1)))


if __name__ == "__main__":
main()

0 comments on commit d81e5f9

Please sign in to comment.