First submission.

steenrotsman · Apr 22, 2024 · d81e5f9 · d81e5f9
1 parent b900c0c
commit d81e5f9
Show file tree

Hide file tree

Showing 3 changed files with 109 additions and 29 deletions.
diff --git a/model.joblib b/model.joblib
diff --git a/submission.py b/submission.py
@@ -15,9 +15,16 @@
 run.py can be used to test your submission.
 """
 
+from os.path import join
+
 import joblib
 import pandas as pd
 
+CODEBOOK_PATH = join("PreFer", "codebooks", "PreFer_codebook.csv")  # relative path; read by clean_df
+KWARGS = {"encoding": "latin-1", "encoding_errors": "replace", "low_memory": False}  # keyword arguments for pd.read_csv
+SURVEYS = ["cf", "ca", "cd", "ci", "ch", "cp", "gr", "cv", "he", "ma", "cr", "cs", "cw"]  # two-letter column-name prefixes matched in clean_df
+WAVES = [f"{x:02}{chr(x+89)}" for x in range(8, 21)]  # wave codes "08a", "09b", ..., "20m"
+
 
 def clean_df(df, background_df=None):
     """
@@ -31,24 +38,52 @@ def clean_df(df, background_df=None):
     Returns:
     pd.DataFrame: The cleaned dataframe with only the necessary columns and processed variables.
     """
+    codebook = pd.read_csv(CODEBOOK_PATH, **KWARGS)
 
-    ## This script contains a bare minimum working example
-    # Create new variable with age
-    df["age"] = 2024 - df["birthyear_bg"]
+    # Load metadata
+    meta = codebook[codebook["dataset"] == "PreFer_train_data.csv"]
+    meta_bg = codebook[
+        (codebook["dataset"] == "PreFer_train_background_data.csv")
+        | (codebook["var_name"] == "nomem_encr")
+    ]
+    categorical_data = meta[meta["type_var"] == "categorical"]["var_name"].tolist()
+    categorical_bg = meta_bg[meta_bg["type_var"] == "categorical"]["var_name"].tolist()
+    numeric_data = meta[meta["type_var"] == "numeric"]["var_name"].tolist()
+    numeric_bg = meta_bg[meta_bg["type_var"] == "numeric"]["var_name"].tolist()
+
+    # Keep only categorical and numeric columns
+    df = df[categorical_data + numeric_data]
+    background_df = background_df[categorical_bg + numeric_bg]
+
+    # Pivot waves wide to long
+    df_long = df.melt(
+        "nomem_encr", [f"cf{wave}_m" for wave in WAVES], "wavecode", "wave"
+    )
+    df_long = df_long.dropna()
+    df_long["wavecode"] = df_long["wavecode"].str.extract(r"(\d\d\w)")
+    df_long["wave"] = df_long["wave"].astype("int")
 
-    # Imputing missing values in age with the mean
-    df["age"] = df["age"].fillna(df["age"].mean())
+    # Add background info
+    df_long = pd.merge(df_long, background_df, how="left", on=["nomem_encr", "wave"])
 
-    # Selecting variables for modelling
-    keepcols = [
-        "nomem_encr",  # ID variable required for predictions,
-        "age",  # newly created variable
-    ]
+    # Pivot wave-specific variables wide to long
+    for code in SURVEYS:
+        # Question code xxx leaves room for 1000 questions
+        for i in range(1000):
+            value_vars = [
+                c for c in df.columns if c[:2] == code and c[-3:] == f"{i:03}"
+            ]
+            if value_vars:
+                tmp = df.melt("nomem_encr", value_vars, "wavecode", f"{code}{i:03}")
+                tmp["wavecode"] = tmp["wavecode"].str.extract(r"(\d\d\w)")
+                df_long = pd.merge(
+                    df_long, tmp, how="left", on=["nomem_encr", "wavecode"]
+                )
 
-    # Keeping data with variables selected
-    df = df[keepcols]
+    # Impute NAs with -1 (fillna returns a new frame, so rebind the result)
+    df_long = df_long.fillna(-1)
 
-    return df
+    return df_long
 
 
 def predict_outcomes(df, background_df=None, model_path="model.joblib"):
@@ -83,15 +118,26 @@ def predict_outcomes(df, background_df=None, model_path="model.joblib"):
     df = clean_df(df, background_df)
 
     # Exclude the variable nomem_encr if this variable is NOT in your model
-    vars_without_id = df.columns[df.columns != 'nomem_encr']
+    vars_without_id = df.drop(
+        ["nomem_encr", "wavecode", "wave", "nohouse_encr"], axis=1
+    )
 
-    # Generate predictions from model, should be 0 (no child) or 1 (had child)
-    predictions = model.predict(df[vars_without_id])
+    # Generate probability predictions that individual had a child
+    predictions = model.predict_proba(vars_without_id)[:, 1]
 
     # Output file should be DataFrame with two columns, nomem_encr and predictions
     df_predict = pd.DataFrame(
         {"nomem_encr": df["nomem_encr"], "prediction": predictions}
     )
 
+    # Combine predictions for individual
+    df_predict = (
+        df_predict.groupby("nomem_encr")["prediction"]
+        .prod()
+        .round()
+        .astype(int)
+        .reset_index()
+    )
+
     # Return only dataset with predictions and identifier
     return df_predict
diff --git a/training.py b/training.py
@@ -8,11 +8,30 @@
 number of folds, model, et cetera
 """
 
-import random
+import argparse
 
 import joblib
 import pandas as pd
-from sklearn.linear_model import LogisticRegression
+from lightgbm import LGBMClassifier
+from sklearn.metrics import f1_score
+from sklearn.model_selection import train_test_split
+
+from submission import KWARGS, clean_df
+
+parser = argparse.ArgumentParser(description="Train model.")
+parser.add_argument("data_path", help="Path to data CSV file.")
+parser.add_argument("background_data_path", help="Path to background data CSV file.")
+parser.add_argument("ground_truth_path", help="Path to ground truth data CSV file.")
+args = parser.parse_args()  # NOTE: module level, so this runs (and reads sys.argv) on import as well as on script execution
+
+
+def main():
+    """Read the three CSV files named on the command line and train the model."""
+    survey = pd.read_csv(args.data_path, **KWARGS)
+    background = pd.read_csv(args.background_data_path, **KWARGS)
+    outcome = pd.read_csv(args.ground_truth_path)
+    cleaned = clean_df(survey, background)
+    train_save_model(cleaned, outcome)
 
 
 def train_save_model(cleaned_df, outcome_df):
@@ -23,21 +42,36 @@ def train_save_model(cleaned_df, outcome_df):
     cleaned_df (pd.DataFrame): The cleaned data from clean_df function to be used for training the model.
     outcome_df (pd.DataFrame): The data with the outcome variable (e.g., from PreFer_train_outcome.csv or PreFer_fake_outcome.csv).
     """
-
-    ## This script contains a bare minimum working example
-    random.seed(1)  # not useful here because logistic regression deterministic
-
     # Combine cleaned_df and outcome_df
     model_df = pd.merge(cleaned_df, outcome_df, on="nomem_encr")
 
     # Filter cases for whom the outcome is not available
-    model_df = model_df[~model_df['new_child'].isna()]
+    model_df = model_df[~model_df["new_child"].isna()]
 
-    # Logistic regression model
-    model = LogisticRegression()
+    # Split into X and y
+    X = model_df.drop(
+        ["nomem_encr", "new_child", "wavecode", "wave", "nohouse_encr"], axis=1
+    )
+    y = model_df["new_child"]
 
-    # Fit the model
-    model.fit(model_df[['age']], model_df['new_child'])
-
-    # Save the model
+    # Classifier model
+    model = LGBMClassifier(verbose=-1, random_seed=123)
+    model.fit(X, y)
     joblib.dump(model, "model.joblib")
+
+    # Get estimate of score
+    X1, X2, y1, y2 = train_test_split(X, y, test_size=0.5, stratify=y, random_state=123)
+
+    model.fit(X1, y1)
+    y_pred = model.predict(X2)
+    print(f1_score(y2, y_pred))
+
+    model.fit(X2, y2)
+    y_pred = model.predict(X1)
+    print(f1_score(y1, y_pred))
+    # 0.7675988428158148
+    # 0.7977422389463782
+
+
+if __name__ == "__main__":
+    main()