Training on artificial dataset. (#20)

* Jupyter Notebooks with data prep and training on the artificially generated dataset.
* Adjustment of prediction-service to new parameters.
polszewska · Apr 15, 2021 · dcb438c · dcb438c
1 parent 477eb15
commit dcb438c
Show file tree

Hide file tree

Showing 15 changed files with 8,063 additions and 72 deletions.
diff --git a/prediction-service/README.md b/prediction-service/README.md
@@ -45,34 +45,34 @@ curl -X 'POST' \
   -H 'Content-Type: application/json' \
   -d '{
   "customer": {
-    "customer_id": 4444,
-    "age_range": "46-55",
-    "marital_status": "Married",
-    "family_size": 5,
-    "no_of_children": 3,
-    "income_bracket": 3,
-    "gender": "M",
-    "mean_discount_used": -1.83,
-    "total_discount_used": -7548.79,
-    "total_unique_items_bought": 2040,
-    "total_quantity_bought": 4314,
-    "mean_quantity_bought": 1.04,
-    "mean_selling_price_paid": 61.27,
-    "total_coupons_redeemed": 220,
-    "total_price_paid": 253295.6
+    "customer_id": 54,
+    "age": "old",
+    "credit": 1,
+    "gender": "F",
+    "mean_product_price": 13.63,
+    "unique_coupons_used": 369,
+    "mean_discount_used": 11.93,
+    "unique_items_bought": 1934,
+    "total_items_bought": 42265
   },
   "coupons": [
     {
-      "coupon_id": 1111,
-      "item_selling_price": 70.88,
-      "coupon_discount": -35.62,
-      "item_category": "Men"
+      "coupon_id": 1,
+      "mean_item_selling_price": 7.06,
+      "coupon_discount": 50,
+      "category": "",
+      "how_many_products": 2,
+      "coupon_type": "buy_more",
+      "days_valid": 24
     },
     {
-      "coupon_id": 2323,
-      "item_selling_price": 75.51,
-      "coupon_discount": -26.71,
-      "item_category": "Sport"
+      "coupon_id": 2,
+      "mean_item_selling_price": 7.06,
+      "coupon_discount": 3.78,
+      "category": "",
+      "how_many_products": 2,
+      "coupon_type": "buy_all",
+      "days_valid": 20
     }
   ]
 }'
@@ -83,14 +83,14 @@ Example response:
 ```
 [
   {
-    "coupon_id": 1111,
-    "customer_id": 4444,
-    "prediction": 0.85
+    "coupon_id": 2,
+    "customer_id": 54,
+    "prediction": 0.0409353164
   },
   {
-    "coupon_id": 2323,
-    "customer_id": 4444,
-    "prediction": 0.8
+    "coupon_id": 1,
+    "customer_id": 54,
+    "prediction": 0.0311633506
   }
 ]
 ```

diff --git a/prediction-service/app/encoder.py b/prediction-service/app/encoder.py
@@ -1,44 +1,51 @@
 import pandas
 
-# from .model import PredictionInput
 from app.model import PredictionInput
 
 
 class DataEncoder:
-    _age_range = {'18-25': 0, '26-35': 1, '36-45': 2, '46-55': 3, '56-70': 4, '70+': 5}
-    _marital_status = {'Married': 0, 'Single': 1}
-    _gender = {'F': 0, 'M': 1}
+    _gender = ['F', 'M']
+    _age = ['young', 'mid', 'old']
     _categories = ['Boys', 'Girls', 'Men', 'Sports', 'Women']
+    _coupon_types = ['buy_all', 'buy_more', 'department', 'just_discount']
 
     @classmethod
     def encode(cls, input: PredictionInput) -> pandas.DataFrame:
         rows = []
         for coupon in input.coupons:
             row = {
                 'customer_id': input.customer.customer_id,
-                'age_range': cls._age_range[input.customer.age_range],
-                'marital_status': cls._marital_status[input.customer.marital_status],
-                'family_size': input.customer.family_size,
-                'no_of_children': input.customer.no_of_children,
-                'income_bracket': input.customer.income_bracket,
-                'gender': cls._gender[input.customer.gender],
-                'mean_discount_per_cust': input.customer.mean_discount_used,
-                'unique_items_per_cust': input.customer.total_unique_items_bought,
-                'mean_quantity_per_cust': input.customer.mean_quantity_bought,
-                'mean_selling_price_per_cust': input.customer.mean_selling_price_paid,
-                'total_discount_per_cust': input.customer.total_discount_used,
-                'total_coupons_used_per_cust': input.customer.total_coupons_redeemed,
-                'total_quantity_per_cust': input.customer.total_quantity_bought,
-                'total_selling_price_per_cust': input.customer.total_price_paid,
+                'cust_credit': input.customer.credit,
+                'cust_mean_product_price': input.customer.mean_product_price,
+                'cust_unique_coupons_used': input.customer.unique_coupons_used,
+                'cust_mean_discount': input.customer.mean_discount_used,
+                'cust_unique_products_bought': input.customer.unique_items_bought,
+                'cust_total_products_bougth': input.customer.total_items_bought,
                 'coupon_id': coupon.coupon_id,
                 'coupon_discount': coupon.coupon_discount,
-                'item_selling_price': coupon.item_selling_price
+                'coupon_how_many': coupon.how_many_products,
+                'coupon_days_valid': coupon.days_valid,
+                'coupon_mean_prod_price': coupon.mean_item_selling_price
             }
-            row.update(cls._encode_category(coupon.item_category))
+            # TODO: category is not used as a feature (for now); re-enable with cls._encode_category(coupon.category)
+            row.update(cls._encode_age(input.customer.age))
+            row.update(cls._encode_gender(input.customer.gender))
+            row.update(cls._encode_coupon_type(coupon.coupon_type))
             rows.append(row)
-
         return pandas.DataFrame(rows)
 
     @classmethod
     def _encode_category(cls, category):
         return {f'category_{c}': 1 if category == c else 0 for c in cls._categories}
+
+    @classmethod
+    def _encode_age(cls, age):
+        return {f'cust_age_{a}': 1 if age == a else 0 for a in cls._age}
+
+    @classmethod
+    def _encode_gender(cls, gender):
+        return {f'cust_gender_{g}': 1 if gender == g else 0 for g in cls._gender}
+
+    @classmethod
+    def _encode_coupon_type(cls, coupon_type):
+        return {f'coupon_type_{t}': 1 if coupon_type == t else 0 for t in cls._coupon_types}
diff --git a/prediction-service/app/main.py b/prediction-service/app/main.py
@@ -15,11 +15,6 @@
 )
 
 
-@app.get('/')
-async def root():
-    return {'message': 'Hello World'}
-
-
 @app.post('/score', response_model=List[PredictionOutput])
 def score_coupon(
     input_data: PredictionInput,

diff --git a/prediction-service/app/model.py b/prediction-service/app/model.py
@@ -5,28 +5,24 @@
 
 class Coupon(BaseModel):
     coupon_id: int
-    item_selling_price: float
+    mean_item_selling_price: float
     coupon_discount: float
-    item_category: str  # TODO Enum
+    category: str  # TODO this is not used (for now)
+    how_many_products: int
+    coupon_type: str
+    days_valid: int
 
 
 class Customer(BaseModel):
     customer_id: int
-    age_range: str  # TODO Enum
-    marital_status: str # TODO Enum
-    family_size: int
-    no_of_children: int
-    income_bracket: int
+    age: str  # TODO Enum
+    credit: int
     gender: str  # TODO Enum
+    mean_product_price: float
+    unique_coupons_used: int
     mean_discount_used: float
-    total_discount_used: float
-    total_unique_items_bought: int
-    total_quantity_bought: int
-    mean_quantity_bought: float
-    mean_selling_price_paid: float
-    total_coupons_redeemed: int
-    total_price_paid: float
-
+    unique_items_bought: int
+    total_items_bought: int
 
 class PredictionInput(BaseModel):
     customer: Customer

diff --git a/prediction-service/app/model_store/pickled_model_gbm_no_balancing b/prediction-service/app/model_store/pickled_model_gbm_no_balancing
diff --git a/prediction-service/app/scorer.py b/prediction-service/app/scorer.py
@@ -14,11 +14,11 @@ def score(self, input_df: pandas.DataFrame) -> pandas.DataFrame:
         input_df.drop(['customer_id', 'coupon_id'], axis=1, inplace=True)
         probs = self._model.predict_proba(input_df)[:, 1]
         output_df['prediction'] = probs.round(decimals=10)
-        output_df.sort_values(by='prediction', ascending=False)
+        output_df.sort_values(by='prediction', ascending=False, inplace=True)
         return output_df
 
 
 def get_scorer():
-    model_path = 'app/model_store/scikit_classifier'
+    model_path = 'app/model_store/pickled_model_gbm_no_balancing'
     with open(model_path, 'rb') as f:
         return Scorer(pickle.load(f))
diff --git a/training-with-artificial-data/01_data_prep_v1.ipynb b/training-with-artificial-data/01_data_prep_v1.ipynb