Skip to content

Commit

Permalink
Training on artificial dataset. (#20)
Browse files Browse the repository at this point in the history
* Jupyter Notebooks with data prep and training on the artificially-generated dataset.
* Adjustment of prediction-service to new parameters.
  • Loading branch information
ugolowic authored Apr 15, 2021
1 parent 477eb15 commit dcb438c
Show file tree
Hide file tree
Showing 15 changed files with 8,063 additions and 72 deletions.
58 changes: 29 additions & 29 deletions prediction-service/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -45,34 +45,34 @@ curl -X 'POST' \
-H 'Content-Type: application/json' \
-d '{
"customer": {
"customer_id": 4444,
"age_range": "46-55",
"marital_status": "Married",
"family_size": 5,
"no_of_children": 3,
"income_bracket": 3,
"gender": "M",
"mean_discount_used": -1.83,
"total_discount_used": -7548.79,
"total_unique_items_bought": 2040,
"total_quantity_bought": 4314,
"mean_quantity_bought": 1.04,
"mean_selling_price_paid": 61.27,
"total_coupons_redeemed": 220,
"total_price_paid": 253295.6
"customer_id": 54,
"age": "old",
"credit": 1,
"gender": "F",
"mean_product_price": 13.63,
"unique_coupons_used": 369,
"mean_discount_used": 11.93,
"unique_items_bought": 1934,
"total_items_bought": 42265
},
"coupons": [
{
"coupon_id": 1111,
"item_selling_price": 70.88,
"coupon_discount": -35.62,
"item_category": "Men"
"coupon_id": 1,
"mean_item_selling_price": 7.06,
"coupon_discount": 50,
"category": "",
"how_many_products": 2,
"coupon_type": "buy_more",
"days_valid": 24
},
{
"coupon_id": 2323,
"item_selling_price": 75.51,
"coupon_discount": -26.71,
"item_category": "Sport"
"coupon_id": 2,
"mean_item_selling_price": 7.06,
"coupon_discount": 3.78,
"category": "",
"how_many_products": 2,
"coupon_type": "buy_all",
"days_valid": 20
}
]
}'
Expand All @@ -83,14 +83,14 @@ Example response:
```
[
{
"coupon_id": 1111,
"customer_id": 4444,
"prediction": 0.85
"coupon_id": 2,
"customer_id": 54,
"prediction": 0.0409353164
},
{
"coupon_id": 2323,
"customer_id": 4444,
"prediction": 0.8
"coupon_id": 1,
"customer_id": 54,
"prediction": 0.0311633506
}
]
```
Expand Down
49 changes: 28 additions & 21 deletions prediction-service/app/encoder.py
Original file line number Diff line number Diff line change
@@ -1,44 +1,51 @@
import pandas

# from .model import PredictionInput
from app.model import PredictionInput


class DataEncoder:
_age_range = {'18-25': 0, '26-35': 1, '36-45': 2, '46-55': 3, '56-70': 4, '70+': 5}
_marital_status = {'Married': 0, 'Single': 1}
_gender = {'F': 0, 'M': 1}
# Closed vocabularies for the one-hot encoded features: every entry becomes one
# '<prefix>_<value>' indicator key, emitted in the order listed here.
_gender = ['F', 'M']
_age = ['young', 'mid', 'old']
_categories = ['Boys', 'Girls', 'Men', 'Sports', 'Women']
# Spelling must match the `coupon_type` strings sent by clients (the README request
# example uses "buy_more" and "buy_all"); a misspelled entry can never equal the incoming
# value, which leaves its indicator column permanently 0.
# NOTE(review): confirm the trained model's feature names use the same spelling.
_coupon_types = ['buy_all', 'buy_more', 'department', 'just_discount']

# Build one feature row per coupon in `input.coupons`, combining the customer's attributes
# with that coupon's attributes, and return all rows as a pandas DataFrame.
# NOTE(review): this listing is a rendered diff — removed mapping lines ('age_range' ...
# 'total_selling_price_per_cust', 'item_selling_price') appear directly before the lines
# that replace them ('cust_*', 'coupon_*'); it is not a single runnable file state.
# NOTE(review): the parameter name `input` shadows the Python builtin of the same name.
@classmethod
def encode(cls, input: PredictionInput) -> pandas.DataFrame:
rows = []
for coupon in input.coupons:
row = {
'customer_id': input.customer.customer_id,
'age_range': cls._age_range[input.customer.age_range],
'marital_status': cls._marital_status[input.customer.marital_status],
'family_size': input.customer.family_size,
'no_of_children': input.customer.no_of_children,
'income_bracket': input.customer.income_bracket,
'gender': cls._gender[input.customer.gender],
'mean_discount_per_cust': input.customer.mean_discount_used,
'unique_items_per_cust': input.customer.total_unique_items_bought,
'mean_quantity_per_cust': input.customer.mean_quantity_bought,
'mean_selling_price_per_cust': input.customer.mean_selling_price_paid,
'total_discount_per_cust': input.customer.total_discount_used,
'total_coupons_used_per_cust': input.customer.total_coupons_redeemed,
'total_quantity_per_cust': input.customer.total_quantity_bought,
'total_selling_price_per_cust': input.customer.total_price_paid,
'cust_credit': input.customer.credit,
'cust_mean_product_price': input.customer.mean_product_price,
'cust_unique_coupons_used': input.customer.unique_coupons_used,
'cust_mean_discount': input.customer.mean_discount_used,
'cust_unique_products_bought': input.customer.unique_items_bought,
# NOTE(review): 'bougth' looks like a typo for 'bought'. The key becomes a DataFrame
# column name, so confirm what the model was trained with before renaming it.
'cust_total_products_bougth': input.customer.total_items_bought,
'coupon_id': coupon.coupon_id,
'coupon_discount': coupon.coupon_discount,
'item_selling_price': coupon.item_selling_price
'coupon_how_many': coupon.how_many_products,
'coupon_days_valid': coupon.days_valid,
'coupon_mean_prod_price': coupon.mean_item_selling_price
}
row.update(cls._encode_category(coupon.item_category))
# row.update(cls._encode_category(coupon.item_category))
# The one-hot helpers each return a dict of 0/1 indicator keys that is merged into the row.
row.update(cls._encode_age(input.customer.age))
row.update(cls._encode_gender(input.customer.gender))
row.update(cls._encode_coupon_type(coupon.coupon_type))
rows.append(row)

return pandas.DataFrame(rows)

@classmethod
def _encode_category(cls, category):
    """One-hot encode an item category.

    Returns a dict with one ``category_<name>`` key per entry of
    ``cls._categories``, in list order. The key whose name equals
    ``category`` is 1 and every other key is 0, so a value that is not
    in the list yields all zeros.
    """
    return {
        f'category_{name}': 1 if category == name else 0
        for name in cls._categories
    }

@classmethod
def _encode_age(cls, age):
    """One-hot encode a customer's age group.

    Returns a dict with one ``cust_age_<group>`` key per entry of
    ``cls._age``, in list order. The key whose group equals ``age`` is 1
    and every other key is 0, so an unlisted value yields all zeros.
    """
    return {
        f'cust_age_{group}': 1 if age == group else 0
        for group in cls._age
    }

@classmethod
def _encode_gender(cls, gender):
    """One-hot encode a customer's gender.

    Returns a dict with one ``cust_gender_<value>`` key per entry of
    ``cls._gender``, in list order. The key whose value equals ``gender``
    is 1 and every other key is 0, so an unlisted value yields all zeros.
    """
    return {
        f'cust_gender_{value}': 1 if gender == value else 0
        for value in cls._gender
    }

@classmethod
def _encode_coupon_type(cls, coupon_type):
    """One-hot encode a coupon's type.

    Returns a dict with one ``coupon_type_<kind>`` key per entry of
    ``cls._coupon_types``, in list order. The key whose kind equals
    ``coupon_type`` is 1 and every other key is 0, so a value that is not
    spelled exactly like a list entry yields all zeros.
    """
    return {
        f'coupon_type_{kind}': 1 if coupon_type == kind else 0
        for kind in cls._coupon_types
    }
5 changes: 0 additions & 5 deletions prediction-service/app/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,6 @@
)


@app.get('/')
async def root():
    """Respond to GET / with a fixed greeting payload."""
    return {'message': 'Hello World'}


@app.post('/score', response_model=List[PredictionOutput])
def score_coupon(
input_data: PredictionInput,
Expand Down
26 changes: 11 additions & 15 deletions prediction-service/app/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,28 +5,24 @@

# Model describing one coupon to be scored; DataEncoder.encode iterates these as
# `input.coupons`.
# NOTE(review): this listing is a rendered diff, so removed and added field lines appear
# together. The fields read by the encoder's added mapping are coupon_id, coupon_discount,
# how_many_products, days_valid, mean_item_selling_price and coupon_type.
class Coupon(BaseModel):
coupon_id: int
item_selling_price: float
mean_item_selling_price: float
coupon_discount: float
item_category: str # TODO Enum
category: str # TODO this is not used (for now)
how_many_products: int
coupon_type: str  # one-hot encoded against DataEncoder._coupon_types
days_valid: int


# Model describing the customer for whom coupons are scored; DataEncoder.encode reads it
# as `input.customer`.
# NOTE(review): this listing is a rendered diff, so removed and added field lines appear
# together. The fields read by the encoder's added mapping are customer_id, age, credit,
# gender, mean_product_price, unique_coupons_used, mean_discount_used,
# unique_items_bought and total_items_bought.
class Customer(BaseModel):
customer_id: int
age_range: str # TODO Enum
marital_status: str # TODO Enum
family_size: int
no_of_children: int
income_bracket: int
age: str # TODO Enum
credit: int
gender: str # TODO Enum
mean_product_price: float
unique_coupons_used: int
mean_discount_used: float
total_discount_used: float
total_unique_items_bought: int
total_quantity_bought: int
mean_quantity_bought: float
mean_selling_price_paid: float
total_coupons_redeemed: int
total_price_paid: float

unique_items_bought: int
total_items_bought: int

class PredictionInput(BaseModel):
customer: Customer
Expand Down
Binary file not shown.
4 changes: 2 additions & 2 deletions prediction-service/app/scorer.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,11 @@ def score(self, input_df: pandas.DataFrame) -> pandas.DataFrame:
input_df.drop(['customer_id', 'coupon_id'], axis=1, inplace=True)
probs = self._model.predict_proba(input_df)[:, 1]
output_df['prediction'] = probs.round(decimals=10)
output_df.sort_values(by='prediction', ascending=False)
output_df.sort_values(by='prediction', ascending=False, inplace=True)
return output_df


def get_scorer():
    """Load the pickled model from the model store and wrap it in a ``Scorer``.

    The path is relative, so it is resolved against the process's current
    working directory.
    """
    model_path = 'app/model_store/pickled_model_gbm_no_balancing'
    # SECURITY: pickle.load can execute arbitrary code embedded in the file. Only ever
    # point this at a model artifact produced and shipped by this project.
    with open(model_path, 'rb') as f:
        return Scorer(pickle.load(f))
3,523 changes: 3,523 additions & 0 deletions training-with-artificial-data/01_data_prep_v1.ipynb

Large diffs are not rendered by default.

Loading

0 comments on commit dcb438c

Please sign in to comment.