From 2bac71d88801602fbe4018fbb3f33fe9adbbf4e2 Mon Sep 17 00:00:00 2001 From: ColtAllen Date: Mon, 11 Dec 2023 11:41:25 -0700 Subject: [PATCH] dev notebook added --- .../clv/dev/beta_geo_beta_binom.ipynb | 383 ++++++++++++++++++ 1 file changed, 383 insertions(+) create mode 100644 docs/source/notebooks/clv/dev/beta_geo_beta_binom.ipynb diff --git a/docs/source/notebooks/clv/dev/beta_geo_beta_binom.ipynb b/docs/source/notebooks/clv/dev/beta_geo_beta_binom.ipynb new file mode 100644 index 000000000..bc48cfa9c --- /dev/null +++ b/docs/source/notebooks/clv/dev/beta_geo_beta_binom.ipynb @@ -0,0 +1,383 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "5e06e043-4631-47ae-a658-a9a928ff15e5", + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "from lifetimes import BetaGeoBetaBinomFitter\n", + "from lifetimes.datasets import load_donations, load_cdnow_summary_data_with_monetary_value" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "8ff42e71-fc43-4d45-8446-7205e3d37bce", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([ -3.94031398, -10.25427751, -6.82582822])" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "value = np.array([[1.5, 1], [5.3, 4], [6, 2]])\n", + "alpha = 0.55\n", + "beta = 10.58\n", + "gamma = 0.61\n", + "delta = 11.67\n", + "T = 12\n", + "\n", + "BetaGeoBetaBinomFitter._loglikelihood((alpha, beta, gamma, delta), value[..., 1], value[..., 0], T)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "371bb7a1-9f5c-4bdf-81b1-a9504261badb", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "discrete_noncontract_df = load_donations()\n", + "\n", + "periods = 6\n", + "bgbb = 
BetaGeoBetaBinomFitter().fit(discrete_noncontract_df['frequency'].values,\n", + " discrete_noncontract_df['recency'].values,\n", + " discrete_noncontract_df['periods'].values,\n", + " discrete_noncontract_df['weights'].values)\n", + "bgbb" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "f8052f01-5aca-48fd-917e-1f2bbeb6326f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['conditional_expected_number_of_purchases_up_to_time', 'conditional_probability_alive', 'expected_number_of_transactions_in_first_n_periods', 'fit', 'load_model', 'save_model', 'summary']\n" + ] + } + ], + "source": [ + "method_list = [method for method in dir(BetaGeoBetaBinomFitter) if not method.startswith('_')]\n", + "print(method_list)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "f4f56b83-f830-4610-8f4f-e67ff242967e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 0.072863\n", + "1 0.085696\n", + "2 0.314238\n", + "3 0.593853\n", + "4 0.839396\n", + "5 1.021689\n", + "6 1.147885\n", + "7 0.119121\n", + "8 0.536111\n", + "9 1.057604\n", + "10 1.443042\n", + "11 1.668817\n", + "12 0.223595\n", + "13 1.034572\n", + "14 1.804703\n", + "15 2.189749\n", + "16 0.583192\n", + "17 2.030024\n", + "18 2.710681\n", + "19 1.812942\n", + "20 3.231612\n", + "21 3.752544\n", + "dtype: float64" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# equation 13 in paper\n", + "bgbb.conditional_expected_number_of_purchases_up_to_time(5,\n", + " discrete_noncontract_df['frequency'],\n", + " discrete_noncontract_df['recency'],\n", + " discrete_noncontract_df['periods'])" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "7ea5dc42-160f-4f96-9e16-a97f01dd4bdc", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 0.070072\n", + "1 0.045012\n", + "2 0.165056\n", + "3 0.311927\n", + "4 0.440900\n", + "5 
0.536651\n", + "6 0.602936\n", + "7 0.043038\n", + "8 0.193695\n", + "9 0.382108\n", + "10 0.521365\n", + "11 0.602936\n", + "12 0.061566\n", + "13 0.284864\n", + "14 0.496916\n", + "15 0.602936\n", + "16 0.129719\n", + "17 0.451538\n", + "18 0.602936\n", + "19 0.338249\n", + "20 0.602936\n", + "21 0.602936\n", + "dtype: float64" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# equation 11 in paper\n", + "bgbb.conditional_probability_alive(10,\n", + " discrete_noncontract_df['frequency'],\n", + " discrete_noncontract_df['recency'],\n", + " discrete_noncontract_df['periods'])" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "96bcab46-7279-400a-82c1-e8b509ece774", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
model
frequency
03195.925987
11560.549020
2964.135361
3668.795916
4497.960966
5389.113685
6314.983874
\n", + "
" + ], + "text/plain": [ + " model\n", + "frequency \n", + "0 3195.925987\n", + "1 1560.549020\n", + "2 964.135361\n", + "3 668.795916\n", + "4 497.960966\n", + "5 389.113685\n", + "6 314.983874" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# TODO: write and test (8) as a replacement. Compare against just aggregating means across the exploded DF \n", + "# TODO: Can the arviz functions in the BetaGeoBetaBinom distribution block preclude the need for this?\n", + "# TODO: Replace this with (9) or (10) in a future PR, since that expression can predict interval ranges\n", + "\n", + "# equation 7 in paper, but that's for probabilities. should it be 8 for predicting mean n?\n", + "# yeah, this function should be renamed for clarity. \n", + "# it distributes customers in the dataset across n transaction opportunities\n", + "# it works better as an evaluation function, since it assumes a fixed customer population size\n", + "# if n > n_periods, it will keep right on predicting. This may be a bug\n", + "bgbb.expected_number_of_transactions_in_first_n_periods(n=50)" + ] + }, + { + "cell_type": "markdown", + "id": "9d55e986-d1f2-4c0d-8c25-3e289e90d5fe", + "metadata": {}, + "source": [ + "### Expected transactions in N periods\n", + "This expression will blow up to inf with large values of n (n=167 in this example). Recalculating on the log scale will allow for larger values, but this isn't possible if gamma < 1 because term1 will be negative and its logarithm is undefined. NOTE(review): the direct form is term1 * term2 with term2 = 1 - ratio, so the log-scale result should be recombined as exp(term1) * (1 - exp(term2)), not exp(term1) - exp(term2)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "2e82f5b4-1b4a-4477-843b-58cbd411d348", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "average of 1.938137499995133 purchases expected in 5 opportunities\n" + ] + } + ], + "source": [ + "from scipy import special\n", + "from numpy import log,exp\n", + "\n", + "n = 5\n", + "alpha,beta,delta,gamma = bgbb._unload_params('alpha','beta','delta','gamma')\n", + "\n", + "# add a larger gamma value for testing\n", + "#gamma = .9\n", + "\n", + "log_scale = False\n", + "\n", + "if not log_scale:\n", + " term1 = alpha/(alpha+beta)*delta/(gamma-1)\n", + " term2 = 1-(special.gamma(gamma+delta))/special.gamma(gamma+delta+n)*(special.gamma(1+delta+n))/special.gamma(1+delta)\n", + " expected_purchases_n_periods = term1 * term2\n", + "else:\n", + " term1 = log(alpha/(alpha+beta)) + log(delta/(gamma-1))\n", + " term2 = special.gammaln(gamma+delta) - special.gammaln(gamma+delta+n) + special.gammaln(1+delta+n) - special.gammaln(1+delta)\n", + " expected_purchases_n_periods = exp(term1) - exp(term2)\n", + "\n", + "print(f'average of {expected_purchases_n_periods} purchases expected in {n} opportunities')" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "5186cf4d-710d-4e85-bef9-b66ccced5586", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[1.2035223936080357,\n", + " 0.7497163581757648,\n", + " 2.7834419828877737,\n", + " 0.6567181695499797]" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "bgbb._unload_params('alpha','beta','delta','gamma')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "80d11cc8-98fb-426e-89b2-693f0a8d22fa", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + 
"codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.18" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}