From ed0c680b5abaf3a4a2fc7c74054cd55a022a17b9 Mon Sep 17 00:00:00 2001 From: Praneet Rathi Date: Tue, 27 Dec 2022 15:11:29 -0600 Subject: [PATCH 01/18] add monotone spline for 2d --- ogcore/txfunc.py | 446 +++++++++++++++++++++++++++++++------------ tests/test_txfunc.py | 78 ++++++-- 2 files changed, 390 insertions(+), 134 deletions(-) diff --git a/ogcore/txfunc.py b/ogcore/txfunc.py index e26131351..827cb18f6 100644 --- a/ogcore/txfunc.py +++ b/ogcore/txfunc.py @@ -22,6 +22,11 @@ from ogcore.constants import DEFAULT_START_YEAR, SHOW_RUNTIME from ogcore import utils import warnings +from sklearn.ensemble import HistGradientBoostingRegressor +from ogcore.isotonic import MultiIsotonicRegressor +from pygam import LinearGAM, s, te +from matplotlib import cm +import random if not SHOW_RUNTIME: @@ -1632,6 +1637,53 @@ def tax_func_estimate( return dict_params +def avg_by_bin_2d(x, y, bins, weights = None): + """ + Args: + x (numpy array): 2d with dimensions n by m used for binning + y (numpy array): 1d with length n + bins (numpy array): 1d with length m, each entry must divide n + and is number of bins for corresponding column in x + weights (None or numpy array): 1d with length n specifying + weight of each observation. if None then array of ones + + Returns: + xNew (numpy array): 2d with second dimension m, first + dimension is product of elements in bins, with each entry + representative of bin across all the features + yNew (numpy array): 1d with length same as first dimension + of xWeight, weighted average of y's corresponding to each + entry of xWeight + weightsNew (numpy array): 1d with length same as yNew, weight + corresponding to each xNew, yNew row + """ + if x.shape[1] != len(bins): + message = "Dimensions of x and bins don't match: {} != {}".format(x.shape[1], len(bins)) + raise ValueError(message) + else: + size = np.prod(bins) + tupleBins = tuple(bins) + xNew = np.zeros((size, x.shape[1]), dtype=float) + yNew = np.zeros(size, dtype=float) + weightsNew = np.zeros(size, dtype=float) + + # iterate through each of the final bins, which consists of bins for each feature + # for each, only retain entries falling in that bin + for i in range(size): + index = list(np.unravel_index(i, tupleBins)) + valid = np.ones(x.shape[0], dtype=bool) + for j, v in enumerate(index): + valid &= (x[:, j] >= np.percentile(x[:, j], v*100/bins[j])) & (x[:, j] < np.percentile(x[:, j], (v+1)*100/bins[j])) + if np.sum(valid) != 0: + xNew[i, :] = np.average(x[valid], axis=0, weights=weights[valid]) + yNew[i] = np.average(y[valid], axis=0, weights=weights[valid]) + weightsNew[i] = np.sum(weights[valid]) + xNew = xNew[~(weightsNew == 0)] + yNew = yNew[~(weightsNew == 0)] + weightsNew = weightsNew[~(weightsNew == 0)] + return xNew, yNew, weightsNew + + def monotone_spline( x, y, @@ -1641,142 +1693,294 @@ def monotone_spline( kap=1e7, incl_uncstr=False, show_plot=False, + method='eilers', + splines=None, + plot_start=0, + plot_end=100 ): - # create binned and weighted x and y data - if bins: - if not np.isscalar(bins): - err_msg = "monotone_spline2 ERROR: bins value is not type scalar" - raise ValueError(err_msg) - N = bins - x_binned, y_binned, weights_binned = utils.avg_by_bin( - x, y, weights, bins - ) + """ + New args: + method (string): 'eilers' (old version) or 'pygam' (new version) + splines (None or array-like): for 'pygam' only (otherwise set None), + number of splines used for each feature, if None use default + plot_start/plot_end (number between 0, 100): for 'pygam' only if show_plot = True, + start and end for percentile of data used in plot, can result in + better visualizations if original data has strong outliers - elif not bins: - N = len(x) - x_binned = x - y_binned = y - weights_binned = weights - - # Prepare bases (Imat) and penalty - dd = 3 - E = np.eye(N) - D3 = np.diff(E, n=dd, axis=0) - D1 = np.diff(E, n=1, axis=0) - - # Monotone smoothing - ws = np.zeros(N - 1) - weights_binned = weights_binned.reshape(len(weights_binned), 1) - weights1 = 0.5 * weights_binned[1:, :] + 0.5 * weights_binned[:-1, :] - weights3 = ( - 0.25 * weights_binned[3:, :] - + 0.25 * weights_binned[2:-1, :] - + 0.25 * weights_binned[1:-2, :] - + 0.25 * weights_binned[:-3, :] - ).flatten() - - for it in range(30): - Ws = np.diag(ws * kap) - mon_cof = np.linalg.solve( - E - + lam * D3.T @ np.diag(weights3) @ D3 - + D1.T @ (Ws * weights1) @ D1, - y_binned, - ) - ws_new = (D1 @ mon_cof < 0.0) * 1 - dw = np.sum(ws != ws_new) - ws = ws_new - if dw == 0: - break - - # Monotonic and non monotonic fits - y_cstr = mon_cof - wsse_cstr = (weights_binned * ((y_cstr - y_binned) ** 2)).sum() - if incl_uncstr: - y_uncstr = np.linalg.solve( - E + lam * D3.T @ np.diag(weights3) @ D3, y_binned - ) - wsse_uncstr = (weights_binned * ((y_uncstr - y_binned) ** 2)).sum() - else: - y_uncstr = None - wsse_uncstr = None - - def mono_interp(x_vec): - # replace last point in data with two copies further out to make smooth - # extrapolation - x_new = np.append( - x_binned[:-1], [1.005 * x_binned[-1], 1.01 * x_binned[-1]] - ) - y_cstr_new = np.append(y_cstr[:-1], [y_cstr[-1], y_cstr[-1]]) - # Create interpolating cubic spline for interior points - inter_interpl = intp(x_new, y_cstr_new, kind="cubic") - y_pred = np.zeros_like(x_vec) - x_lt_min = x_vec < x_binned.min() - x_gt_max = x_vec > x_new.max() - x_inter = (x_vec >= x_binned.min()) & (x_vec <= x_new.max()) - y_pred[x_inter] = inter_interpl(x_vec[x_inter]) - # extrapolate the maximum for values above the maximum - y_pred[x_gt_max] = y_cstr[-1] - # linear extrapolation of last two points for values below the min - slope = (y_cstr[1] - y_cstr[0]) / (x_binned[1] - x_binned[0]) - intercept = y_cstr[0] - slope * x_binned[0] - y_pred[x_lt_min] = slope * x_vec[x_lt_min] + intercept - - return y_pred - - if show_plot: - plt.scatter( - x, - y, - linestyle="None", - color="gray", - s=0.8, - alpha=0.7, - label="All data", - ) - if not bins: - plt.plot( - x, - y_cstr, - color="red", - alpha=1.0, - label="Monotonic smooth spline", + Returns: + xNew (numpy array): 2d with second dimension m, first + dimension is product of elements in bins, with each entry + representative of bin across all the features + yNew (numpy array): 1d with length same as first dimension + of xWeight, weighted average of y's corresponding to each + entry of xWeight + weightsNew (numpy array): 1d with length same as yNew, weight + corresponding to each xNew, yNew row + """ + + if method == 'pygam': + if splines != None and len(splines) != x.shape[1]: + err_msg = ( + " pygam method requires splines to be None or " + + " same length as # of columns in x, " + len(splines) + + " != " + str(x.shape[1]) ) - if incl_uncstr: + raise ValueError(err_msg) + + # bin data + if bins == None: + x_binned, y_binned, weights_binned = x, y, weights + else: + x_binned, y_binned, weights_binned = avg_by_bin_2d(x, y, bins, weights) + + # setup pygam parameters- in addition to 's' spline terms, can also have 't' tensor + # terms which are interactions between two variables. 't' terms also need monotonic constraints + # to satisfy previous constraints, they actually impose stronger restriction + + if splines == None: + tempCstr = s(0, constraints='monotonic_inc') + for i in range(1, x_binned.shape[1]): + tempCstr += s(i, constraints='monotonic_inc') + tempUncstr = s(0) + for i in range(1, x_binned.shape[1]): + tempUncstr += s(i) + else: + tempCstr = s(0, constraints='monotonic_inc', n_splines=splines[0]) + for i in range(1, x_binned.shape[1]): + tempCstr += s(i, constraints='monotonic_inc', n_splines=splines[i]) + tempUncstr = s(0, n_splines=splines[0]) + for i in range(1, x_binned.shape[1]): + tempUncstr += s(i, n_splines=splines[i]) + + # fit data + gamCstr = LinearGAM(tempCstr).fit(x_binned, y_binned, weights_binned) + y_cstr = gamCstr.predict(x_binned) + wsse_cstr = (weights_binned * ((y_cstr - y_binned) ** 2)).sum() + if incl_uncstr: + gamUncstr = LinearGAM(tempUncstr).fit(x_binned, y_binned, weights_binned) + y_uncstr = gamUncstr.predict(x_binned) + wsse_uncstr = (weights_binned * ((y_uncstr - y_binned) ** 2)).sum() + else: + y_uncstr = None + wsse_uncstr = None + print("Constrained: " + str(wsse_cstr)) + print("Unconstrained: " + str(wsse_uncstr)) + + if show_plot: + if x.shape[1] == 2: + fig, ax = plt.subplots(subplot_kw={"projection": "3d"}) + # select data in [plot_start, end] percentile across both features + # this can be rewritten to generalize for n-dimensions, but didn't know how to plot that + xactPlot = x[(x[:,0] >= np.percentile(x[:,0], plot_start)) & (x[:,0] <= np.percentile(x[:,0], plot_end)) & + (x[:,1] >= np.percentile(x[:,1], plot_start)) & (x[:,1] <= np.percentile(x[:,1], plot_end))] + yactPlot = y[(x[:,0] >= np.percentile(x[:,0], plot_start)) & (x[:,0] <= np.percentile(x[:,0], plot_end)) & + (x[:,1] >= np.percentile(x[:,1], plot_start)) & (x[:,1] <= np.percentile(x[:,1], plot_end))] + ax.scatter(xactPlot[:, 0], xactPlot[:, 1], yactPlot, color="black", s=0.8, alpha=0.25) + + x0 = np.linspace(np.percentile(x[:,0], plot_start),np.percentile(x[:,0], plot_end), 1000) + x1 = np.linspace(np.percentile(x[:,1], plot_start),np.percentile(x[:,1], plot_end), 1000) + X0, X1 = np.meshgrid(x0, x1) + yPred = gamCstr.predict(np.array([X0.flatten(), X1.flatten()]).T) + ax.plot_surface(X0, X1, yPred.reshape(x0.shape[0], -1), color = 'red') + ax.set_label("Monotonic GAM spline with all data") + + if x.shape[1] == 1: + plt.scatter( + x, + y, + linestyle="None", + color="gray", + s=0.8, + alpha=0.7, + label="All data", + ) plt.plot( x, - y_uncstr, - color="blue", + y_cstr, + color="red", alpha=1.0, - label="Unconstrained smooth spline", + label="Monotonic GAM spline", ) + if incl_uncstr: + plt.plot( + x, + y_uncstr, + color="blue", + alpha=1.0, + label="Unconstrained GAM spline", + ) + plt.show() + plt.close() + + def interp(x): + return gamCstr.predict(x) + + return interp, y_cstr, wsse_cstr, y_uncstr, wsse_uncstr + + if method == 'eilers': + # create binned and weighted x and y data + if bins: + if not np.isscalar(bins): + err_msg = "monotone_spline2 ERROR: bins value is not type scalar" + raise ValueError(err_msg) + N = bins + x_binned, y_binned, weights_binned_ = utils.avg_by_bin( + x, y, weights, bins + ) + + elif not bins: + N = len(x) + x_binned = x + y_binned = y + weights_binned_ = weights + + # Prepare bases (Imat) and penalty + dd = 3 + E = np.eye(N) + D3 = np.diff(E, n=dd, axis=0) + D1 = np.diff(E, n=1, axis=0) + + # Monotone smoothing + ws = np.zeros(N - 1) + weights_binned = weights_binned_.reshape(len(weights_binned_), 1) + weights1 = 0.5 * weights_binned[1:, :] + 0.5 * weights_binned[:-1, :] + weights3 = ( + 0.25 * weights_binned[3:, :] + + 0.25 * weights_binned[2:-1, :] + + 0.25 * weights_binned[1:-2, :] + + 0.25 * weights_binned[:-3, :] + ).flatten() + + for it in range(30): + Ws = np.diag(ws * kap) + mon_cof = np.linalg.solve( + E + + lam * D3.T @ np.diag(weights3) @ D3 # weights3 + + D1.T @ (Ws * weights1) @ D1, + y_binned, + ) + print(ws) + print(mon_cof) + ws_new = (D1 @ mon_cof < 0.0) * 1 + dw = np.sum(ws != ws_new) + ws = ws_new + if dw == 0: + break + + # Monotonic and non monotonic fits + y_cstr = mon_cof + wsse_cstr = (weights_binned * ((y_cstr - y_binned) ** 2)).sum() + if incl_uncstr: + y_uncstr = np.linalg.solve( + E + lam * D3.T @ np.diag(weights3) @ D3, y_binned # weights3 + ) + wsse_uncstr = (weights_binned * ((y_uncstr - y_binned) ** 2)).sum() else: + y_uncstr = None + wsse_uncstr = None + + # breakpoint() + # hist = HistGradientBoostingRegressor(max_depth = 1).fit( + # np.array([x_binned]).T, y_binned, weights_binned_) + # y_cstr2 = hist.predict(np.array([x_binned]).T) + # wsse_cstr2 = (weights_binned * ((y_cstr2 - y_binned) ** 2)).sum() + # iso = MultiIsotonicRegressor() + # iso = iso.fit(np.array([x_binned]).T, y_binned) + # y_cstr2 = iso.predict(np.array([x_binned]).T) + # wsse_cstr2 = (weights_binned * ((y_cstr2 - y_binned) ** 2)).sum() + + + + print("Constrained: " + str(wsse_cstr)) + # print("Constrained2: " + str(wsse_cstr2)) + print("unconstrained: " + str(wsse_uncstr)) + + def mono_interp(x_vec, y_cstr_): + # replace last point in data with two copies further out to make smooth + # extrapolation + x_new = np.append( + x_binned[:-1], [1.005 * x_binned[-1], 1.01 * x_binned[-1]] + ) + y_cstr_new = np.append(y_cstr_[:-1], [y_cstr_[-1], y_cstr_[-1]]) + # Create interpolating cubic spline for interior points + inter_interpl = intp(x_new, y_cstr_new, kind="cubic") + y_pred = np.zeros_like(x_vec) + x_lt_min = x_vec < x_binned.min() + x_gt_max = x_vec > x_new.max() + x_inter = (x_vec >= x_binned.min()) & (x_vec <= x_new.max()) + y_pred[x_inter] = inter_interpl(x_vec[x_inter]) + # extrapolate the maximum for values above the maximum + y_pred[x_gt_max] = y_cstr_[-1] + # linear extrapolation of last two points for values below the min + slope = (y_cstr_[1] - y_cstr_[0]) / (x_binned[1] - x_binned[0]) + intercept = y_cstr_[0] - slope * x_binned[0] + y_pred[x_lt_min] = slope * x_vec[x_lt_min] + intercept + + return y_pred + + if show_plot: plt.scatter( - x_binned, - y_binned, + x, + y, linestyle="None", - color="black", + color="gray", s=0.8, alpha=0.7, - label="Binned data averages", + label="All data", ) - plt.plot( - x, - mono_interp(x), - color="red", - alpha=1.0, - label="Monotonic smooth spline", - ) - if incl_uncstr: + if not bins: plt.plot( + x, + y_cstr, + color="red", + alpha=1.0, + label="Monotonic smooth spline", + ) + if incl_uncstr: + plt.plot( + x, + y_uncstr, + color="blue", + alpha=1.0, + label="Unconstrained smooth spline", + ) + else: + plt.scatter( x_binned, - y_uncstr, - color="blue", + y_binned, + linestyle="None", + color="black", + s=4, + alpha=0.7, + label="Binned data averages", + ) + plt.plot( + x, + mono_interp(x, y_cstr), + color="red", alpha=1.0, - label="Unconstrained smooth spline", + label="Original monotonic smooth spline", ) - plt.legend(loc="lower right") - plt.show() - plt.close() + plt.plot( + x, + ygam, + color="green", + alpha=1.0, + label="New monotonic smooth spline", + ) + if incl_uncstr: + plt.plot( + x_binned, + y_uncstr, + color="blue", + alpha=1.0, + label="Unconstrained smooth spline", + ) + plt.legend(loc="lower right") + plt.show() + plt.close() + + return mono_interp, y_cstr, wsse_cstr, y_uncstr, wsse_uncstr - return mono_interp, y_cstr, wsse_cstr, y_uncstr, wsse_uncstr + err_msg = method + " method not supported, must be 'eilers' or 'pygam'" + raise ValueError(err_msg) diff --git a/tests/test_txfunc.py b/tests/test_txfunc.py index 578cbb0c2..41278b302 100644 --- a/tests/test_txfunc.py +++ b/tests/test_txfunc.py @@ -1,3 +1,5 @@ +import sys +sys.path.append('/Users/praneet/developer/OG-Core/') from ogcore import txfunc from distributed import Client, LocalCluster import pytest @@ -731,6 +733,56 @@ def test_monotone_spline(): """ # Simulate some data np.random.seed(10) + + # another test case for pygam method: + + # N = 100 + # xlo = 0.001 + # xhi = 2 * np.pi + # x1 = np.arange(xlo, xhi, step=(xhi - xlo) / N) + # x2 = np.arange(xlo, xhi, step=(xhi - xlo) / N) + # x = np.array([x1, x2]).T + # X1, X2 = np.meshgrid(x1, x2) + # X1, X2 = X1.flatten(), X2.flatten() + # X = np.zeros((X1.shape[0], 2)) + # X[:,0] = X1 + # X[:,1] = X2 + # y0 = np.exp(np.sin(X1)) * np.exp(np.cos(X2)) + # y0 = np.cos(X1)/(1.01 + np.sin(X2)) + # y0 = 1/(1 + np.exp(-(X1+X2))) + # y = y0 + np.random.random(y0.shape) * 0.2 + # weights = np.ones(10000) + # ( + # mono_interp, + # y_cstr, + # wsse_cstr, + # y_uncstr, + # wsse_uncstr, + # ) = txfunc.monotone_spline( + # X, y, weights, lam=100, incl_uncstr=True, show_plot=True, method='pygam' + # ) + + with open('test_io_data/micro_data_dict_for_tests.pkl', 'rb') as f: + data = pickle.load(f)['2030'] + df = data[['total_labinc', 'total_capinc', 'etr', 'weight']] + df.replace([np.inf, -np.inf], np.nan, inplace=True) + df.dropna(inplace=True) + + # Estimate monotonically increasing function on the data + ( + mono_interp1, + y_cstr1, + wsse_cstr1, + y_uncstr1, + wsse_uncstr1, + ) = txfunc.monotone_spline( + df[['total_labinc', 'total_capinc']].values, df['etr'].values, df['weight'].values, bins=[100, 100], lam=100, incl_uncstr=True, + show_plot=True, method='pygam', splines=[100, 100], plot_cutoff=True, plot_start=25, plot_end=75 + ) + # Test whether mono_interp is a function + assert hasattr(mono_interp1, "__call__") + + # Simulate some data N = 100 xlo = 0.001 xhi = 2 * np.pi @@ -739,23 +791,22 @@ def test_monotone_spline(): y = y0 + np.random.randn(N) * 0.5 weights = np.ones(N) - # Estimate monotonically increasing function on the data ( - mono_interp, - y_cstr, - wsse_cstr, - y_uncstr, - wsse_uncstr, + mono_interp2, + y_cstr2, + wsse_cstr2, + y_uncstr2, + wsse_uncstr2, ) = txfunc.monotone_spline( - x, y, weights, bins=10, lam=100, incl_uncstr=True, show_plot=False + x, y, weights, bins=10, lam=100, incl_uncstr=True, show_plot=False, method='eilers' ) # Test whether mono_interp is a function - assert hasattr(mono_interp, "__call__") + assert hasattr(mono_interp2, "__call__") # Test whether mono_interp gives the correct output x_vec_test = np.array([2.0, 5.0]) y_vec_expected = np.array([0.69512331, 1.23822669]) - y_monointerp = mono_interp(x_vec_test) + y_monointerp = mono_interp2(x_vec_test, y_cstr2) assert np.allclose(y_monointerp, y_vec_expected) # Test that y_cstr, wsse_cstr, y_uncstr, wsse_uncstr are correct @@ -789,7 +840,8 @@ def test_monotone_spline(): ] ) wsse_uncstr_expected = 468.4894930349361 - assert np.allclose(y_cstr, y_cstr_expected) - assert np.allclose(wsse_cstr, wsse_cstr_expected) - assert np.allclose(y_uncstr, y_uncstr_expected) - assert np.allclose(wsse_uncstr, wsse_uncstr_expected) + assert np.allclose(y_cstr2, y_cstr_expected) + assert np.allclose(wsse_cstr2, wsse_cstr_expected) + assert np.allclose(y_uncstr2, y_uncstr_expected) + assert np.allclose(wsse_uncstr2, wsse_uncstr_expected) + From 7efa261b57b74d29db438f3f5224cbb936b8a480 Mon Sep 17 00:00:00 2001 From: prrathi <53785742+prrathi@users.noreply.github.com> Date: Tue, 27 Dec 2022 15:24:24 -0600 Subject: [PATCH 02/18] restore original eilers method code --- ogcore/txfunc.py | 53 +++++++++++++----------------------------------- 1 file changed, 14 insertions(+), 39 deletions(-) diff --git a/ogcore/txfunc.py b/ogcore/txfunc.py index 827cb18f6..b84a53dc5 100644 --- a/ogcore/txfunc.py +++ b/ogcore/txfunc.py @@ -1824,7 +1824,7 @@ def interp(x): err_msg = "monotone_spline2 ERROR: bins value is not type scalar" raise ValueError(err_msg) N = bins - x_binned, y_binned, weights_binned_ = utils.avg_by_bin( + x_binned, y_binned, weights_binned = utils.avg_by_bin( x, y, weights, bins ) @@ -1832,7 +1832,7 @@ def interp(x): N = len(x) x_binned = x y_binned = y - weights_binned_ = weights + weights_binned = weights # Prepare bases (Imat) and penalty dd = 3 @@ -1842,7 +1842,7 @@ def interp(x): # Monotone smoothing ws = np.zeros(N - 1) - weights_binned = weights_binned_.reshape(len(weights_binned_), 1) + weights_binned = weights_binned.reshape(len(weights_binned), 1) weights1 = 0.5 * weights_binned[1:, :] + 0.5 * weights_binned[:-1, :] weights3 = ( 0.25 * weights_binned[3:, :] @@ -1854,13 +1854,11 @@ def interp(x): for it in range(30): Ws = np.diag(ws * kap) mon_cof = np.linalg.solve( - E + - lam * D3.T @ np.diag(weights3) @ D3 # weights3 + E + + lam * D3.T @ np.diag(weights3) @ D3 + D1.T @ (Ws * weights1) @ D1, y_binned, ) - print(ws) - print(mon_cof) ws_new = (D1 @ mon_cof < 0.0) * 1 dw = np.sum(ws != ws_new) ws = ws_new @@ -1872,36 +1870,20 @@ def interp(x): wsse_cstr = (weights_binned * ((y_cstr - y_binned) ** 2)).sum() if incl_uncstr: y_uncstr = np.linalg.solve( - E + lam * D3.T @ np.diag(weights3) @ D3, y_binned # weights3 + E + lam * D3.T @ np.diag(weights3) @ D3, y_binned ) wsse_uncstr = (weights_binned * ((y_uncstr - y_binned) ** 2)).sum() else: y_uncstr = None wsse_uncstr = None - # breakpoint() - # hist = HistGradientBoostingRegressor(max_depth = 1).fit( - # np.array([x_binned]).T, y_binned, weights_binned_) - # y_cstr2 = hist.predict(np.array([x_binned]).T) - # wsse_cstr2 = (weights_binned * ((y_cstr2 - y_binned) ** 2)).sum() - # iso = MultiIsotonicRegressor() - # iso = iso.fit(np.array([x_binned]).T, y_binned) - # y_cstr2 = iso.predict(np.array([x_binned]).T) - # wsse_cstr2 = (weights_binned * ((y_cstr2 - y_binned) ** 2)).sum() - - - - print("Constrained: " + str(wsse_cstr)) - # print("Constrained2: " + str(wsse_cstr2)) - print("unconstrained: " + str(wsse_uncstr)) - - def mono_interp(x_vec, y_cstr_): + def mono_interp(x_vec): # replace last point in data with two copies further out to make smooth # extrapolation x_new = np.append( x_binned[:-1], [1.005 * x_binned[-1], 1.01 * x_binned[-1]] ) - y_cstr_new = np.append(y_cstr_[:-1], [y_cstr_[-1], y_cstr_[-1]]) + y_cstr_new = np.append(y_cstr[:-1], [y_cstr[-1], y_cstr[-1]]) # Create interpolating cubic spline for interior points inter_interpl = intp(x_new, y_cstr_new, kind="cubic") y_pred = np.zeros_like(x_vec) @@ -1910,10 +1892,10 @@ def mono_interp(x_vec, y_cstr_): x_inter = (x_vec >= x_binned.min()) & (x_vec <= x_new.max()) y_pred[x_inter] = inter_interpl(x_vec[x_inter]) # extrapolate the maximum for values above the maximum - y_pred[x_gt_max] = y_cstr_[-1] + y_pred[x_gt_max] = y_cstr[-1] # linear extrapolation of last two points for values below the min - slope = (y_cstr_[1] - y_cstr_[0]) / (x_binned[1] - x_binned[0]) - intercept = y_cstr_[0] - slope * x_binned[0] + slope = (y_cstr[1] - y_cstr[0]) / (x_binned[1] - x_binned[0]) + intercept = y_cstr[0] - slope * x_binned[0] y_pred[x_lt_min] = slope * x_vec[x_lt_min] + intercept return y_pred @@ -1950,23 +1932,16 @@ def mono_interp(x_vec, y_cstr_): y_binned, linestyle="None", color="black", - s=4, + s=0.8, alpha=0.7, label="Binned data averages", ) plt.plot( x, - mono_interp(x, y_cstr), + mono_interp(x), color="red", alpha=1.0, - label="Original monotonic smooth spline", - ) - plt.plot( - x, - ygam, - color="green", - alpha=1.0, - label="New monotonic smooth spline", + label="Monotonic smooth spline", ) if incl_uncstr: plt.plot( From 7f3a8652e2fc0d99900c272ae7d2c8599022028e Mon Sep 17 00:00:00 2001 From: prrathi <53785742+prrathi@users.noreply.github.com> Date: Tue, 27 Dec 2022 16:14:59 -0600 Subject: [PATCH 03/18] Update txfunc.py --- ogcore/txfunc.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/ogcore/txfunc.py b/ogcore/txfunc.py index b84a53dc5..57f25808d 100644 --- a/ogcore/txfunc.py +++ b/ogcore/txfunc.py @@ -22,8 +22,6 @@ from ogcore.constants import DEFAULT_START_YEAR, SHOW_RUNTIME from ogcore import utils import warnings -from sklearn.ensemble import HistGradientBoostingRegressor -from ogcore.isotonic import MultiIsotonicRegressor from pygam import LinearGAM, s, te from matplotlib import cm import random From 0c9c48ec9b48c12dff9d7b1e9e8cd04fded3ae52 Mon Sep 17 00:00:00 2001 From: Praneet Rathi Date: Wed, 28 Dec 2022 09:20:58 -0600 Subject: [PATCH 04/18] update environment + other small changes --- tests/test_txfunc.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/test_txfunc.py b/tests/test_txfunc.py index 41278b302..bfdb18378 100644 --- a/tests/test_txfunc.py +++ b/tests/test_txfunc.py @@ -777,10 +777,11 @@ def test_monotone_spline(): wsse_uncstr1, ) = txfunc.monotone_spline( df[['total_labinc', 'total_capinc']].values, df['etr'].values, df['weight'].values, bins=[100, 100], lam=100, incl_uncstr=True, - show_plot=True, method='pygam', splines=[100, 100], plot_cutoff=True, plot_start=25, plot_end=75 + show_plot=False, method='pygam', splines=[100, 100], plot_start=25, plot_end=75 ) # Test whether mono_interp is a function assert hasattr(mono_interp1, "__call__") + # Not sure what baseline should be to add tests here for "correctness" of values # Simulate some data N = 100 @@ -806,7 +807,7 @@ def test_monotone_spline(): # Test whether mono_interp gives the correct output x_vec_test = np.array([2.0, 5.0]) y_vec_expected = np.array([0.69512331, 1.23822669]) - y_monointerp = mono_interp2(x_vec_test, y_cstr2) + y_monointerp = mono_interp2(x_vec_test) assert np.allclose(y_monointerp, y_vec_expected) # Test that y_cstr, wsse_cstr, y_uncstr, wsse_uncstr are correct From 1857336d49cd86c39f7bae25b1e5421e7487a249 Mon Sep 17 00:00:00 2001 From: Praneet Rathi Date: Wed, 28 Dec 2022 09:21:58 -0600 Subject: [PATCH 05/18] uncommitted files --- environment.yml | 5 ++++- ogcore/txfunc.py | 6 ++---- setup.py | 2 ++ 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/environment.yml b/environment.yml index 14c4ad5ba..0f9c2f0f2 100644 --- a/environment.yml +++ b/environment.yml @@ -20,4 +20,7 @@ dependencies: - coverage - requests - openpyxl -- black \ No newline at end of file +- black +- pip +- pip: + - pygam \ No newline at end of file diff --git a/ogcore/txfunc.py b/ogcore/txfunc.py index 57f25808d..43eda1e02 100644 --- a/ogcore/txfunc.py +++ b/ogcore/txfunc.py @@ -1635,7 +1635,7 @@ def tax_func_estimate( return dict_params -def avg_by_bin_2d(x, y, bins, weights = None): +def avg_by_bin_multd(x, y, bins, weights = None): """ Args: x (numpy array): 2d with dimensions n by m used for binning @@ -1729,7 +1729,7 @@ def monotone_spline( if bins == None: x_binned, y_binned, weights_binned = x, y, weights else: - x_binned, y_binned, weights_binned = avg_by_bin_2d(x, y, bins, weights) + x_binned, y_binned, weights_binned = avg_by_bin_multd(x, y, bins, weights) # setup pygam parameters- in addition to 's' spline terms, can also have 't' tensor # terms which are interactions between two variables. 't' terms also need monotonic constraints @@ -1761,8 +1761,6 @@ def monotone_spline( else: y_uncstr = None wsse_uncstr = None - print("Constrained: " + str(wsse_cstr)) - print("Unconstrained: " + str(wsse_uncstr)) if show_plot: if x.shape[1] == 2: diff --git a/setup.py b/setup.py index 2ed7fdfbb..7d133dead 100755 --- a/setup.py +++ b/setup.py @@ -34,6 +34,8 @@ "distributed>=2.30.1", "paramtools>=0.15.0", "requests", + "pip", + "pygam", ], classifiers=[ "Development Status :: 2 - Pre-Alpha", From 9193e288e7c84c4e98e314f8a349daf3de5236c1 Mon Sep 17 00:00:00 2001 From: Praneet Rathi Date: Thu, 29 Dec 2022 10:52:16 -0600 Subject: [PATCH 06/18] black formatted files --- ogcore/txfunc.py | 123 +++++++++++++++++++++++++++++-------------- tests/test_txfunc.py | 46 ++++++++++------ 2 files changed, 114 insertions(+), 55 deletions(-) diff --git a/ogcore/txfunc.py b/ogcore/txfunc.py index 43eda1e02..72114c0ec 100644 --- a/ogcore/txfunc.py +++ b/ogcore/txfunc.py @@ -1635,28 +1635,30 @@ def tax_func_estimate( return dict_params -def avg_by_bin_multd(x, y, bins, weights = None): +def avg_by_bin_multd(x, y, bins, weights=None): """ Args: x (numpy array): 2d with dimensions n by m used for binning - y (numpy array): 1d with length n + y (numpy array): 1d with length n bins (numpy array): 1d with length m, each entry must divide n and is number of bins for corresponding column in x - weights (None or numpy array): 1d with length n specifying + weights (None or numpy array): 1d with length n specifying weight of each observation. if None then array of ones Returns: - xNew (numpy array): 2d with second dimension m, first - dimension is product of elements in bins, with each entry + xNew (numpy array): 2d with second dimension m, first + dimension is product of elements in bins, with each entry representative of bin across all the features yNew (numpy array): 1d with length same as first dimension - of xWeight, weighted average of y's corresponding to each + of xWeight, weighted average of y's corresponding to each entry of xWeight weightsNew (numpy array): 1d with length same as yNew, weight corresponding to each xNew, yNew row """ if x.shape[1] != len(bins): - message = "Dimensions of x and bins don't match: {} != {}".format(x.shape[1], len(bins)) + message = "Dimensions of x and bins don't match: {} != {}".format( + x.shape[1], len(bins) + ) raise ValueError(message) else: size = np.prod(bins) @@ -1671,9 +1673,13 @@ def avg_by_bin_multd(x, y, bins, weights = None): index = list(np.unravel_index(i, tupleBins)) valid = np.ones(x.shape[0], dtype=bool) for j, v in enumerate(index): - valid &= (x[:, j] >= np.percentile(x[:, j], v*100/bins[j])) & (x[:, j] < np.percentile(x[:, j], (v+1)*100/bins[j])) + valid &= ( + x[:, j] >= np.percentile(x[:, j], v * 100 / bins[j]) + ) & (x[:, j] < np.percentile(x[:, j], (v + 1) * 100 / bins[j])) if np.sum(valid) != 0: - xNew[i, :] = np.average(x[valid], axis=0, weights=weights[valid]) + xNew[i, :] = np.average( + x[valid], axis=0, weights=weights[valid] + ) yNew[i] = np.average(y[valid], axis=0, weights=weights[valid]) weightsNew[i] = np.sum(weights[valid]) xNew = xNew[~(weightsNew == 0)] @@ -1691,37 +1697,39 @@ def monotone_spline( kap=1e7, incl_uncstr=False, show_plot=False, - method='eilers', + method="eilers", splines=None, plot_start=0, - plot_end=100 + plot_end=100, ): """ New args: method (string): 'eilers' (old version) or 'pygam' (new version) - splines (None or array-like): for 'pygam' only (otherwise set None), + splines (None or array-like): for 'pygam' only (otherwise set None), number of splines used for each feature, if None use default - plot_start/plot_end (number between 0, 100): for 'pygam' only if show_plot = True, + plot_start/plot_end (number between 0, 100): for 'pygam' only if show_plot = True, start and end for percentile of data used in plot, can result in better visualizations if original data has strong outliers Returns: - xNew (numpy array): 2d with second dimension m, first - dimension is product of elements in bins, with each entry + xNew (numpy array): 2d with second dimension m, first + dimension is product of elements in bins, with each entry representative of bin across all the features yNew (numpy array): 1d with length same as first dimension - of xWeight, weighted average of y's corresponding to each + of xWeight, weighted average of y's corresponding to each entry of xWeight weightsNew (numpy array): 1d with length same as yNew, weight corresponding to each xNew, yNew row """ - if method == 'pygam': + if method == "pygam": if splines != None and len(splines) != x.shape[1]: err_msg = ( - " pygam method requires splines to be None or " + - " same length as # of columns in x, " + len(splines) - + " != " + str(x.shape[1]) + " pygam method requires splines to be None or " + + " same length as # of columns in x, " + + len(splines) + + " != " + + str(x.shape[1]) ) raise ValueError(err_msg) @@ -1729,23 +1737,27 @@ def monotone_spline( if bins == None: x_binned, y_binned, weights_binned = x, y, weights else: - x_binned, y_binned, weights_binned = avg_by_bin_multd(x, y, bins, weights) + x_binned, y_binned, weights_binned = avg_by_bin_multd( + x, y, bins, weights + ) - # setup pygam parameters- in addition to 's' spline terms, can also have 't' tensor + # setup pygam parameters- in addition to 's' spline terms, can also have 't' tensor # terms which are interactions between two variables. 't' terms also need monotonic constraints # to satisfy previous constraints, they actually impose stronger restriction if splines == None: - tempCstr = s(0, constraints='monotonic_inc') + tempCstr = s(0, constraints="monotonic_inc") for i in range(1, x_binned.shape[1]): - tempCstr += s(i, constraints='monotonic_inc') + tempCstr += s(i, constraints="monotonic_inc") tempUncstr = s(0) for i in range(1, x_binned.shape[1]): tempUncstr += s(i) else: - tempCstr = s(0, constraints='monotonic_inc', n_splines=splines[0]) + tempCstr = s(0, constraints="monotonic_inc", n_splines=splines[0]) for i in range(1, x_binned.shape[1]): - tempCstr += s(i, constraints='monotonic_inc', n_splines=splines[i]) + tempCstr += s( + i, constraints="monotonic_inc", n_splines=splines[i] + ) tempUncstr = s(0, n_splines=splines[0]) for i in range(1, x_binned.shape[1]): tempUncstr += s(i, n_splines=splines[i]) @@ -1755,7 +1767,9 @@ def monotone_spline( y_cstr = gamCstr.predict(x_binned) wsse_cstr = (weights_binned * ((y_cstr - y_binned) ** 2)).sum() if incl_uncstr: - gamUncstr = LinearGAM(tempUncstr).fit(x_binned, y_binned, weights_binned) + gamUncstr = LinearGAM(tempUncstr).fit( + x_binned, y_binned, weights_binned + ) y_uncstr = gamUncstr.predict(x_binned) wsse_uncstr = (weights_binned * ((y_uncstr - y_binned) ** 2)).sum() else: @@ -1766,18 +1780,45 @@ def monotone_spline( if x.shape[1] == 2: fig, ax = plt.subplots(subplot_kw={"projection": "3d"}) # select data in [plot_start, end] percentile across both features - # this can be rewritten to generalize for n-dimensions, but didn't know how to plot that - xactPlot = x[(x[:,0] >= np.percentile(x[:,0], plot_start)) & (x[:,0] <= np.percentile(x[:,0], plot_end)) & - (x[:,1] >= np.percentile(x[:,1], plot_start)) & (x[:,1] <= np.percentile(x[:,1], plot_end))] - yactPlot = y[(x[:,0] >= np.percentile(x[:,0], plot_start)) & (x[:,0] <= np.percentile(x[:,0], plot_end)) & - (x[:,1] >= np.percentile(x[:,1], plot_start)) & (x[:,1] <= np.percentile(x[:,1], plot_end))] - ax.scatter(xactPlot[:, 0], xactPlot[:, 1], yactPlot, color="black", s=0.8, alpha=0.25) - - x0 = np.linspace(np.percentile(x[:,0], plot_start),np.percentile(x[:,0], plot_end), 1000) - x1 = np.linspace(np.percentile(x[:,1], plot_start),np.percentile(x[:,1], plot_end), 1000) + # this can be rewritten to generalize for n-dimensions, but didn't know how to plot that + xactPlot = x[ + (x[:, 0] >= np.percentile(x[:, 0], plot_start)) + & (x[:, 0] <= np.percentile(x[:, 0], plot_end)) + & (x[:, 1] >= np.percentile(x[:, 1], plot_start)) + & (x[:, 1] <= np.percentile(x[:, 1], plot_end)) + ] + yactPlot = y[ + (x[:, 0] >= np.percentile(x[:, 0], plot_start)) + & (x[:, 0] <= np.percentile(x[:, 0], plot_end)) + & (x[:, 1] >= np.percentile(x[:, 1], plot_start)) + & (x[:, 1] <= np.percentile(x[:, 1], plot_end)) + ] + ax.scatter( + xactPlot[:, 0], + xactPlot[:, 1], + yactPlot, + color="black", + s=0.8, + alpha=0.25, + ) + + x0 = np.linspace( + np.percentile(x[:, 0], plot_start), + np.percentile(x[:, 0], plot_end), + 1000, + ) + x1 = np.linspace( + np.percentile(x[:, 1], plot_start), + np.percentile(x[:, 1], plot_end), + 1000, + ) X0, X1 = np.meshgrid(x0, x1) - yPred = gamCstr.predict(np.array([X0.flatten(), X1.flatten()]).T) - ax.plot_surface(X0, X1, yPred.reshape(x0.shape[0], -1), color = 'red') + yPred = gamCstr.predict( + np.array([X0.flatten(), X1.flatten()]).T + ) + ax.plot_surface( + X0, X1, yPred.reshape(x0.shape[0], -1), color="red" + ) ax.set_label("Monotonic GAM spline with all data") if x.shape[1] == 1: @@ -1813,11 +1854,13 @@ def interp(x): return interp, y_cstr, wsse_cstr, y_uncstr, wsse_uncstr - if method == 'eilers': + if method == "eilers": # create binned and weighted x and y data if bins: if not np.isscalar(bins): - err_msg = "monotone_spline2 ERROR: bins value is not type scalar" + err_msg = ( + "monotone_spline2 ERROR: bins value is not type scalar" + ) raise ValueError(err_msg) N = bins x_binned, y_binned, weights_binned = utils.avg_by_bin( diff --git a/tests/test_txfunc.py b/tests/test_txfunc.py index bfdb18378..5bf43efa8 100644 --- a/tests/test_txfunc.py +++ b/tests/test_txfunc.py @@ -1,5 +1,6 @@ import sys -sys.path.append('/Users/praneet/developer/OG-Core/') + +sys.path.append("/Users/praneet/developer/OG-Core/") from ogcore import txfunc from distributed import Client, LocalCluster import pytest @@ -735,7 +736,7 @@ def test_monotone_spline(): np.random.seed(10) # another test case for pygam method: - + # N = 100 # xlo = 0.001 # xhi = 2 * np.pi @@ -753,18 +754,18 @@ def test_monotone_spline(): # y = y0 + np.random.random(y0.shape) * 0.2 # weights = np.ones(10000) # ( - # mono_interp, - # y_cstr, - # wsse_cstr, - # y_uncstr, - # wsse_uncstr, + # mono_interp, + # y_cstr, + # wsse_cstr, + # y_uncstr, + # wsse_uncstr, # ) = txfunc.monotone_spline( - # X, y, weights, lam=100, incl_uncstr=True, show_plot=True, method='pygam' + # X, y, weights, lam=100, incl_uncstr=True, show_plot=True, method='pygam' # ) - with open('test_io_data/micro_data_dict_for_tests.pkl', 'rb') as f: - data = pickle.load(f)['2030'] - df = data[['total_labinc', 'total_capinc', 'etr', 'weight']] + with open("test_io_data/micro_data_dict_for_tests.pkl", "rb") as f: + data = pickle.load(f)["2030"] + df = data[["total_labinc", "total_capinc", "etr", "weight"]] df.replace([np.inf, -np.inf], np.nan, inplace=True) df.dropna(inplace=True) @@ -776,8 +777,17 @@ def test_monotone_spline(): y_uncstr1, wsse_uncstr1, ) = txfunc.monotone_spline( - df[['total_labinc', 'total_capinc']].values, df['etr'].values, df['weight'].values, bins=[100, 100], lam=100, incl_uncstr=True, - show_plot=False, method='pygam', splines=[100, 100], plot_start=25, plot_end=75 + df[["total_labinc", "total_capinc"]].values, + df["etr"].values, + df["weight"].values, + bins=[100, 100], + lam=100, + incl_uncstr=True, + show_plot=False, + method="pygam", + splines=[100, 100], + plot_start=25, + plot_end=75, ) # Test whether mono_interp is a function assert hasattr(mono_interp1, "__call__") @@ -799,7 +809,14 @@ def test_monotone_spline(): y_uncstr2, wsse_uncstr2, ) = txfunc.monotone_spline( - x, y, weights, bins=10, lam=100, incl_uncstr=True, show_plot=False, method='eilers' + x, + y, + weights, + bins=10, + lam=100, + incl_uncstr=True, + show_plot=False, + method="eilers", ) # Test whether mono_interp is a function assert hasattr(mono_interp2, "__call__") @@ -845,4 +862,3 @@ def test_monotone_spline(): assert np.allclose(wsse_cstr2, wsse_cstr_expected) assert np.allclose(y_uncstr2, y_uncstr_expected) assert np.allclose(wsse_uncstr2, wsse_uncstr_expected) - From 30cb743bdf3c4a6075d1cfa5759102c924420160 Mon Sep 17 00:00:00 2001 From: prrathi <53785742+prrathi@users.noreply.github.com> Date: Wed, 1 Mar 2023 20:27:23 -0600 Subject: [PATCH 07/18] remove my local path --- tests/test_txfunc.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_txfunc.py b/tests/test_txfunc.py index bf54fae2a..76300c85a 100644 --- a/tests/test_txfunc.py +++ b/tests/test_txfunc.py @@ -1,6 +1,5 @@ import sys -sys.path.append("/Users/praneet/developer/OG-Core/") from ogcore import txfunc from distributed import Client, LocalCluster import pytest From 6887d02ed6165199537cd9ebe73480e1fd8c99b6 Mon Sep 17 00:00:00 2001 From: prrathi <53785742+prrathi@users.noreply.github.com> Date: Thu, 2 Mar 2023 21:44:20 -0600 Subject: [PATCH 08/18] suggested file read changes --- tests/test_txfunc.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/test_txfunc.py b/tests/test_txfunc.py index 76300c85a..5b83d983f 100644 --- a/tests/test_txfunc.py +++ b/tests/test_txfunc.py @@ -762,8 +762,9 @@ def test_monotone_spline(): # X, y, weights, lam=100, incl_uncstr=True, show_plot=True, method='pygam' # ) - with open("test_io_data/micro_data_dict_for_tests.pkl", "rb") as f: - data = pickle.load(f)["2030"] + data = utils.safe_read_pickle( + os.path.join(CUR_PATH, "test_io_data", "micro_data_dict_for_tests.pkl") + )["2030"] df = data[["total_labinc", "total_capinc", "etr", "weight"]] df.replace([np.inf, -np.inf], np.nan, inplace=True) df.dropna(inplace=True) From 3de3c813770dfd28d8fb0fe8ffee3877a4bba403 Mon Sep 17 00:00:00 2001 From: Praneet Rathi Date: Sat, 4 Mar 2023 14:00:15 -0600 Subject: [PATCH 09/18] add 2dmono option --- ogcore/default_parameters.json | 3 ++- ogcore/txfunc.py | 22 +++++++++++++++++++--- tests/test_txfunc.py | 5 +++-- 3 files changed, 24 insertions(+), 6 deletions(-) diff --git a/ogcore/default_parameters.json b/ogcore/default_parameters.json index 61af73915..e8f5ed580 100644 --- a/ogcore/default_parameters.json +++ b/ogcore/default_parameters.json @@ -3146,7 +3146,8 @@ "DEP_totalinc", "GS", "linear", - "mono" + "mono", + "mono2D" ] } } diff --git a/ogcore/txfunc.py b/ogcore/txfunc.py index d5d60d42c..bc4beefc2 100644 --- a/ogcore/txfunc.py +++ b/ogcore/txfunc.py @@ -253,7 +253,9 @@ def get_tax_rates( elif tax_func_type == "mono": mono_interp = params[0] txrates = mono_interp(income) - + elif tax_func_type == "mono2D": + mono_interp = params[0] + txrates = mono_interp([[X, Y]]) return txrates @@ -691,11 +693,23 @@ def txfunc_est( wsse = wsse_cstr params = mono_interp params_to_plot = params + elif tax_func_type == "mono2D": + mono_interp, _, wsse_cstr, _, _ = monotone_spline( + df[["total_labinc", "total_capinc"]].values, + df["etr"].values, + df["weight"].values, + bins=[100, 100], + method="pygam", + splines=[100, 100], + ) + wsse = wsse_cstr + params = mono_interp + params_to_plot = params else: raise RuntimeError( "Choice of tax function is not in the set of" + " possible tax functions. Please select" - + " from: DEP, DEP_totalinc, GS, linear." + + " from: DEP, DEP_totalinc, GS, linear, mono, mono2D." ) if graph: pp.txfunc_graph( @@ -1722,11 +1736,13 @@ def monotone_spline( """ if method == "pygam": + if len(x.shape) == 1: + x = np.expand_dims(x, axis=1) if splines != None and len(splines) != x.shape[1]: err_msg = ( " pygam method requires splines to be None or " + " same length as # of columns in x, " - + len(splines) + + str(len(splines)) + " != " + str(x.shape[1]) ) diff --git a/tests/test_txfunc.py b/tests/test_txfunc.py index 5b83d983f..f4d76e966 100644 --- a/tests/test_txfunc.py +++ b/tests/test_txfunc.py @@ -765,7 +765,7 @@ def test_monotone_spline(): data = utils.safe_read_pickle( os.path.join(CUR_PATH, "test_io_data", "micro_data_dict_for_tests.pkl") )["2030"] - df = data[["total_labinc", "total_capinc", "etr", "weight"]] + df = data[["total_labinc", "total_capinc", "etr", "weight"]].copy() df.replace([np.inf, -np.inf], np.nan, inplace=True) df.dropna(inplace=True) @@ -781,7 +781,6 @@ def test_monotone_spline(): df["etr"].values, df["weight"].values, bins=[100, 100], - lam=100, incl_uncstr=True, show_plot=False, method="pygam", @@ -821,6 +820,8 @@ def test_monotone_spline(): # Test whether mono_interp is a function assert hasattr(mono_interp2, "__call__") + # Test whether mono_interp is a function + assert hasattr(mono_interp2, "__call__") # Test whether mono_interp gives the correct output x_vec_test = np.array([2.0, 5.0]) y_vec_expected = np.array([0.69512331, 1.23822669]) From f96dae545621a3b9f4bb638eea269617884cd17b Mon Sep 17 00:00:00 2001 From: Praneet Rathi Date: Sat, 4 Mar 2023 14:25:52 -0600 Subject: [PATCH 10/18] small bug --- tests/test_parameters.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_parameters.py b/tests/test_parameters.py index 741721fd0..b708638e7 100644 --- a/tests/test_parameters.py +++ b/tests/test_parameters.py @@ -157,7 +157,7 @@ def test_implement_bad_reform2(): assert len(specs.errors) > 0 assert specs.errors["tax_func_type"][0] == ( 'tax_func_type "not_a_functional_form" must be in list of ' - + "choices DEP, DEP_totalinc, GS, linear, mono." + + "choices DEP, DEP_totalinc, GS, linear, mono, mono2D." ) From ed3ec31105b2445b5e123f8f9d403eeca340f505 Mon Sep 17 00:00:00 2001 From: jdebacker Date: Thu, 9 Mar 2023 21:34:38 -0500 Subject: [PATCH 11/18] return obs --- ogcore/txfunc.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ogcore/txfunc.py b/ogcore/txfunc.py index bc4beefc2..9c48e01c8 100644 --- a/ogcore/txfunc.py +++ b/ogcore/txfunc.py @@ -694,6 +694,7 @@ def txfunc_est( params = mono_interp params_to_plot = params elif tax_func_type == "mono2D": + obs = df.shape[0] mono_interp, _, wsse_cstr, _, _ = monotone_spline( df[["total_labinc", "total_capinc"]].values, df["etr"].values, @@ -1328,6 +1329,7 @@ def tax_func_estimate( "GS": 3, "linear": 1, "mono": 1, + "mono2D": 1, } numparams = int(tax_func_type_num_params_dict[tax_func_type]) years_list = np.arange(start_year, end_yr + 1) From 5171b3d28f197d9351e6720aff2cc14aa9b5a7c4 Mon Sep 17 00:00:00 2001 From: jdebacker Date: Fri, 10 Mar 2023 12:22:15 -0500 Subject: [PATCH 12/18] use cloudpickle --- ogcore/txfunc.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/ogcore/txfunc.py b/ogcore/txfunc.py index 9c48e01c8..784176c81 100644 --- a/ogcore/txfunc.py +++ b/ogcore/txfunc.py @@ -16,6 +16,7 @@ from dask import delayed, compute import dask.multiprocessing import pickle +import cloudpickle from scipy.interpolate import interp1d as intp import matplotlib.pyplot as plt import ogcore.parameter_plots as pp @@ -696,9 +697,12 @@ def txfunc_est( elif tax_func_type == "mono2D": obs = df.shape[0] mono_interp, _, wsse_cstr, _, _ = monotone_spline( - df[["total_labinc", "total_capinc"]].values, - df["etr"].values, - df["weight"].values, + # df[["total_labinc", "total_capinc"]].values, + # df["etr"].values, + # df["weight"].values, + np.vstack((X,Y)).T, + # X, Y, + txrates, wgts, bins=[100, 100], method="pygam", splines=[100, 100], @@ -1644,8 +1648,12 @@ def tax_func_estimate( ) if tax_func_path: - with open(tax_func_path, "wb") as f: - pickle.dump(dict_params, f) + try: + with open(tax_func_path, "wb") as f: + pickle.dump(dict_params, f) + except AttributeError: + with open(tax_func_path, "wb") as f: + cloudpickle.dump(dict_params, f) return dict_params From 71e354d40c172e1363a16f059e62fa396b3b16c9 Mon Sep 17 00:00:00 2001 From: prrathi <53785742+prrathi@users.noreply.github.com> Date: Mon, 20 Mar 2023 16:49:39 -0500 Subject: [PATCH 13/18] Update txfunc.py --- ogcore/txfunc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ogcore/txfunc.py b/ogcore/txfunc.py index 784176c81..ef9e31f1c 100644 --- a/ogcore/txfunc.py +++ b/ogcore/txfunc.py @@ -1727,7 +1727,7 @@ def monotone_spline( ): """ New args: - method (string): 'eilers' (old version) or 'pygam' (new version) + method (string): 'eilers' or 'pygam' splines (None or array-like): for 'pygam' only (otherwise set None), number of splines used for each feature, if None use default plot_start/plot_end (number between 0, 100): for 'pygam' only if show_plot = True, From f8302d4b518ebb8d57e5a92c6e8ff487a860af90 Mon Sep 17 00:00:00 2001 From: prrathi <53785742+prrathi@users.noreply.github.com> Date: Mon, 20 Mar 2023 16:50:40 -0500 Subject: [PATCH 14/18] Update txfunc.py --- ogcore/txfunc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ogcore/txfunc.py b/ogcore/txfunc.py index ef9e31f1c..78804935a 100644 --- a/ogcore/txfunc.py +++ b/ogcore/txfunc.py @@ -1726,7 +1726,7 @@ def monotone_spline( plot_end=100, ): """ - New args: + Args: method (string): 'eilers' or 'pygam' splines (None or array-like): for 'pygam' only (otherwise set None), number of splines used for each feature, if None use default From 310670fc932947a4bea4b51658c8815173475b55 Mon Sep 17 00:00:00 2001 From: Praneet Rathi Date: Sun, 16 Apr 2023 17:21:44 -0500 Subject: [PATCH 15/18] black formatting + txfunc update --- ogcore/txfunc.py | 60 ++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 58 insertions(+), 2 deletions(-) diff --git a/ogcore/txfunc.py b/ogcore/txfunc.py index 4234d9a61..3b5bcf0b4 100644 --- a/ogcore/txfunc.py +++ b/ogcore/txfunc.py @@ -307,8 +307,64 @@ def get_tax_rates( ] txrates = np.array(txrates) elif tax_func_type == "mono2D": - mono_interp = params[0] - txrates = mono_interp([[X, Y]]) + if for_estimation: + mono_interp = params[0] + txrates = mono_interp([[X, Y]]) + else: + if np.isscalar(X) and np.isscalar(Y): + txrates = params[0]([[X, Y]]) + elif X.ndim == 1 and np.isscalar(Y): + if (X.shape[0] == len(params)) and ( + len(params) > 1 + ): # for case where loops over S + txrates = [ + params[s][0]([[X[s], Y]]) for s in range(income.shape[0]) + ] + else: + txrates = [ + params[0](income[i]) for i in range(income.shape[0]) + ] + elif np.isscalar(X) and Y.ndim == 1: + if (Y.shape[0] == len(params)) and ( + len(params) > 1 + ): # for case where loops over S + txrates = [ + params[s][0]([[X, Y[s]]]) for s in range(income.shape[0]) + ] + else: + txrates = [ + params[0](income[i]) for i in range(income.shape[0]) + ] + elif X.ndim == 1 and Y.ndim == 1: + if (X.shape[0] == Y.shape[0] == len(params)) and (len(params) > 1): + txrates = [ + params[s][0]([[X[s], Y[s]]]) + for s in range(X.shape[0]) + ] + else: + txrates = [ + params[0]([[X[i], Y[i]]]) for i in range(X.shape[0]) + ] + elif X.ndim == 2 and Y.ndim == 2: + txrates = [ + [ + params[s][j][0]([[X[s, j], Y[s, j]]]) + for j in range(X.shape[1]) + ] + for s in range(X.shape[0]) + ] + else: # to catch 3D arrays, looping over T, S, J + txrates = [ + [ + [ + params[t][s][j][0]([[X[t, s, j], Y[t, s, j]]]) + for j in range(income.shape[2]) + ] + for s in range(income.shape[1]) + ] + for t in range(income.shape[0]) + ] + txrates = np.array(txrates) return txrates From fa2362275c3ca18ef34d43cc1adbc3d96d58bc4e Mon Sep 17 00:00:00 2001 From: Praneet Rathi Date: Sun, 16 Apr 2023 17:22:20 -0500 Subject: [PATCH 16/18] black --- ogcore/txfunc.py | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/ogcore/txfunc.py b/ogcore/txfunc.py index 3b5bcf0b4..ebf8c2f55 100644 --- a/ogcore/txfunc.py +++ b/ogcore/txfunc.py @@ -318,7 +318,8 @@ def get_tax_rates( len(params) > 1 ): # for case where loops over S txrates = [ - params[s][0]([[X[s], Y]]) for s in range(income.shape[0]) + params[s][0]([[X[s], Y]]) + for s in range(income.shape[0]) ] else: txrates = [ @@ -329,17 +330,19 @@ def get_tax_rates( len(params) > 1 ): # for case where loops over S txrates = [ - params[s][0]([[X, Y[s]]]) for s in range(income.shape[0]) + params[s][0]([[X, Y[s]]]) + for s in range(income.shape[0]) ] else: txrates = [ params[0](income[i]) for i in range(income.shape[0]) ] elif X.ndim == 1 and Y.ndim == 1: - if (X.shape[0] == Y.shape[0] == len(params)) and (len(params) > 1): + if (X.shape[0] == Y.shape[0] == len(params)) and ( + len(params) > 1 + ): txrates = [ - params[s][0]([[X[s], Y[s]]]) - for s in range(X.shape[0]) + params[s][0]([[X[s], Y[s]]]) for s in range(X.shape[0]) ] else: txrates = [ @@ -365,7 +368,7 @@ def get_tax_rates( for t in range(income.shape[0]) ] txrates = np.array(txrates) - + return txrates @@ -809,9 +812,10 @@ def txfunc_est( # df[["total_labinc", "total_capinc"]].values, # df["etr"].values, # df["weight"].values, - np.vstack((X,Y)).T, + np.vstack((X, Y)).T, # X, Y, - txrates, wgts, + txrates, + wgts, bins=[100, 100], method="pygam", splines=[100, 100], @@ -1841,7 +1845,7 @@ def monotone_spline( plot_start/plot_end (number between 0, 100): for 'pygam' only if show_plot = True, start and end for percentile of data used in plot, can result in better visualizations if original data has strong outliers - + Returns: xNew (numpy array): 2d with second dimension m, first @@ -1853,7 +1857,6 @@ def monotone_spline( weightsNew (numpy array): 1d with length same as yNew, weight corresponding to each xNew, yNew row """ - if method == "pygam": if len(x.shape) == 1: @@ -1993,10 +1996,14 @@ def interp(x): # create binned and weighted x and y data if bins: if not np.isscalar(bins): - err_msg = "monotone_spline2 ERROR: bins value is not type scalar" + err_msg = ( + "monotone_spline2 ERROR: bins value is not type scalar" + ) raise ValueError(err_msg) N = int(bins) - x_binned, y_binned, weights_binned = utils.avg_by_bin(x, y, weights, N) + x_binned, y_binned, weights_binned = utils.avg_by_bin( + x, y, weights, N + ) elif not bins: N = len(x) From 3ca2f4afe5829e955cb12ce5a92eae54a2f122a9 Mon Sep 17 00:00:00 2001 From: Richard Evans Date: Fri, 21 Apr 2023 16:17:32 -0600 Subject: [PATCH 17/18] Made jdebacker updates to txfunc.py --- ogcore/txfunc.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ogcore/txfunc.py b/ogcore/txfunc.py index ebf8c2f55..c58553773 100644 --- a/ogcore/txfunc.py +++ b/ogcore/txfunc.py @@ -367,7 +367,7 @@ def get_tax_rates( ] for t in range(income.shape[0]) ] - txrates = np.array(txrates) + txrates = np.squeeze(np.array(txrates)) return txrates @@ -821,7 +821,7 @@ def txfunc_est( splines=[100, 100], ) wsse = wsse_cstr - params = mono_interp + params = [mono_interp] params_to_plot = params else: raise RuntimeError( From 1c93aff7a5d2340468c92c715ee68b013597cca3 Mon Sep 17 00:00:00 2001 From: Richard Evans Date: Fri, 21 Apr 2023 16:19:13 -0600 Subject: [PATCH 18/18] Updated version number in __init__.py and setup.py --- ogcore/__init__.py | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ogcore/__init__.py b/ogcore/__init__.py index 6b1322859..cc94f5a10 100644 --- a/ogcore/__init__.py +++ b/ogcore/__init__.py @@ -19,4 +19,4 @@ from ogcore.txfunc import * from ogcore.utils import * -__version__ = "0.10.7" +__version__ = "0.10.8" diff --git a/setup.py b/setup.py index 9e5e104c7..1f625de4c 100755 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setuptools.setup( name="ogcore", - version="0.10.7", + version="0.10.8", author="Jason DeBacker and Richard W. Evans", license="CC0 1.0 Universal (CC0 1.0) Public Domain Dedication", description="A general equilibribum overlapping generations model for fiscal policy analysis",