From ed0c680b5abaf3a4a2fc7c74054cd55a022a17b9 Mon Sep 17 00:00:00 2001
From: Praneet Rathi <prrathi10@gmail.com>
Date: Tue, 27 Dec 2022 15:11:29 -0600
Subject: [PATCH 01/18] add monotone spline for 2d

---
 ogcore/txfunc.py     | 446 +++++++++++++++++++++++++++++++------------
 tests/test_txfunc.py |  78 ++++++--
 2 files changed, 390 insertions(+), 134 deletions(-)

diff --git a/ogcore/txfunc.py b/ogcore/txfunc.py
index e26131351..827cb18f6 100644
--- a/ogcore/txfunc.py
+++ b/ogcore/txfunc.py
@@ -22,6 +22,11 @@
 from ogcore.constants import DEFAULT_START_YEAR, SHOW_RUNTIME
 from ogcore import utils
 import warnings
+from sklearn.ensemble import HistGradientBoostingRegressor
+from ogcore.isotonic import MultiIsotonicRegressor
+from pygam import LinearGAM, s, te
+from matplotlib import cm
+import random
 
 
 if not SHOW_RUNTIME:
@@ -1632,6 +1637,53 @@ def tax_func_estimate(
     return dict_params
 
 
+def avg_by_bin_2d(x, y, bins, weights = None):
+    """
+    Args:
+        x (numpy array): 2d with dimensions n by m used for binning
+        y (numpy array): 1d with length n 
+        bins (numpy array): 1d with length m, each entry must divide n
+            and is number of bins for corresponding column in x
+        weights (None or numpy array): 1d with length n specifying 
+            weight of each observation. if None then array of ones
+
+    Returns:
+        xNew (numpy array): 2d with second dimension m, first 
+            dimension is product of elements in bins, with each entry 
+            representative of bin across all the features
+        yNew (numpy array): 1d with length same as first dimension
+            of xWeight, weighted average of y's corresponding to each 
+            entry of xWeight
+        weightsNew (numpy array): 1d with length same as yNew, weight
+            corresponding to each xNew, yNew row
+    """
+    if x.shape[1] != len(bins):
+        message = "Dimensions of x and bins don't match: {} != {}".format(x.shape[1], len(bins))
+        raise ValueError(message)
+    else:
+        size = np.prod(bins)
+        tupleBins = tuple(bins)
+        xNew = np.zeros((size, x.shape[1]), dtype=float)
+        yNew = np.zeros(size, dtype=float)
+        weightsNew = np.zeros(size, dtype=float)
+
+        # iterate through each of the final bins, which consists of bins for each feature
+        # for each, only retain entries falling in that bin
+        for i in range(size):
+            index = list(np.unravel_index(i, tupleBins))
+            valid = np.ones(x.shape[0], dtype=bool)
+            for j, v in enumerate(index):
+                valid &= (x[:, j] >= np.percentile(x[:, j], v*100/bins[j])) & (x[:, j] < np.percentile(x[:, j], (v+1)*100/bins[j]))
+            if np.sum(valid) != 0:
+                xNew[i, :] = np.average(x[valid], axis=0, weights=weights[valid])
+                yNew[i] = np.average(y[valid], axis=0, weights=weights[valid])
+                weightsNew[i] = np.sum(weights[valid])
+        xNew = xNew[~(weightsNew == 0)]
+        yNew = yNew[~(weightsNew == 0)]
+        weightsNew = weightsNew[~(weightsNew == 0)]
+        return xNew, yNew, weightsNew
+
+
 def monotone_spline(
     x,
     y,
@@ -1641,142 +1693,294 @@ def monotone_spline(
     kap=1e7,
     incl_uncstr=False,
     show_plot=False,
+    method='eilers',
+    splines=None,
+    plot_start=0,
+    plot_end=100
 ):
-    # create binned and weighted x and y data
-    if bins:
-        if not np.isscalar(bins):
-            err_msg = "monotone_spline2 ERROR: bins value is not type scalar"
-            raise ValueError(err_msg)
-        N = bins
-        x_binned, y_binned, weights_binned = utils.avg_by_bin(
-            x, y, weights, bins
-        )
+    """
+    New args:
+        method (string): 'eilers' (old version) or 'pygam' (new version)
+        splines (None or array-like): for 'pygam' only (otherwise set None), 
+            number of splines used for each feature, if None use default
+        plot_start/plot_end (number between 0, 100): for 'pygam' only if show_plot = True, 
+            start and end for percentile of data used in plot, can result in
+            better visualizations if original data has strong outliers
 
-    elif not bins:
-        N = len(x)
-        x_binned = x
-        y_binned = y
-        weights_binned = weights
-
-    # Prepare bases (Imat) and penalty
-    dd = 3
-    E = np.eye(N)
-    D3 = np.diff(E, n=dd, axis=0)
-    D1 = np.diff(E, n=1, axis=0)
-
-    # Monotone smoothing
-    ws = np.zeros(N - 1)
-    weights_binned = weights_binned.reshape(len(weights_binned), 1)
-    weights1 = 0.5 * weights_binned[1:, :] + 0.5 * weights_binned[:-1, :]
-    weights3 = (
-        0.25 * weights_binned[3:, :]
-        + 0.25 * weights_binned[2:-1, :]
-        + 0.25 * weights_binned[1:-2, :]
-        + 0.25 * weights_binned[:-3, :]
-    ).flatten()
-
-    for it in range(30):
-        Ws = np.diag(ws * kap)
-        mon_cof = np.linalg.solve(
-            E
-            + lam * D3.T @ np.diag(weights3) @ D3
-            + D1.T @ (Ws * weights1) @ D1,
-            y_binned,
-        )
-        ws_new = (D1 @ mon_cof < 0.0) * 1
-        dw = np.sum(ws != ws_new)
-        ws = ws_new
-        if dw == 0:
-            break
-
-    # Monotonic and non monotonic fits
-    y_cstr = mon_cof
-    wsse_cstr = (weights_binned * ((y_cstr - y_binned) ** 2)).sum()
-    if incl_uncstr:
-        y_uncstr = np.linalg.solve(
-            E + lam * D3.T @ np.diag(weights3) @ D3, y_binned
-        )
-        wsse_uncstr = (weights_binned * ((y_uncstr - y_binned) ** 2)).sum()
-    else:
-        y_uncstr = None
-        wsse_uncstr = None
-
-    def mono_interp(x_vec):
-        # replace last point in data with two copies further out to make smooth
-        # extrapolation
-        x_new = np.append(
-            x_binned[:-1], [1.005 * x_binned[-1], 1.01 * x_binned[-1]]
-        )
-        y_cstr_new = np.append(y_cstr[:-1], [y_cstr[-1], y_cstr[-1]])
-        # Create interpolating cubic spline for interior points
-        inter_interpl = intp(x_new, y_cstr_new, kind="cubic")
-        y_pred = np.zeros_like(x_vec)
-        x_lt_min = x_vec < x_binned.min()
-        x_gt_max = x_vec > x_new.max()
-        x_inter = (x_vec >= x_binned.min()) & (x_vec <= x_new.max())
-        y_pred[x_inter] = inter_interpl(x_vec[x_inter])
-        # extrapolate the maximum for values above the maximum
-        y_pred[x_gt_max] = y_cstr[-1]
-        # linear extrapolation of last two points for values below the min
-        slope = (y_cstr[1] - y_cstr[0]) / (x_binned[1] - x_binned[0])
-        intercept = y_cstr[0] - slope * x_binned[0]
-        y_pred[x_lt_min] = slope * x_vec[x_lt_min] + intercept
-
-        return y_pred
-
-    if show_plot:
-        plt.scatter(
-            x,
-            y,
-            linestyle="None",
-            color="gray",
-            s=0.8,
-            alpha=0.7,
-            label="All data",
-        )
-        if not bins:
-            plt.plot(
-                x,
-                y_cstr,
-                color="red",
-                alpha=1.0,
-                label="Monotonic smooth spline",
+    Returns:
+        xNew (numpy array): 2d with second dimension m, first 
+            dimension is product of elements in bins, with each entry 
+            representative of bin across all the features
+        yNew (numpy array): 1d with length same as first dimension
+            of xWeight, weighted average of y's corresponding to each 
+            entry of xWeight
+        weightsNew (numpy array): 1d with length same as yNew, weight
+            corresponding to each xNew, yNew row
+    """
+
+    if method == 'pygam':
+        if splines != None and len(splines) != x.shape[1]:
+            err_msg = (
+                " pygam method requires splines to be None or " + 
+                " same length as # of columns in x, " + len(splines)
+                 + " != " + str(x.shape[1])
             )
-            if incl_uncstr:
+            raise ValueError(err_msg)
+
+        # bin data
+        if bins == None:
+            x_binned, y_binned, weights_binned = x, y, weights
+        else:
+            x_binned, y_binned, weights_binned = avg_by_bin_2d(x, y, bins, weights)
+
+        # setup pygam parameters- in addition to 's' spline terms, can also have 't' tensor 
+        # terms which are interactions between two variables. 't' terms also need monotonic constraints
+        # to satisfy previous constraints, they actually impose stronger restriction
+
+        if splines == None:
+            tempCstr = s(0, constraints='monotonic_inc')
+            for i in range(1, x_binned.shape[1]):
+                tempCstr += s(i, constraints='monotonic_inc')
+            tempUncstr = s(0)
+            for i in range(1, x_binned.shape[1]):
+                tempUncstr += s(i)
+        else:
+            tempCstr = s(0, constraints='monotonic_inc', n_splines=splines[0])
+            for i in range(1, x_binned.shape[1]):
+                tempCstr += s(i, constraints='monotonic_inc', n_splines=splines[i])
+            tempUncstr = s(0, n_splines=splines[0])
+            for i in range(1, x_binned.shape[1]):
+                tempUncstr += s(i, n_splines=splines[i])
+
+        # fit data
+        gamCstr = LinearGAM(tempCstr).fit(x_binned, y_binned, weights_binned)
+        y_cstr = gamCstr.predict(x_binned)
+        wsse_cstr = (weights_binned * ((y_cstr - y_binned) ** 2)).sum()
+        if incl_uncstr:
+            gamUncstr = LinearGAM(tempUncstr).fit(x_binned, y_binned, weights_binned)
+            y_uncstr = gamUncstr.predict(x_binned)
+            wsse_uncstr = (weights_binned * ((y_uncstr - y_binned) ** 2)).sum()
+        else:
+            y_uncstr = None
+            wsse_uncstr = None
+        print("Constrained: " + str(wsse_cstr))
+        print("Unconstrained: " + str(wsse_uncstr))
+
+        if show_plot:
+            if x.shape[1] == 2:
+                fig, ax = plt.subplots(subplot_kw={"projection": "3d"})
+                # select data in [plot_start, end] percentile across both features
+                # this can be rewritten to generalize for n-dimensions, but didn't know how to plot that 
+                xactPlot = x[(x[:,0] >= np.percentile(x[:,0], plot_start)) & (x[:,0] <= np.percentile(x[:,0], plot_end)) & 
+                    (x[:,1] >= np.percentile(x[:,1], plot_start)) & (x[:,1] <= np.percentile(x[:,1], plot_end))]
+                yactPlot = y[(x[:,0] >= np.percentile(x[:,0], plot_start)) & (x[:,0] <= np.percentile(x[:,0], plot_end)) & 
+                    (x[:,1] >= np.percentile(x[:,1], plot_start)) & (x[:,1] <= np.percentile(x[:,1], plot_end))]
+                ax.scatter(xactPlot[:, 0], xactPlot[:, 1], yactPlot, color="black", s=0.8, alpha=0.25)
+
+                x0 = np.linspace(np.percentile(x[:,0], plot_start),np.percentile(x[:,0], plot_end), 1000)
+                x1 = np.linspace(np.percentile(x[:,1], plot_start),np.percentile(x[:,1], plot_end), 1000)
+                X0, X1 = np.meshgrid(x0, x1)
+                yPred = gamCstr.predict(np.array([X0.flatten(), X1.flatten()]).T)
+                ax.plot_surface(X0, X1, yPred.reshape(x0.shape[0], -1), color = 'red')
+                ax.set_label("Monotonic GAM spline with all data")
+
+            if x.shape[1] == 1:
+                plt.scatter(
+                    x,
+                    y,
+                    linestyle="None",
+                    color="gray",
+                    s=0.8,
+                    alpha=0.7,
+                    label="All data",
+                )
                 plt.plot(
                     x,
-                    y_uncstr,
-                    color="blue",
+                    y_cstr,
+                    color="red",
                     alpha=1.0,
-                    label="Unconstrained smooth spline",
+                    label="Monotonic GAM spline",
                 )
+                if incl_uncstr:
+                    plt.plot(
+                        x,
+                        y_uncstr,
+                        color="blue",
+                        alpha=1.0,
+                        label="Unconstrained GAM spline",
+                    )
+            plt.show()
+            plt.close()
+
+        def interp(x):
+            return gamCstr.predict(x)
+
+        return interp, y_cstr, wsse_cstr, y_uncstr, wsse_uncstr
+
+    if method == 'eilers':
+        # create binned and weighted x and y data
+        if bins:
+            if not np.isscalar(bins):
+                err_msg = "monotone_spline2 ERROR: bins value is not type scalar"
+                raise ValueError(err_msg)
+            N = bins
+            x_binned, y_binned, weights_binned_ = utils.avg_by_bin(
+                x, y, weights, bins
+            )
+
+        elif not bins:
+            N = len(x)
+            x_binned = x
+            y_binned = y
+            weights_binned_ = weights
+
+        # Prepare bases (Imat) and penalty
+        dd = 3
+        E = np.eye(N)
+        D3 = np.diff(E, n=dd, axis=0)
+        D1 = np.diff(E, n=1, axis=0)
+
+        # Monotone smoothing
+        ws = np.zeros(N - 1)
+        weights_binned = weights_binned_.reshape(len(weights_binned_), 1)
+        weights1 = 0.5 * weights_binned[1:, :] + 0.5 * weights_binned[:-1, :]
+        weights3 = (
+            0.25 * weights_binned[3:, :]
+            + 0.25 * weights_binned[2:-1, :]
+            + 0.25 * weights_binned[1:-2, :]
+            + 0.25 * weights_binned[:-3, :]
+        ).flatten()
+
+        for it in range(30):
+            Ws = np.diag(ws * kap)
+            mon_cof = np.linalg.solve(
+                E + 
+                lam * D3.T @ np.diag(weights3) @ D3 # weights3
+                + D1.T @ (Ws * weights1) @ D1,
+                y_binned,
+            )
+            print(ws)
+            print(mon_cof)
+            ws_new = (D1 @ mon_cof < 0.0) * 1
+            dw = np.sum(ws != ws_new)
+            ws = ws_new
+            if dw == 0:
+                break
+
+        # Monotonic and non monotonic fits
+        y_cstr = mon_cof
+        wsse_cstr = (weights_binned * ((y_cstr - y_binned) ** 2)).sum()
+        if incl_uncstr:
+            y_uncstr = np.linalg.solve(
+                E + lam * D3.T @ np.diag(weights3) @ D3, y_binned # weights3
+            )
+            wsse_uncstr = (weights_binned * ((y_uncstr - y_binned) ** 2)).sum()
         else:
+            y_uncstr = None
+            wsse_uncstr = None
+
+        # breakpoint()
+        # hist = HistGradientBoostingRegressor(max_depth = 1).fit(
+            # np.array([x_binned]).T, y_binned, weights_binned_)
+        # y_cstr2 = hist.predict(np.array([x_binned]).T)
+        # wsse_cstr2 = (weights_binned * ((y_cstr2 - y_binned) ** 2)).sum()
+        # iso = MultiIsotonicRegressor()
+        # iso = iso.fit(np.array([x_binned]).T, y_binned)
+        # y_cstr2 = iso.predict(np.array([x_binned]).T)
+        # wsse_cstr2 = (weights_binned * ((y_cstr2 - y_binned) ** 2)).sum()
+
+
+
+        print("Constrained: " + str(wsse_cstr))
+        # print("Constrained2: " + str(wsse_cstr2))
+        print("unconstrained: " + str(wsse_uncstr))
+
+        def mono_interp(x_vec, y_cstr_):
+            # replace last point in data with two copies further out to make smooth
+            # extrapolation
+            x_new = np.append(
+                x_binned[:-1], [1.005 * x_binned[-1], 1.01 * x_binned[-1]]
+            )
+            y_cstr_new = np.append(y_cstr_[:-1], [y_cstr_[-1], y_cstr_[-1]])
+            # Create interpolating cubic spline for interior points
+            inter_interpl = intp(x_new, y_cstr_new, kind="cubic")
+            y_pred = np.zeros_like(x_vec)
+            x_lt_min = x_vec < x_binned.min()
+            x_gt_max = x_vec > x_new.max()
+            x_inter = (x_vec >= x_binned.min()) & (x_vec <= x_new.max())
+            y_pred[x_inter] = inter_interpl(x_vec[x_inter])
+            # extrapolate the maximum for values above the maximum
+            y_pred[x_gt_max] = y_cstr_[-1]
+            # linear extrapolation of last two points for values below the min
+            slope = (y_cstr_[1] - y_cstr_[0]) / (x_binned[1] - x_binned[0])
+            intercept = y_cstr_[0] - slope * x_binned[0]
+            y_pred[x_lt_min] = slope * x_vec[x_lt_min] + intercept
+
+            return y_pred
+
+        if show_plot:
             plt.scatter(
-                x_binned,
-                y_binned,
+                x,
+                y,
                 linestyle="None",
-                color="black",
+                color="gray",
                 s=0.8,
                 alpha=0.7,
-                label="Binned data averages",
+                label="All data",
             )
-            plt.plot(
-                x,
-                mono_interp(x),
-                color="red",
-                alpha=1.0,
-                label="Monotonic smooth spline",
-            )
-            if incl_uncstr:
+            if not bins:
                 plt.plot(
+                    x,
+                    y_cstr,
+                    color="red",
+                    alpha=1.0,
+                    label="Monotonic smooth spline",
+                )
+                if incl_uncstr:
+                    plt.plot(
+                        x,
+                        y_uncstr,
+                        color="blue",
+                        alpha=1.0,
+                        label="Unconstrained smooth spline",
+                    )
+            else:
+                plt.scatter(
                     x_binned,
-                    y_uncstr,
-                    color="blue",
+                    y_binned,
+                    linestyle="None",
+                    color="black",
+                    s=4,
+                    alpha=0.7,
+                    label="Binned data averages",
+                )
+                plt.plot(
+                    x,
+                    mono_interp(x, y_cstr),
+                    color="red",
                     alpha=1.0,
-                    label="Unconstrained smooth spline",
+                    label="Original monotonic smooth spline",
                 )
-        plt.legend(loc="lower right")
-        plt.show()
-        plt.close()
+                plt.plot(
+                    x,
+                    ygam,
+                    color="green",
+                    alpha=1.0,
+                    label="New monotonic smooth spline",
+                )
+                if incl_uncstr:
+                    plt.plot(
+                        x_binned,
+                        y_uncstr,
+                        color="blue",
+                        alpha=1.0,
+                        label="Unconstrained smooth spline",
+                    )
+            plt.legend(loc="lower right")
+            plt.show()
+            plt.close()
+
+        return mono_interp, y_cstr, wsse_cstr, y_uncstr, wsse_uncstr
 
-    return mono_interp, y_cstr, wsse_cstr, y_uncstr, wsse_uncstr
+    err_msg = method + " method not supported, must be 'eilers' or 'pygam'"
+    raise ValueError(err_msg)
diff --git a/tests/test_txfunc.py b/tests/test_txfunc.py
index 578cbb0c2..41278b302 100644
--- a/tests/test_txfunc.py
+++ b/tests/test_txfunc.py
@@ -1,3 +1,5 @@
+import sys
+sys.path.append('/Users/praneet/developer/OG-Core/')
 from ogcore import txfunc
 from distributed import Client, LocalCluster
 import pytest
@@ -731,6 +733,56 @@ def test_monotone_spline():
     """
     # Simulate some data
     np.random.seed(10)
+
+    # another test case for pygam method:
+    
+    # N = 100
+    # xlo = 0.001
+    # xhi = 2 * np.pi
+    # x1 = np.arange(xlo, xhi, step=(xhi - xlo) / N)
+    # x2 = np.arange(xlo, xhi, step=(xhi - xlo) / N)
+    # x = np.array([x1, x2]).T
+    # X1, X2 = np.meshgrid(x1, x2)
+    # X1, X2 = X1.flatten(), X2.flatten()
+    # X = np.zeros((X1.shape[0], 2))
+    # X[:,0] = X1
+    # X[:,1] = X2
+    # y0 = np.exp(np.sin(X1)) * np.exp(np.cos(X2))
+    # y0 = np.cos(X1)/(1.01 + np.sin(X2))
+    # y0 = 1/(1 + np.exp(-(X1+X2)))
+    # y = y0 + np.random.random(y0.shape) * 0.2
+    # weights = np.ones(10000)
+    # (
+        # mono_interp,
+        # y_cstr,
+        # wsse_cstr,
+        # y_uncstr,
+        # wsse_uncstr,
+    # ) = txfunc.monotone_spline(
+        # X, y, weights, lam=100, incl_uncstr=True, show_plot=True, method='pygam'
+    # )
+
+    with open('test_io_data/micro_data_dict_for_tests.pkl', 'rb') as f:
+        data = pickle.load(f)['2030']
+    df = data[['total_labinc', 'total_capinc', 'etr', 'weight']]
+    df.replace([np.inf, -np.inf], np.nan, inplace=True)
+    df.dropna(inplace=True)
+
+    # Estimate monotonically increasing function on the data
+    (
+        mono_interp1,
+        y_cstr1,
+        wsse_cstr1,
+        y_uncstr1,
+        wsse_uncstr1,
+    ) = txfunc.monotone_spline(
+        df[['total_labinc', 'total_capinc']].values, df['etr'].values, df['weight'].values, bins=[100, 100], lam=100, incl_uncstr=True, 
+            show_plot=True, method='pygam', splines=[100, 100], plot_cutoff=True, plot_start=25, plot_end=75
+    )
+    # Test whether mono_interp is a function
+    assert hasattr(mono_interp1, "__call__")
+
+    # Simulate some data
     N = 100
     xlo = 0.001
     xhi = 2 * np.pi
@@ -739,23 +791,22 @@ def test_monotone_spline():
     y = y0 + np.random.randn(N) * 0.5
     weights = np.ones(N)
 
-    # Estimate monotonically increasing function on the data
     (
-        mono_interp,
-        y_cstr,
-        wsse_cstr,
-        y_uncstr,
-        wsse_uncstr,
+        mono_interp2,
+        y_cstr2,
+        wsse_cstr2,
+        y_uncstr2,
+        wsse_uncstr2,
     ) = txfunc.monotone_spline(
-        x, y, weights, bins=10, lam=100, incl_uncstr=True, show_plot=False
+        x, y, weights, bins=10, lam=100, incl_uncstr=True, show_plot=False, method='eilers'
     )
     # Test whether mono_interp is a function
-    assert hasattr(mono_interp, "__call__")
+    assert hasattr(mono_interp2, "__call__")
 
     # Test whether mono_interp gives the correct output
     x_vec_test = np.array([2.0, 5.0])
     y_vec_expected = np.array([0.69512331, 1.23822669])
-    y_monointerp = mono_interp(x_vec_test)
+    y_monointerp = mono_interp2(x_vec_test, y_cstr2)
     assert np.allclose(y_monointerp, y_vec_expected)
 
     # Test that y_cstr, wsse_cstr, y_uncstr, wsse_uncstr are correct
@@ -789,7 +840,8 @@ def test_monotone_spline():
         ]
     )
     wsse_uncstr_expected = 468.4894930349361
-    assert np.allclose(y_cstr, y_cstr_expected)
-    assert np.allclose(wsse_cstr, wsse_cstr_expected)
-    assert np.allclose(y_uncstr, y_uncstr_expected)
-    assert np.allclose(wsse_uncstr, wsse_uncstr_expected)
+    assert np.allclose(y_cstr2, y_cstr_expected)
+    assert np.allclose(wsse_cstr2, wsse_cstr_expected)
+    assert np.allclose(y_uncstr2, y_uncstr_expected)
+    assert np.allclose(wsse_uncstr2, wsse_uncstr_expected)
+        

From 7efa261b57b74d29db438f3f5224cbb936b8a480 Mon Sep 17 00:00:00 2001
From: prrathi <53785742+prrathi@users.noreply.github.com>
Date: Tue, 27 Dec 2022 15:24:24 -0600
Subject: [PATCH 02/18] restore original eilers method code

---
 ogcore/txfunc.py | 53 +++++++++++++-----------------------------------
 1 file changed, 14 insertions(+), 39 deletions(-)

diff --git a/ogcore/txfunc.py b/ogcore/txfunc.py
index 827cb18f6..b84a53dc5 100644
--- a/ogcore/txfunc.py
+++ b/ogcore/txfunc.py
@@ -1824,7 +1824,7 @@ def interp(x):
                 err_msg = "monotone_spline2 ERROR: bins value is not type scalar"
                 raise ValueError(err_msg)
             N = bins
-            x_binned, y_binned, weights_binned_ = utils.avg_by_bin(
+            x_binned, y_binned, weights_binned = utils.avg_by_bin(
                 x, y, weights, bins
             )
 
@@ -1832,7 +1832,7 @@ def interp(x):
             N = len(x)
             x_binned = x
             y_binned = y
-            weights_binned_ = weights
+            weights_binned = weights
 
         # Prepare bases (Imat) and penalty
         dd = 3
@@ -1842,7 +1842,7 @@ def interp(x):
 
         # Monotone smoothing
         ws = np.zeros(N - 1)
-        weights_binned = weights_binned_.reshape(len(weights_binned_), 1)
+        weights_binned = weights_binned.reshape(len(weights_binned), 1)
         weights1 = 0.5 * weights_binned[1:, :] + 0.5 * weights_binned[:-1, :]
         weights3 = (
             0.25 * weights_binned[3:, :]
@@ -1854,13 +1854,11 @@ def interp(x):
         for it in range(30):
             Ws = np.diag(ws * kap)
             mon_cof = np.linalg.solve(
-                E + 
-                lam * D3.T @ np.diag(weights3) @ D3 # weights3
+                E
+                + lam * D3.T @ np.diag(weights3) @ D3
                 + D1.T @ (Ws * weights1) @ D1,
                 y_binned,
             )
-            print(ws)
-            print(mon_cof)
             ws_new = (D1 @ mon_cof < 0.0) * 1
             dw = np.sum(ws != ws_new)
             ws = ws_new
@@ -1872,36 +1870,20 @@ def interp(x):
         wsse_cstr = (weights_binned * ((y_cstr - y_binned) ** 2)).sum()
         if incl_uncstr:
             y_uncstr = np.linalg.solve(
-                E + lam * D3.T @ np.diag(weights3) @ D3, y_binned # weights3
+                E + lam * D3.T @ np.diag(weights3) @ D3, y_binned
             )
             wsse_uncstr = (weights_binned * ((y_uncstr - y_binned) ** 2)).sum()
         else:
             y_uncstr = None
             wsse_uncstr = None
 
-        # breakpoint()
-        # hist = HistGradientBoostingRegressor(max_depth = 1).fit(
-            # np.array([x_binned]).T, y_binned, weights_binned_)
-        # y_cstr2 = hist.predict(np.array([x_binned]).T)
-        # wsse_cstr2 = (weights_binned * ((y_cstr2 - y_binned) ** 2)).sum()
-        # iso = MultiIsotonicRegressor()
-        # iso = iso.fit(np.array([x_binned]).T, y_binned)
-        # y_cstr2 = iso.predict(np.array([x_binned]).T)
-        # wsse_cstr2 = (weights_binned * ((y_cstr2 - y_binned) ** 2)).sum()
-
-
-
-        print("Constrained: " + str(wsse_cstr))
-        # print("Constrained2: " + str(wsse_cstr2))
-        print("unconstrained: " + str(wsse_uncstr))
-
-        def mono_interp(x_vec, y_cstr_):
+        def mono_interp(x_vec):
             # replace last point in data with two copies further out to make smooth
             # extrapolation
             x_new = np.append(
                 x_binned[:-1], [1.005 * x_binned[-1], 1.01 * x_binned[-1]]
             )
-            y_cstr_new = np.append(y_cstr_[:-1], [y_cstr_[-1], y_cstr_[-1]])
+            y_cstr_new = np.append(y_cstr[:-1], [y_cstr[-1], y_cstr[-1]])
             # Create interpolating cubic spline for interior points
             inter_interpl = intp(x_new, y_cstr_new, kind="cubic")
             y_pred = np.zeros_like(x_vec)
@@ -1910,10 +1892,10 @@ def mono_interp(x_vec, y_cstr_):
             x_inter = (x_vec >= x_binned.min()) & (x_vec <= x_new.max())
             y_pred[x_inter] = inter_interpl(x_vec[x_inter])
             # extrapolate the maximum for values above the maximum
-            y_pred[x_gt_max] = y_cstr_[-1]
+            y_pred[x_gt_max] = y_cstr[-1]
             # linear extrapolation of last two points for values below the min
-            slope = (y_cstr_[1] - y_cstr_[0]) / (x_binned[1] - x_binned[0])
-            intercept = y_cstr_[0] - slope * x_binned[0]
+            slope = (y_cstr[1] - y_cstr[0]) / (x_binned[1] - x_binned[0])
+            intercept = y_cstr[0] - slope * x_binned[0]
             y_pred[x_lt_min] = slope * x_vec[x_lt_min] + intercept
 
             return y_pred
@@ -1950,23 +1932,16 @@ def mono_interp(x_vec, y_cstr_):
                     y_binned,
                     linestyle="None",
                     color="black",
-                    s=4,
+                    s=0.8,
                     alpha=0.7,
                     label="Binned data averages",
                 )
                 plt.plot(
                     x,
-                    mono_interp(x, y_cstr),
+                    mono_interp(x),
                     color="red",
                     alpha=1.0,
-                    label="Original monotonic smooth spline",
-                )
-                plt.plot(
-                    x,
-                    ygam,
-                    color="green",
-                    alpha=1.0,
-                    label="New monotonic smooth spline",
+                    label="Monotonic smooth spline",
                 )
                 if incl_uncstr:
                     plt.plot(

From 7f3a8652e2fc0d99900c272ae7d2c8599022028e Mon Sep 17 00:00:00 2001
From: prrathi <53785742+prrathi@users.noreply.github.com>
Date: Tue, 27 Dec 2022 16:14:59 -0600
Subject: [PATCH 03/18] Update txfunc.py

---
 ogcore/txfunc.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/ogcore/txfunc.py b/ogcore/txfunc.py
index b84a53dc5..57f25808d 100644
--- a/ogcore/txfunc.py
+++ b/ogcore/txfunc.py
@@ -22,8 +22,6 @@
 from ogcore.constants import DEFAULT_START_YEAR, SHOW_RUNTIME
 from ogcore import utils
 import warnings
-from sklearn.ensemble import HistGradientBoostingRegressor
-from ogcore.isotonic import MultiIsotonicRegressor
 from pygam import LinearGAM, s, te
 from matplotlib import cm
 import random

From 0c9c48ec9b48c12dff9d7b1e9e8cd04fded3ae52 Mon Sep 17 00:00:00 2001
From: Praneet Rathi <prrathi10@gmail.com>
Date: Wed, 28 Dec 2022 09:20:58 -0600
Subject: [PATCH 04/18] update environment + other small changes

---
 tests/test_txfunc.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tests/test_txfunc.py b/tests/test_txfunc.py
index 41278b302..bfdb18378 100644
--- a/tests/test_txfunc.py
+++ b/tests/test_txfunc.py
@@ -777,10 +777,11 @@ def test_monotone_spline():
         wsse_uncstr1,
     ) = txfunc.monotone_spline(
         df[['total_labinc', 'total_capinc']].values, df['etr'].values, df['weight'].values, bins=[100, 100], lam=100, incl_uncstr=True, 
-            show_plot=True, method='pygam', splines=[100, 100], plot_cutoff=True, plot_start=25, plot_end=75
+            show_plot=False, method='pygam', splines=[100, 100], plot_start=25, plot_end=75
     )
     # Test whether mono_interp is a function
     assert hasattr(mono_interp1, "__call__")
+    # Not sure what baseline should be to add tests here for "correctness" of values
 
     # Simulate some data
     N = 100
@@ -806,7 +807,7 @@ def test_monotone_spline():
     # Test whether mono_interp gives the correct output
     x_vec_test = np.array([2.0, 5.0])
     y_vec_expected = np.array([0.69512331, 1.23822669])
-    y_monointerp = mono_interp2(x_vec_test, y_cstr2)
+    y_monointerp = mono_interp2(x_vec_test)
     assert np.allclose(y_monointerp, y_vec_expected)
 
     # Test that y_cstr, wsse_cstr, y_uncstr, wsse_uncstr are correct

From 1857336d49cd86c39f7bae25b1e5421e7487a249 Mon Sep 17 00:00:00 2001
From: Praneet Rathi <prrathi10@gmail.com>
Date: Wed, 28 Dec 2022 09:21:58 -0600
Subject: [PATCH 05/18] uncommitted files

---
 environment.yml  | 5 ++++-
 ogcore/txfunc.py | 6 ++----
 setup.py         | 2 ++
 3 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/environment.yml b/environment.yml
index 14c4ad5ba..0f9c2f0f2 100644
--- a/environment.yml
+++ b/environment.yml
@@ -20,4 +20,7 @@ dependencies:
 - coverage
 - requests
 - openpyxl
-- black
\ No newline at end of file
+- black
+- pip
+- pip:
+  - pygam
\ No newline at end of file
diff --git a/ogcore/txfunc.py b/ogcore/txfunc.py
index 57f25808d..43eda1e02 100644
--- a/ogcore/txfunc.py
+++ b/ogcore/txfunc.py
@@ -1635,7 +1635,7 @@ def tax_func_estimate(
     return dict_params
 
 
-def avg_by_bin_2d(x, y, bins, weights = None):
+def avg_by_bin_multd(x, y, bins, weights = None):
     """
     Args:
         x (numpy array): 2d with dimensions n by m used for binning
@@ -1729,7 +1729,7 @@ def monotone_spline(
         if bins == None:
             x_binned, y_binned, weights_binned = x, y, weights
         else:
-            x_binned, y_binned, weights_binned = avg_by_bin_2d(x, y, bins, weights)
+            x_binned, y_binned, weights_binned = avg_by_bin_multd(x, y, bins, weights)
 
         # setup pygam parameters- in addition to 's' spline terms, can also have 't' tensor 
         # terms which are interactions between two variables. 't' terms also need monotonic constraints
@@ -1761,8 +1761,6 @@ def monotone_spline(
         else:
             y_uncstr = None
             wsse_uncstr = None
-        print("Constrained: " + str(wsse_cstr))
-        print("Unconstrained: " + str(wsse_uncstr))
 
         if show_plot:
             if x.shape[1] == 2:
diff --git a/setup.py b/setup.py
index 2ed7fdfbb..7d133dead 100755
--- a/setup.py
+++ b/setup.py
@@ -34,6 +34,8 @@
         "distributed>=2.30.1",
         "paramtools>=0.15.0",
         "requests",
+        "pip",
+        "pygam",
     ],
     classifiers=[
         "Development Status :: 2 - Pre-Alpha",

From 9193e288e7c84c4e98e314f8a349daf3de5236c1 Mon Sep 17 00:00:00 2001
From: Praneet Rathi <prrathi10@gmail.com>
Date: Thu, 29 Dec 2022 10:52:16 -0600
Subject: [PATCH 06/18] black formatted files

---
 ogcore/txfunc.py     | 123 +++++++++++++++++++++++++++++--------------
 tests/test_txfunc.py |  46 ++++++++++------
 2 files changed, 114 insertions(+), 55 deletions(-)

diff --git a/ogcore/txfunc.py b/ogcore/txfunc.py
index 43eda1e02..72114c0ec 100644
--- a/ogcore/txfunc.py
+++ b/ogcore/txfunc.py
@@ -1635,28 +1635,30 @@ def tax_func_estimate(
     return dict_params
 
 
-def avg_by_bin_multd(x, y, bins, weights = None):
+def avg_by_bin_multd(x, y, bins, weights=None):
     """
     Args:
         x (numpy array): 2d with dimensions n by m used for binning
-        y (numpy array): 1d with length n 
+        y (numpy array): 1d with length n
         bins (numpy array): 1d with length m, each entry must divide n
             and is number of bins for corresponding column in x
-        weights (None or numpy array): 1d with length n specifying 
+        weights (None or numpy array): 1d with length n specifying
             weight of each observation. if None then array of ones
 
     Returns:
-        xNew (numpy array): 2d with second dimension m, first 
-            dimension is product of elements in bins, with each entry 
+        xNew (numpy array): 2d with second dimension m, first
+            dimension is product of elements in bins, with each entry
             representative of bin across all the features
         yNew (numpy array): 1d with length same as first dimension
-            of xWeight, weighted average of y's corresponding to each 
+            of xWeight, weighted average of y's corresponding to each
             entry of xWeight
         weightsNew (numpy array): 1d with length same as yNew, weight
             corresponding to each xNew, yNew row
     """
     if x.shape[1] != len(bins):
-        message = "Dimensions of x and bins don't match: {} != {}".format(x.shape[1], len(bins))
+        message = "Dimensions of x and bins don't match: {} != {}".format(
+            x.shape[1], len(bins)
+        )
         raise ValueError(message)
     else:
         size = np.prod(bins)
@@ -1671,9 +1673,13 @@ def avg_by_bin_multd(x, y, bins, weights = None):
             index = list(np.unravel_index(i, tupleBins))
             valid = np.ones(x.shape[0], dtype=bool)
             for j, v in enumerate(index):
-                valid &= (x[:, j] >= np.percentile(x[:, j], v*100/bins[j])) & (x[:, j] < np.percentile(x[:, j], (v+1)*100/bins[j]))
+                valid &= (
+                    x[:, j] >= np.percentile(x[:, j], v * 100 / bins[j])
+                ) & (x[:, j] < np.percentile(x[:, j], (v + 1) * 100 / bins[j]))
             if np.sum(valid) != 0:
-                xNew[i, :] = np.average(x[valid], axis=0, weights=weights[valid])
+                xNew[i, :] = np.average(
+                    x[valid], axis=0, weights=weights[valid]
+                )
                 yNew[i] = np.average(y[valid], axis=0, weights=weights[valid])
                 weightsNew[i] = np.sum(weights[valid])
         xNew = xNew[~(weightsNew == 0)]
@@ -1691,37 +1697,39 @@ def monotone_spline(
     kap=1e7,
     incl_uncstr=False,
     show_plot=False,
-    method='eilers',
+    method="eilers",
     splines=None,
     plot_start=0,
-    plot_end=100
+    plot_end=100,
 ):
     """
     New args:
         method (string): 'eilers' (old version) or 'pygam' (new version)
-        splines (None or array-like): for 'pygam' only (otherwise set None), 
+        splines (None or array-like): for 'pygam' only (otherwise set None),
             number of splines used for each feature, if None use default
-        plot_start/plot_end (number between 0, 100): for 'pygam' only if show_plot = True, 
+        plot_start/plot_end (number between 0, 100): for 'pygam' only if show_plot = True,
             start and end for percentile of data used in plot, can result in
             better visualizations if original data has strong outliers
 
     Returns:
-        xNew (numpy array): 2d with second dimension m, first 
-            dimension is product of elements in bins, with each entry 
+        xNew (numpy array): 2d with second dimension m, first
+            dimension is product of elements in bins, with each entry
             representative of bin across all the features
         yNew (numpy array): 1d with length same as first dimension
-            of xWeight, weighted average of y's corresponding to each 
+            of xWeight, weighted average of y's corresponding to each
             entry of xWeight
         weightsNew (numpy array): 1d with length same as yNew, weight
             corresponding to each xNew, yNew row
     """
 
-    if method == 'pygam':
+    if method == "pygam":
         if splines != None and len(splines) != x.shape[1]:
             err_msg = (
-                " pygam method requires splines to be None or " + 
-                " same length as # of columns in x, " + len(splines)
-                 + " != " + str(x.shape[1])
+                " pygam method requires splines to be None or "
+                + " same length as # of columns in x, "
+                + len(splines)
+                + " != "
+                + str(x.shape[1])
             )
             raise ValueError(err_msg)
 
@@ -1729,23 +1737,27 @@ def monotone_spline(
         if bins == None:
             x_binned, y_binned, weights_binned = x, y, weights
         else:
-            x_binned, y_binned, weights_binned = avg_by_bin_multd(x, y, bins, weights)
+            x_binned, y_binned, weights_binned = avg_by_bin_multd(
+                x, y, bins, weights
+            )
 
-        # setup pygam parameters- in addition to 's' spline terms, can also have 't' tensor 
+        # setup pygam parameters- in addition to 's' spline terms, can also have 't' tensor
         # terms which are interactions between two variables. 't' terms also need monotonic constraints
         # to satisfy previous constraints, they actually impose stronger restriction
 
         if splines == None:
-            tempCstr = s(0, constraints='monotonic_inc')
+            tempCstr = s(0, constraints="monotonic_inc")
             for i in range(1, x_binned.shape[1]):
-                tempCstr += s(i, constraints='monotonic_inc')
+                tempCstr += s(i, constraints="monotonic_inc")
             tempUncstr = s(0)
             for i in range(1, x_binned.shape[1]):
                 tempUncstr += s(i)
         else:
-            tempCstr = s(0, constraints='monotonic_inc', n_splines=splines[0])
+            tempCstr = s(0, constraints="monotonic_inc", n_splines=splines[0])
             for i in range(1, x_binned.shape[1]):
-                tempCstr += s(i, constraints='monotonic_inc', n_splines=splines[i])
+                tempCstr += s(
+                    i, constraints="monotonic_inc", n_splines=splines[i]
+                )
             tempUncstr = s(0, n_splines=splines[0])
             for i in range(1, x_binned.shape[1]):
                 tempUncstr += s(i, n_splines=splines[i])
@@ -1755,7 +1767,9 @@ def monotone_spline(
         y_cstr = gamCstr.predict(x_binned)
         wsse_cstr = (weights_binned * ((y_cstr - y_binned) ** 2)).sum()
         if incl_uncstr:
-            gamUncstr = LinearGAM(tempUncstr).fit(x_binned, y_binned, weights_binned)
+            gamUncstr = LinearGAM(tempUncstr).fit(
+                x_binned, y_binned, weights_binned
+            )
             y_uncstr = gamUncstr.predict(x_binned)
             wsse_uncstr = (weights_binned * ((y_uncstr - y_binned) ** 2)).sum()
         else:
@@ -1766,18 +1780,45 @@ def monotone_spline(
             if x.shape[1] == 2:
                 fig, ax = plt.subplots(subplot_kw={"projection": "3d"})
                 # select data in [plot_start, end] percentile across both features
-                # this can be rewritten to generalize for n-dimensions, but didn't know how to plot that 
-                xactPlot = x[(x[:,0] >= np.percentile(x[:,0], plot_start)) & (x[:,0] <= np.percentile(x[:,0], plot_end)) & 
-                    (x[:,1] >= np.percentile(x[:,1], plot_start)) & (x[:,1] <= np.percentile(x[:,1], plot_end))]
-                yactPlot = y[(x[:,0] >= np.percentile(x[:,0], plot_start)) & (x[:,0] <= np.percentile(x[:,0], plot_end)) & 
-                    (x[:,1] >= np.percentile(x[:,1], plot_start)) & (x[:,1] <= np.percentile(x[:,1], plot_end))]
-                ax.scatter(xactPlot[:, 0], xactPlot[:, 1], yactPlot, color="black", s=0.8, alpha=0.25)
-
-                x0 = np.linspace(np.percentile(x[:,0], plot_start),np.percentile(x[:,0], plot_end), 1000)
-                x1 = np.linspace(np.percentile(x[:,1], plot_start),np.percentile(x[:,1], plot_end), 1000)
+                # this can be rewritten to generalize for n-dimensions, but didn't know how to plot that
+                xactPlot = x[
+                    (x[:, 0] >= np.percentile(x[:, 0], plot_start))
+                    & (x[:, 0] <= np.percentile(x[:, 0], plot_end))
+                    & (x[:, 1] >= np.percentile(x[:, 1], plot_start))
+                    & (x[:, 1] <= np.percentile(x[:, 1], plot_end))
+                ]
+                yactPlot = y[
+                    (x[:, 0] >= np.percentile(x[:, 0], plot_start))
+                    & (x[:, 0] <= np.percentile(x[:, 0], plot_end))
+                    & (x[:, 1] >= np.percentile(x[:, 1], plot_start))
+                    & (x[:, 1] <= np.percentile(x[:, 1], plot_end))
+                ]
+                ax.scatter(
+                    xactPlot[:, 0],
+                    xactPlot[:, 1],
+                    yactPlot,
+                    color="black",
+                    s=0.8,
+                    alpha=0.25,
+                )
+
+                x0 = np.linspace(
+                    np.percentile(x[:, 0], plot_start),
+                    np.percentile(x[:, 0], plot_end),
+                    1000,
+                )
+                x1 = np.linspace(
+                    np.percentile(x[:, 1], plot_start),
+                    np.percentile(x[:, 1], plot_end),
+                    1000,
+                )
                 X0, X1 = np.meshgrid(x0, x1)
-                yPred = gamCstr.predict(np.array([X0.flatten(), X1.flatten()]).T)
-                ax.plot_surface(X0, X1, yPred.reshape(x0.shape[0], -1), color = 'red')
+                yPred = gamCstr.predict(
+                    np.array([X0.flatten(), X1.flatten()]).T
+                )
+                ax.plot_surface(
+                    X0, X1, yPred.reshape(x0.shape[0], -1), color="red"
+                )
                 ax.set_label("Monotonic GAM spline with all data")
 
             if x.shape[1] == 1:
@@ -1813,11 +1854,13 @@ def interp(x):
 
         return interp, y_cstr, wsse_cstr, y_uncstr, wsse_uncstr
 
-    if method == 'eilers':
+    if method == "eilers":
         # create binned and weighted x and y data
         if bins:
             if not np.isscalar(bins):
-                err_msg = "monotone_spline2 ERROR: bins value is not type scalar"
+                err_msg = (
+                    "monotone_spline2 ERROR: bins value is not type scalar"
+                )
                 raise ValueError(err_msg)
             N = bins
             x_binned, y_binned, weights_binned = utils.avg_by_bin(
diff --git a/tests/test_txfunc.py b/tests/test_txfunc.py
index bfdb18378..5bf43efa8 100644
--- a/tests/test_txfunc.py
+++ b/tests/test_txfunc.py
@@ -1,5 +1,6 @@
 import sys
-sys.path.append('/Users/praneet/developer/OG-Core/')
+
+sys.path.append("/Users/praneet/developer/OG-Core/")
 from ogcore import txfunc
 from distributed import Client, LocalCluster
 import pytest
@@ -735,7 +736,7 @@ def test_monotone_spline():
     np.random.seed(10)
 
     # another test case for pygam method:
-    
+
     # N = 100
     # xlo = 0.001
     # xhi = 2 * np.pi
@@ -753,18 +754,18 @@ def test_monotone_spline():
     # y = y0 + np.random.random(y0.shape) * 0.2
     # weights = np.ones(10000)
     # (
-        # mono_interp,
-        # y_cstr,
-        # wsse_cstr,
-        # y_uncstr,
-        # wsse_uncstr,
+    # mono_interp,
+    # y_cstr,
+    # wsse_cstr,
+    # y_uncstr,
+    # wsse_uncstr,
     # ) = txfunc.monotone_spline(
-        # X, y, weights, lam=100, incl_uncstr=True, show_plot=True, method='pygam'
+    # X, y, weights, lam=100, incl_uncstr=True, show_plot=True, method='pygam'
     # )
 
-    with open('test_io_data/micro_data_dict_for_tests.pkl', 'rb') as f:
-        data = pickle.load(f)['2030']
-    df = data[['total_labinc', 'total_capinc', 'etr', 'weight']]
+    with open("test_io_data/micro_data_dict_for_tests.pkl", "rb") as f:
+        data = pickle.load(f)["2030"]
+    df = data[["total_labinc", "total_capinc", "etr", "weight"]]
     df.replace([np.inf, -np.inf], np.nan, inplace=True)
     df.dropna(inplace=True)
 
@@ -776,8 +777,17 @@ def test_monotone_spline():
         y_uncstr1,
         wsse_uncstr1,
     ) = txfunc.monotone_spline(
-        df[['total_labinc', 'total_capinc']].values, df['etr'].values, df['weight'].values, bins=[100, 100], lam=100, incl_uncstr=True, 
-            show_plot=False, method='pygam', splines=[100, 100], plot_start=25, plot_end=75
+        df[["total_labinc", "total_capinc"]].values,
+        df["etr"].values,
+        df["weight"].values,
+        bins=[100, 100],
+        lam=100,
+        incl_uncstr=True,
+        show_plot=False,
+        method="pygam",
+        splines=[100, 100],
+        plot_start=25,
+        plot_end=75,
     )
     # Test whether mono_interp is a function
     assert hasattr(mono_interp1, "__call__")
@@ -799,7 +809,14 @@ def test_monotone_spline():
         y_uncstr2,
         wsse_uncstr2,
     ) = txfunc.monotone_spline(
-        x, y, weights, bins=10, lam=100, incl_uncstr=True, show_plot=False, method='eilers'
+        x,
+        y,
+        weights,
+        bins=10,
+        lam=100,
+        incl_uncstr=True,
+        show_plot=False,
+        method="eilers",
     )
     # Test whether mono_interp is a function
     assert hasattr(mono_interp2, "__call__")
@@ -845,4 +862,3 @@ def test_monotone_spline():
     assert np.allclose(wsse_cstr2, wsse_cstr_expected)
     assert np.allclose(y_uncstr2, y_uncstr_expected)
     assert np.allclose(wsse_uncstr2, wsse_uncstr_expected)
-        

From 30cb743bdf3c4a6075d1cfa5759102c924420160 Mon Sep 17 00:00:00 2001
From: prrathi <53785742+prrathi@users.noreply.github.com>
Date: Wed, 1 Mar 2023 20:27:23 -0600
Subject: [PATCH 07/18] remove my local path

---
 tests/test_txfunc.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/test_txfunc.py b/tests/test_txfunc.py
index bf54fae2a..76300c85a 100644
--- a/tests/test_txfunc.py
+++ b/tests/test_txfunc.py
@@ -1,6 +1,5 @@
 import sys
 
-sys.path.append("/Users/praneet/developer/OG-Core/")
 from ogcore import txfunc
 from distributed import Client, LocalCluster
 import pytest

From 6887d02ed6165199537cd9ebe73480e1fd8c99b6 Mon Sep 17 00:00:00 2001
From: prrathi <53785742+prrathi@users.noreply.github.com>
Date: Thu, 2 Mar 2023 21:44:20 -0600
Subject: [PATCH 08/18] suggested file read changes

---
 tests/test_txfunc.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tests/test_txfunc.py b/tests/test_txfunc.py
index 76300c85a..5b83d983f 100644
--- a/tests/test_txfunc.py
+++ b/tests/test_txfunc.py
@@ -762,8 +762,9 @@ def test_monotone_spline():
     # X, y, weights, lam=100, incl_uncstr=True, show_plot=True, method='pygam'
     # )
 
-    with open("test_io_data/micro_data_dict_for_tests.pkl", "rb") as f:
-        data = pickle.load(f)["2030"]
+    data = utils.safe_read_pickle(
+        os.path.join(CUR_PATH, "test_io_data", "micro_data_dict_for_tests.pkl")
+    )["2030"]
     df = data[["total_labinc", "total_capinc", "etr", "weight"]]
     df.replace([np.inf, -np.inf], np.nan, inplace=True)
     df.dropna(inplace=True)

From 3de3c813770dfd28d8fb0fe8ffee3877a4bba403 Mon Sep 17 00:00:00 2001
From: Praneet Rathi <prrathi10@gmail.com>
Date: Sat, 4 Mar 2023 14:00:15 -0600
Subject: [PATCH 09/18] add 2dmono option

---
 ogcore/default_parameters.json |  3 ++-
 ogcore/txfunc.py               | 22 +++++++++++++++++++---
 tests/test_txfunc.py           |  5 +++--
 3 files changed, 24 insertions(+), 6 deletions(-)

diff --git a/ogcore/default_parameters.json b/ogcore/default_parameters.json
index 61af73915..e8f5ed580 100644
--- a/ogcore/default_parameters.json
+++ b/ogcore/default_parameters.json
@@ -3146,7 +3146,8 @@
                     "DEP_totalinc",
                     "GS",
                     "linear",
-                    "mono"
+                    "mono",
+                    "mono2D"
                 ]
             }
         }
diff --git a/ogcore/txfunc.py b/ogcore/txfunc.py
index d5d60d42c..bc4beefc2 100644
--- a/ogcore/txfunc.py
+++ b/ogcore/txfunc.py
@@ -253,7 +253,9 @@ def get_tax_rates(
     elif tax_func_type == "mono":
         mono_interp = params[0]
         txrates = mono_interp(income)
-
+    elif tax_func_type == "mono2D":
+        mono_interp = params[0]
+        txrates = mono_interp([[X, Y]])
     return txrates
 
 
@@ -691,11 +693,23 @@ def txfunc_est(
         wsse = wsse_cstr
         params = mono_interp
         params_to_plot = params
+    elif tax_func_type == "mono2D":
+        mono_interp, _, wsse_cstr, _, _ = monotone_spline(
+            df[["total_labinc", "total_capinc"]].values,
+            df["etr"].values,
+            df["weight"].values,
+            bins=[100, 100],
+            method="pygam",
+            splines=[100, 100],
+        )
+        wsse = wsse_cstr
+        params = mono_interp
+        params_to_plot = params
     else:
         raise RuntimeError(
             "Choice of tax function is not in the set of"
             + " possible tax functions.  Please select"
-            + " from: DEP, DEP_totalinc, GS, linear."
+            + " from: DEP, DEP_totalinc, GS, linear, mono, mono2D."
         )
     if graph:
         pp.txfunc_graph(
@@ -1722,11 +1736,13 @@ def monotone_spline(
     """
 
     if method == "pygam":
+        if len(x.shape) == 1:
+            x = np.expand_dims(x, axis=1)
         if splines != None and len(splines) != x.shape[1]:
             err_msg = (
                 " pygam method requires splines to be None or "
                 + " same length as # of columns in x, "
-                + len(splines)
+                + str(len(splines))
                 + " != "
                 + str(x.shape[1])
             )
diff --git a/tests/test_txfunc.py b/tests/test_txfunc.py
index 5b83d983f..f4d76e966 100644
--- a/tests/test_txfunc.py
+++ b/tests/test_txfunc.py
@@ -765,7 +765,7 @@ def test_monotone_spline():
     data = utils.safe_read_pickle(
         os.path.join(CUR_PATH, "test_io_data", "micro_data_dict_for_tests.pkl")
     )["2030"]
-    df = data[["total_labinc", "total_capinc", "etr", "weight"]]
+    df = data[["total_labinc", "total_capinc", "etr", "weight"]].copy()
     df.replace([np.inf, -np.inf], np.nan, inplace=True)
     df.dropna(inplace=True)
 
@@ -781,7 +781,6 @@ def test_monotone_spline():
         df["etr"].values,
         df["weight"].values,
         bins=[100, 100],
-        lam=100,
         incl_uncstr=True,
         show_plot=False,
         method="pygam",
@@ -821,6 +820,8 @@ def test_monotone_spline():
     # Test whether mono_interp is a function
     assert hasattr(mono_interp2, "__call__")
 
+    # Test whether mono_interp is a function
+    assert hasattr(mono_interp2, "__call__")
     # Test whether mono_interp gives the correct output
     x_vec_test = np.array([2.0, 5.0])
     y_vec_expected = np.array([0.69512331, 1.23822669])

From f96dae545621a3b9f4bb638eea269617884cd17b Mon Sep 17 00:00:00 2001
From: Praneet Rathi <prrathi10@gmail.com>
Date: Sat, 4 Mar 2023 14:25:52 -0600
Subject: [PATCH 10/18] small bug

---
 tests/test_parameters.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_parameters.py b/tests/test_parameters.py
index 741721fd0..b708638e7 100644
--- a/tests/test_parameters.py
+++ b/tests/test_parameters.py
@@ -157,7 +157,7 @@ def test_implement_bad_reform2():
     assert len(specs.errors) > 0
     assert specs.errors["tax_func_type"][0] == (
         'tax_func_type "not_a_functional_form" must be in list of '
-        + "choices DEP, DEP_totalinc, GS, linear, mono."
+        + "choices DEP, DEP_totalinc, GS, linear, mono, mono2D."
     )
 
 

From ed3ec31105b2445b5e123f8f9d403eeca340f505 Mon Sep 17 00:00:00 2001
From: jdebacker <jason.debacker@gmail.com>
Date: Thu, 9 Mar 2023 21:34:38 -0500
Subject: [PATCH 11/18] return obs

---
 ogcore/txfunc.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/ogcore/txfunc.py b/ogcore/txfunc.py
index bc4beefc2..9c48e01c8 100644
--- a/ogcore/txfunc.py
+++ b/ogcore/txfunc.py
@@ -694,6 +694,7 @@ def txfunc_est(
         params = mono_interp
         params_to_plot = params
     elif tax_func_type == "mono2D":
+        obs = df.shape[0]
         mono_interp, _, wsse_cstr, _, _ = monotone_spline(
             df[["total_labinc", "total_capinc"]].values,
             df["etr"].values,
@@ -1328,6 +1329,7 @@ def tax_func_estimate(
         "GS": 3,
         "linear": 1,
         "mono": 1,
+        "mono2D": 1,
     }
     numparams = int(tax_func_type_num_params_dict[tax_func_type])
     years_list = np.arange(start_year, end_yr + 1)

From 5171b3d28f197d9351e6720aff2cc14aa9b5a7c4 Mon Sep 17 00:00:00 2001
From: jdebacker <jason.debacker@gmail.com>
Date: Fri, 10 Mar 2023 12:22:15 -0500
Subject: [PATCH 12/18] use cloudpickle

---
 ogcore/txfunc.py | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/ogcore/txfunc.py b/ogcore/txfunc.py
index 9c48e01c8..784176c81 100644
--- a/ogcore/txfunc.py
+++ b/ogcore/txfunc.py
@@ -16,6 +16,7 @@
 from dask import delayed, compute
 import dask.multiprocessing
 import pickle
+import cloudpickle
 from scipy.interpolate import interp1d as intp
 import matplotlib.pyplot as plt
 import ogcore.parameter_plots as pp
@@ -696,9 +697,12 @@ def txfunc_est(
     elif tax_func_type == "mono2D":
         obs = df.shape[0]
         mono_interp, _, wsse_cstr, _, _ = monotone_spline(
-            df[["total_labinc", "total_capinc"]].values,
-            df["etr"].values,
-            df["weight"].values,
+            # df[["total_labinc", "total_capinc"]].values,
+            # df["etr"].values,
+            # df["weight"].values,
+            np.vstack((X,Y)).T,
+            # X, Y,
+            txrates, wgts,
             bins=[100, 100],
             method="pygam",
             splines=[100, 100],
@@ -1644,8 +1648,12 @@ def tax_func_estimate(
     )
 
     if tax_func_path:
-        with open(tax_func_path, "wb") as f:
-            pickle.dump(dict_params, f)
+        try:
+            with open(tax_func_path, "wb") as f:
+                pickle.dump(dict_params, f)
+        except AttributeError:
+            with open(tax_func_path, "wb") as f:
+                cloudpickle.dump(dict_params, f)
 
     return dict_params
 

From 71e354d40c172e1363a16f059e62fa396b3b16c9 Mon Sep 17 00:00:00 2001
From: prrathi <53785742+prrathi@users.noreply.github.com>
Date: Mon, 20 Mar 2023 16:49:39 -0500
Subject: [PATCH 13/18] Update txfunc.py

---
 ogcore/txfunc.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ogcore/txfunc.py b/ogcore/txfunc.py
index 784176c81..ef9e31f1c 100644
--- a/ogcore/txfunc.py
+++ b/ogcore/txfunc.py
@@ -1727,7 +1727,7 @@ def monotone_spline(
 ):
     """
     New args:
-        method (string): 'eilers' (old version) or 'pygam' (new version)
+        method (string): 'eilers' or 'pygam'
         splines (None or array-like): for 'pygam' only (otherwise set None),
             number of splines used for each feature, if None use default
         plot_start/plot_end (number between 0, 100): for 'pygam' only if show_plot = True,

From f8302d4b518ebb8d57e5a92c6e8ff487a860af90 Mon Sep 17 00:00:00 2001
From: prrathi <53785742+prrathi@users.noreply.github.com>
Date: Mon, 20 Mar 2023 16:50:40 -0500
Subject: [PATCH 14/18] Update txfunc.py

---
 ogcore/txfunc.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ogcore/txfunc.py b/ogcore/txfunc.py
index ef9e31f1c..78804935a 100644
--- a/ogcore/txfunc.py
+++ b/ogcore/txfunc.py
@@ -1726,7 +1726,7 @@ def monotone_spline(
     plot_end=100,
 ):
     """
-    New args:
+    Args:
         method (string): 'eilers' or 'pygam'
         splines (None or array-like): for 'pygam' only (otherwise set None),
             number of splines used for each feature, if None use default

From 310670fc932947a4bea4b51658c8815173475b55 Mon Sep 17 00:00:00 2001
From: Praneet Rathi <prrathi10@gmail.com>
Date: Sun, 16 Apr 2023 17:21:44 -0500
Subject: [PATCH 15/18] black formatting + txfunc update

---
 ogcore/txfunc.py | 60 ++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 58 insertions(+), 2 deletions(-)

diff --git a/ogcore/txfunc.py b/ogcore/txfunc.py
index 4234d9a61..3b5bcf0b4 100644
--- a/ogcore/txfunc.py
+++ b/ogcore/txfunc.py
@@ -307,8 +307,64 @@ def get_tax_rates(
                 ]
         txrates = np.array(txrates)
     elif tax_func_type == "mono2D":
-        mono_interp = params[0]
-        txrates = mono_interp([[X, Y]])
+        if for_estimation:
+            mono_interp = params[0]
+            txrates = mono_interp([[X, Y]])
+        else:
+            if np.isscalar(X) and np.isscalar(Y):
+                txrates = params[0]([[X, Y]])
+            elif X.ndim == 1 and np.isscalar(Y):
+                if (X.shape[0] == len(params)) and (
+                    len(params) > 1
+                ):  # for case where loops over S
+                    txrates = [
+                        params[s][0]([[X[s], Y]]) for s in range(income.shape[0])
+                    ]
+                else:
+                    txrates = [
+                        params[0](income[i]) for i in range(income.shape[0])
+                    ]
+            elif np.isscalar(X) and Y.ndim == 1:
+                if (Y.shape[0] == len(params)) and (
+                    len(params) > 1
+                ):  # for case where loops over S
+                    txrates = [
+                        params[s][0]([[X, Y[s]]]) for s in range(income.shape[0])
+                    ]
+                else:
+                    txrates = [
+                        params[0](income[i]) for i in range(income.shape[0])
+                    ]
+            elif X.ndim == 1 and Y.ndim == 1:
+                if (X.shape[0] == Y.shape[0] == len(params)) and (len(params) > 1):
+                    txrates = [
+                        params[s][0]([[X[s], Y[s]]])
+                        for s in range(X.shape[0])
+                    ]
+                else:
+                    txrates = [
+                        params[0]([[X[i], Y[i]]]) for i in range(X.shape[0])
+                    ]
+            elif X.ndim == 2 and Y.ndim == 2:
+                txrates = [
+                    [
+                        params[s][j][0]([[X[s, j], Y[s, j]]])
+                        for j in range(X.shape[1])
+                    ]
+                    for s in range(X.shape[0])
+                ]
+            else:  # to catch 3D arrays, looping over T, S, J
+                txrates = [
+                    [
+                        [
+                            params[t][s][j][0]([[X[t, s, j], Y[t, s, j]]])
+                            for j in range(income.shape[2])
+                        ]
+                        for s in range(income.shape[1])
+                    ]
+                    for t in range(income.shape[0])
+                ]
+        txrates = np.array(txrates)
         
     return txrates
 

From fa2362275c3ca18ef34d43cc1adbc3d96d58bc4e Mon Sep 17 00:00:00 2001
From: Praneet Rathi <prrathi10@gmail.com>
Date: Sun, 16 Apr 2023 17:22:20 -0500
Subject: [PATCH 16/18] black

---
 ogcore/txfunc.py | 31 +++++++++++++++++++------------
 1 file changed, 19 insertions(+), 12 deletions(-)

diff --git a/ogcore/txfunc.py b/ogcore/txfunc.py
index 3b5bcf0b4..ebf8c2f55 100644
--- a/ogcore/txfunc.py
+++ b/ogcore/txfunc.py
@@ -318,7 +318,8 @@ def get_tax_rates(
                     len(params) > 1
                 ):  # for case where loops over S
                     txrates = [
-                        params[s][0]([[X[s], Y]]) for s in range(income.shape[0])
+                        params[s][0]([[X[s], Y]])
+                        for s in range(income.shape[0])
                     ]
                 else:
                     txrates = [
@@ -329,17 +330,19 @@ def get_tax_rates(
                     len(params) > 1
                 ):  # for case where loops over S
                     txrates = [
-                        params[s][0]([[X, Y[s]]]) for s in range(income.shape[0])
+                        params[s][0]([[X, Y[s]]])
+                        for s in range(income.shape[0])
                     ]
                 else:
                     txrates = [
                         params[0](income[i]) for i in range(income.shape[0])
                     ]
             elif X.ndim == 1 and Y.ndim == 1:
-                if (X.shape[0] == Y.shape[0] == len(params)) and (len(params) > 1):
+                if (X.shape[0] == Y.shape[0] == len(params)) and (
+                    len(params) > 1
+                ):
                     txrates = [
-                        params[s][0]([[X[s], Y[s]]])
-                        for s in range(X.shape[0])
+                        params[s][0]([[X[s], Y[s]]]) for s in range(X.shape[0])
                     ]
                 else:
                     txrates = [
@@ -365,7 +368,7 @@ def get_tax_rates(
                     for t in range(income.shape[0])
                 ]
         txrates = np.array(txrates)
-        
+
     return txrates
 
 
@@ -809,9 +812,10 @@ def txfunc_est(
             # df[["total_labinc", "total_capinc"]].values,
             # df["etr"].values,
             # df["weight"].values,
-            np.vstack((X,Y)).T,
+            np.vstack((X, Y)).T,
             # X, Y,
-            txrates, wgts,
+            txrates,
+            wgts,
             bins=[100, 100],
             method="pygam",
             splines=[100, 100],
@@ -1841,7 +1845,7 @@ def monotone_spline(
         plot_start/plot_end (number between 0, 100): for 'pygam' only if show_plot = True,
             start and end for percentile of data used in plot, can result in
             better visualizations if original data has strong outliers
-    
+
 
     Returns:
         xNew (numpy array): 2d with second dimension m, first
@@ -1853,7 +1857,6 @@ def monotone_spline(
         weightsNew (numpy array): 1d with length same as yNew, weight
             corresponding to each xNew, yNew row
     """
-   
 
     if method == "pygam":
         if len(x.shape) == 1:
@@ -1993,10 +1996,14 @@ def interp(x):
         # create binned and weighted x and y data
         if bins:
             if not np.isscalar(bins):
-                err_msg = "monotone_spline2 ERROR: bins value is not type scalar"
+                err_msg = (
+                    "monotone_spline2 ERROR: bins value is not type scalar"
+                )
                 raise ValueError(err_msg)
             N = int(bins)
-            x_binned, y_binned, weights_binned = utils.avg_by_bin(x, y, weights, N)
+            x_binned, y_binned, weights_binned = utils.avg_by_bin(
+                x, y, weights, N
+            )
 
         elif not bins:
             N = len(x)

From 3ca2f4afe5829e955cb12ce5a92eae54a2f122a9 Mon Sep 17 00:00:00 2001
From: Richard Evans <rickecon@gmail.com>
Date: Fri, 21 Apr 2023 16:17:32 -0600
Subject: [PATCH 17/18] Made jdebacker updates to txfunc.py

---
 ogcore/txfunc.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ogcore/txfunc.py b/ogcore/txfunc.py
index ebf8c2f55..c58553773 100644
--- a/ogcore/txfunc.py
+++ b/ogcore/txfunc.py
@@ -367,7 +367,7 @@ def get_tax_rates(
                     ]
                     for t in range(income.shape[0])
                 ]
-        txrates = np.array(txrates)
+        txrates = np.squeeze(np.array(txrates))
 
     return txrates
 
@@ -821,7 +821,7 @@ def txfunc_est(
             splines=[100, 100],
         )
         wsse = wsse_cstr
-        params = mono_interp
+        params = [mono_interp]
         params_to_plot = params
     else:
         raise RuntimeError(

From 1c93aff7a5d2340468c92c715ee68b013597cca3 Mon Sep 17 00:00:00 2001
From: Richard Evans <rickecon@gmail.com>
Date: Fri, 21 Apr 2023 16:19:13 -0600
Subject: [PATCH 18/18] Updated version number in __init__.py and setup.py

---
 ogcore/__init__.py | 2 +-
 setup.py           | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/ogcore/__init__.py b/ogcore/__init__.py
index 6b1322859..cc94f5a10 100644
--- a/ogcore/__init__.py
+++ b/ogcore/__init__.py
@@ -19,4 +19,4 @@
 from ogcore.txfunc import *
 from ogcore.utils import *
 
-__version__ = "0.10.7"
+__version__ = "0.10.8"
diff --git a/setup.py b/setup.py
index 9e5e104c7..1f625de4c 100755
--- a/setup.py
+++ b/setup.py
@@ -5,7 +5,7 @@
 
 setuptools.setup(
     name="ogcore",
-    version="0.10.7",
+    version="0.10.8",
     author="Jason DeBacker and Richard W. Evans",
     license="CC0 1.0 Universal (CC0 1.0) Public Domain Dedication",
     description="A general equilibribum overlapping generations model for fiscal policy analysis",