From 273187b75d790b1d41796d82c36545894a44c8dd Mon Sep 17 00:00:00 2001 From: aloctavodia Date: Sat, 25 Jan 2020 09:41:51 -0300 Subject: [PATCH 01/24] update from master --- pymc3/distributions/__init__.py | 4 + pymc3/distributions/bart.py | 263 +++++++++++++++++ pymc3/distributions/tree.py | 508 ++++++++++++++++++++++++++++++++ pymc3/model.py | 35 +-- pymc3/sampling.py | 22 +- pymc3/step_methods/__init__.py | 2 + pymc3/step_methods/hmc/nuts.py | 6 +- pymc3/step_methods/pgbart.py | 214 ++++++++++++++ 8 files changed, 1009 insertions(+), 45 deletions(-) create mode 100644 pymc3/distributions/bart.py create mode 100644 pymc3/distributions/tree.py create mode 100644 pymc3/step_methods/pgbart.py diff --git a/pymc3/distributions/__init__.py b/pymc3/distributions/__init__.py index d396d61dd6d..8efb515f212 100644 --- a/pymc3/distributions/__init__.py +++ b/pymc3/distributions/__init__.py @@ -98,8 +98,11 @@ from .timeseries import MvGaussianRandomWalk from .timeseries import MvStudentTRandomWalk +from .bart import BART + from .bound import Bound + __all__ = [ "Uniform", "Flat", @@ -175,4 +178,5 @@ "Moyal", "Simulator", "fast_sample_posterior_predictive", + "BART", ] diff --git a/pymc3/distributions/bart.py b/pymc3/distributions/bart.py new file mode 100644 index 00000000000..b48e3d5b165 --- /dev/null +++ b/pymc3/distributions/bart.py @@ -0,0 +1,263 @@ +import numpy as np +from .distribution import NoDistribution +from .tree import Tree, SplitNode, LeafNode +from pymc3.util import get_variable_name + +# from ..step_methods.bart.exceptions import ( +# BARTParamsError, +# ) + +# __all__ = ["BART"] + + +class BARTParamsError(Exception): + """Base (catch-all) BART hyper parameters exception.""" + + +class BaseBART(NoDistribution): + def __init__(self, X, Y, m=200, alpha=0.95, beta=2.0, cache_size=5000, *args, **kwargs): + + self.Y_shared = Y + self.X = X + self.Y = Y.eval() + super().__init__( + shape=X.shape[0], dtype="float64", testval=0, *args, **kwargs + ) # FIXME dtype and testvalue are nonsensical + + if not isinstance(self.X, np.ndarray) or self.X.dtype.type is not np.float64: + raise BARTParamsError( + "The design matrix X type must be numpy.ndarray where every item" + " type is numpy.float64" + ) + if self.X.ndim != 2: + raise BARTParamsError("The design matrix X must have two dimensions") + if not isinstance(self.Y, np.ndarray) or self.Y.dtype.type is not np.float64: + raise BARTParamsError( + "The response matrix Y type must be numpy.ndarray where every item" + " type is numpy.float64" + ) + if self.Y.ndim != 1: + raise BARTParamsError("The response matrix Y must have one dimension") + if self.X.shape[0] != self.Y.shape[0]: + raise BARTParamsError( + "The design matrix X and the response matrix Y must have the same number of elements" + ) + if not isinstance(m, int): + raise BARTParamsError("The number of trees m type must be int") + if m < 1: + raise BARTParamsError("The number of trees m must be greater than zero") + if not isinstance(alpha, float): + raise BARTParamsError( + "The type for the alpha parameter for the tree structure must be float" + ) + if alpha <= 0 or 1 <= alpha: + raise BARTParamsError( + "The value for the alpha parameter for the tree structure " + "must be in the interval (0, 1)" + ) + if not isinstance(beta, float): + raise BARTParamsError( + "The type for the beta parameter for the tree structure must be float" + ) + if beta < 0: + raise BARTParamsError( + "The value for the beta parameter for the tree structure " + 'must be in the interval [0, float("inf"))' + ) + + 
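+        # The (alpha, beta) pair encodes the tree-structure prior of Chipman et al.
+        # (2010), eq. 7: a node at depth d is a split node with probability
+        # alpha * (1 + d) ** -beta (see prior_log_probability_node in tree.py).
+        # A quick illustration with the defaults alpha=0.95, beta=2.0 (values are
+        # illustrative, not used by the code):
+        # >>> [round(0.95 * (1 + d) ** -2.0, 4) for d in range(3)]
+        # [0.95, 0.2375, 0.1056]
+        # so deeper nodes are increasingly likely to remain leaves.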
self.num_observations = X.shape[0]
+        self.number_variates = X.shape[1]
+        self.m = m
+        self.alpha = alpha
+        self.beta = beta
+        self._normal_dist_sampler = NormalDistributionSampler(cache_size)
+        self._disc_uniform_dist_sampler = DiscreteUniformDistributionSampler(cache_size)
+        self.trees = self.init_list_of_trees()
+
+    def init_list_of_trees(self):
+        initial_value_leaf_nodes = self.Y.mean() / self.m
+        initial_idx_data_points_leaf_nodes = np.array(range(self.num_observations), dtype="int32")
+        list_of_trees = []
+        for i in range(self.m):
+            new_tree = Tree.init_tree(
+                tree_id=i,
+                leaf_node_value=initial_value_leaf_nodes,
+                idx_data_points=initial_idx_data_points_leaf_nodes,
+            )
+            list_of_trees.append(new_tree)
+        # Diff trick to speed up the computation of residuals.
+        # Taken from Section 3.1 of Kapelner, A and Bleich, J. bartMachine: A Powerful Tool for Machine Learning in R. ArXiv e-prints, 2013
+        # The sum_trees_output will contain the sum of the predicted output for all trees.
+        # When R_j is needed we subtract the current predicted output for tree T_j.
+        self.sum_trees_output = np.ones_like(self.Y) * self.Y.mean()
+
+        return list_of_trees
+
+    def __iter__(self):
+        return iter(self.trees)
+
+    def _repr_latex_(self):
+        raise NotImplementedError
+
+    def prediction_untransformed(self, x):
+        sum_of_trees = 0.0
+        for t in self.trees:
+            sum_of_trees += t.out_of_sample_predict(x=x)
+        return sum_of_trees
+
+    def sample_dist_splitting_variable(self, value):
+        return self._disc_uniform_dist_sampler.sample(0, value)
+
+    def sample_dist_splitting_rule_assignment(self, value):
+        return self._disc_uniform_dist_sampler.sample(0, value)
+
+    def get_available_predictors(self, idx_data_points_split_node):
+        possible_splitting_variables = []
+        for j in range(self.number_variates):
+            x_j = self.X[idx_data_points_split_node, j]
+            x_j = x_j[~np.isnan(x_j)]
+            for i in range(1, len(x_j)):
+                if x_j[i - 1] != x_j[i]:
+                    possible_splitting_variables.append(j)
+                    break
+        return possible_splitting_variables
+
+    def get_available_splitting_rules(self, idx_data_points_split_node, idx_split_variable):
+        x_j = self.X[idx_data_points_split_node, idx_split_variable]
+        x_j = x_j[~np.isnan(x_j)]
+        values, indices = np.unique(x_j, return_index=True)
+        # The last value is not considered since, if we chose it as the value of
+        # the splitting rule assignment, it would leave the right subtree empty.
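+        # Worked example (values are illustrative): if x_j = [1., 1., 2., 3.],
+        # np.unique returns values [1., 2., 3.] and first-occurrence indices
+        # [0, 2, 3], so the candidate split values are [1., 2.] with indices [0, 2].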
+        return values[:-1], indices[:-1]
+
+    def grow_tree(self, tree, index_leaf_node):
+        # This can be unsuccessful when there are no available predictors
+        successful_grow_tree = False
+        current_node = tree.get_node(index_leaf_node)
+
+        available_predictors = self.get_available_predictors(current_node.idx_data_points)
+
+        if not available_predictors:
+            return successful_grow_tree
+
+        index_selected_predictor = self.sample_dist_splitting_variable(len(available_predictors))
+        selected_predictor = available_predictors[index_selected_predictor]
+
+        available_splitting_rules, _ = self.get_available_splitting_rules(
+            current_node.idx_data_points, selected_predictor
+        )
+        index_selected_splitting_rule = self.sample_dist_splitting_rule_assignment(
+            len(available_splitting_rules)
+        )
+        selected_splitting_rule = available_splitting_rules[index_selected_splitting_rule]
+
+        new_split_node = SplitNode(
+            index=index_leaf_node,
+            idx_split_variable=selected_predictor,
+            split_value=selected_splitting_rule,
+        )
+
+        left_node_idx_data_points, right_node_idx_data_points = self.get_new_idx_data_points(
+            new_split_node, current_node.idx_data_points
+        )
+
+        left_node_value = self.draw_leaf_value(tree, left_node_idx_data_points)
+        right_node_value = self.draw_leaf_value(tree, right_node_idx_data_points)
+
+        new_left_node = LeafNode(
+            index=current_node.get_idx_left_child(),
+            value=left_node_value,
+            idx_data_points=left_node_idx_data_points,
+        )
+        new_right_node = LeafNode(
+            index=current_node.get_idx_right_child(),
+            value=right_node_value,
+            idx_data_points=right_node_idx_data_points,
+        )
+        tree.grow_tree(index_leaf_node, new_split_node, new_left_node, new_right_node)
+        successful_grow_tree = True
+
+        return successful_grow_tree
+
+    def get_new_idx_data_points(self, current_split_node, idx_data_points):
+        idx_split_variable = current_split_node.idx_split_variable
+        split_value = current_split_node.split_value
+
+        left_idx = np.nonzero(self.X[idx_data_points, idx_split_variable] <= split_value)
+        left_node_idx_data_points = idx_data_points[left_idx]
+        right_idx = np.nonzero(~(self.X[idx_data_points, idx_split_variable] <= split_value))
+        right_node_idx_data_points = idx_data_points[right_idx]
+
+        return left_node_idx_data_points, right_node_idx_data_points
+
+    def get_residuals(self, tree):
+        R_j = self.Y - (self.sum_trees_output - tree.predict_output(self.num_observations))
+        return R_j
+
+    def draw_leaf_value(self, tree, idx_data_points):
+        raise NotImplementedError
+
+
+class DiscreteUniformDistributionSampler:
+    """
+    Draw samples from a discrete uniform distribution.
+    Samples are uniformly distributed over the half-open interval [low, high) (includes low, but excludes high).
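+    For example, sample(0, 5) returns an int in {0, 1, 2, 3, 4}, each with
+    probability 1/5.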
+ """ + + def __init__(self, cache_size=1000): + self._cache_size = cache_size + self._cache = [] + + def sample(self, lower_limit, upper_limit): + if len(self._cache) == 0: + self.refresh_cache() + return int(lower_limit + (upper_limit - lower_limit) * self._cache.pop()) + + def refresh_cache(self): + self._cache = list(np.random.random(size=self._cache_size)) + + +class NormalDistributionSampler: + def __init__(self, cache_size=1000): + self._cache_size = cache_size + self._cache = [] + + def sample(self): + if len(self._cache) == 0: + self.refresh_cache() + return self._cache.pop() + + def refresh_cache(self): + self._cache = list(np.random.normal(size=self._cache_size)) + + +class BART(BaseBART): + def __init__(self, X, Y, m=200, alpha=0.95, beta=2.0): + super().__init__(X, Y, m, alpha, beta) + + def _repr_latex_(self, name=None, dist=None): + if dist is None: + dist = self + X = (type(self.X),) + Y = (type(self.Y),) + m = (self.m,) + alpha = self.alpha + beta = self.beta + m = self.m + name = r"\text{%s}" % name + return r"""${} \sim \text{{BART}}(\mathit{{alpha}}={},~\mathit{{beta}}={},~\mathit{{m}}={})$""".format( + name, alpha, beta, m + ) + + def draw_leaf_value(self, tree, idx_data_points): + R_j = self.get_residuals(tree) + node_responses = R_j[idx_data_points] + + data_mean = ( + node_responses.mean() / self.m + ) # for skewed distribution use median or sample from data-points? + data_std_scaled = node_responses.std() / self.m + + draw = data_mean # (data_mean + self._normal_dist_sampler.sample()) * data_std_scaled + + return draw diff --git a/pymc3/distributions/tree.py b/pymc3/distributions/tree.py new file mode 100644 index 00000000000..8cc6d188227 --- /dev/null +++ b/pymc3/distributions/tree.py @@ -0,0 +1,508 @@ +import numpy as np +import math +from copy import deepcopy + + +class TreeStructureError(Exception): + """Base (catch-all) tree structure exception.""" + + +class TreeNodeError(Exception): + """Base (catch-all) tree node exception.""" + + +class Tree: + """ Full binary tree + A full binary tree is a tree where each node has exactly zero or two children. + This structure is used as the basic component of the Bayesian Additive Regression Tree (BART) + Attributes + ---------- + tree_structure : dict + A dictionary that represents the nodes stored in breadth-first order, based in the array method + for storing binary trees (https://en.wikipedia.org/wiki/Binary_tree#Arrays). + The dictionary's keys are integers that represent the nodes position. + The dictionary's values are objects of type SplitNode or LeafNode that represent the nodes of the tree itself. + num_nodes : int + Total number of nodes. + idx_leaf_nodes : list + List with the index of the leaf nodes of the tree. + idx_prunable_split_nodes : list + List with the index of the prunable splitting nodes of the tree. A splitting node is prunable if both + its children are leaf nodes. + tree_id : int + Identifier used to get the previous tree in the ParticleGibbs algorithm used in BART. 
+ + Parameters + ---------- + tree_id : int, optional + """ + + def __init__(self, tree_id=0): + self.tree_structure = {} + self.num_nodes = 0 + self.idx_leaf_nodes = [] + self.idx_prunable_split_nodes = [] + self.tree_id = tree_id + + def __getitem__(self, index): + return self.get_node(index) + + def __setitem__(self, index, node): + self.set_node(index, node) + + def __delitem__(self, index): + self.delete_node(index) + + def __iter__(self): + return iter(self.tree_structure.values()) + + def __eq__(self, other): + # The idx_leaf_nodes and idx_prunable_split_nodes are transformed to sets to correctly check for equivalence + # in case the values are not ordered in the same way. + return ( + self.tree_structure == other.tree_structure + and self.num_nodes == other.num_nodes + and set(self.idx_leaf_nodes) == set(other.idx_leaf_nodes) + and set(self.idx_prunable_split_nodes) == set(other.idx_prunable_split_nodes) + and self.tree_id == other.tree_id + ) + + def __hash__(self): + # Method added to create a set of trees. + return 0 + + def __len__(self): + return len(self.tree_structure) + + def __repr__(self): + return "Tree(num_nodes={})".format(self.num_nodes) + + def __str__(self): + lines = self._build_tree_string(index=0, show_index=False, delimiter="-")[0] + return "\n" + "\n".join((line.rstrip() for line in lines)) + + def _build_tree_string(self, index, show_index=False, delimiter="-"): + """Recursively walk down the binary tree and build a pretty-print string. + + In each recursive call, a "box" of characters visually representing the + current (sub)tree is constructed line by line. Each line is padded with + whitespaces to ensure all lines in the box have the same length. Then the + box, its width, and start-end positions of its root node value repr string + (required for drawing branches) are sent up to the parent call. The parent + call then combines its left and right sub-boxes to build a larger box etc. 
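+
+        For a leaf node the box is a single line holding the leaf value; for a
+        split node the first line holds the ``x[idx] <= value`` text and the
+        second line holds the ``/`` and ``\`` branches drawn above the two
+        sub-boxes.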
+ """ + if index not in self.tree_structure: + return [], 0, 0, 0 + + line1 = [] + line2 = [] + current_node = self.get_node(index) + if show_index: + node_repr = "{}{}{}".format(index, delimiter, str(current_node)) + else: + node_repr = str(current_node) + + new_root_width = gap_size = len(node_repr) + + left_child = current_node.get_idx_left_child() + right_child = current_node.get_idx_right_child() + + # Get the left and right sub-boxes, their widths, and root repr positions + l_box, l_box_width, l_root_start, l_root_end = self._build_tree_string( + left_child, show_index, delimiter + ) + r_box, r_box_width, r_root_start, r_root_end = self._build_tree_string( + right_child, show_index, delimiter + ) + + # Draw the branch connecting the current root node to the left sub-box + # Pad the line with whitespaces where necessary + if l_box_width > 0: + l_root = (l_root_start + l_root_end) // 2 + 1 + line1.append(" " * (l_root + 1)) + line1.append("_" * (l_box_width - l_root)) + line2.append(" " * l_root + "/") + line2.append(" " * (l_box_width - l_root)) + new_root_start = l_box_width + 1 + gap_size += 1 + else: + new_root_start = 0 + + # Draw the representation of the current root node + line1.append(node_repr) + line2.append(" " * new_root_width) + + # Draw the branch connecting the current root node to the right sub-box + # Pad the line with whitespaces where necessary + if r_box_width > 0: + r_root = (r_root_start + r_root_end) // 2 + line1.append("_" * r_root) + line1.append(" " * (r_box_width - r_root + 1)) + line2.append(" " * r_root + "\\") + line2.append(" " * (r_box_width - r_root)) + gap_size += 1 + new_root_end = new_root_start + new_root_width - 1 + + # Combine the left and right sub-boxes with the branches drawn above + gap = " " * gap_size + new_box = ["".join(line1), "".join(line2)] + for i in range(max(len(l_box), len(r_box))): + l_line = l_box[i] if i < len(l_box) else " " * l_box_width + r_line = r_box[i] if i < len(r_box) else " " * r_box_width + new_box.append(l_line + gap + r_line) + + # Return the new box, its width and its root repr positions + return new_box, len(new_box[0]), new_root_start, new_root_end + + def copy(self): + return deepcopy(self) + + def get_node(self, index): + if not isinstance(index, int) or index < 0: + raise TreeStructureError("Node index must be a non-negative int") + if index not in self.tree_structure: + raise TreeStructureError("Node missing at index {}".format(index)) + return self.tree_structure[index] + + def set_node(self, index, node): + if not isinstance(index, int) or index < 0: + raise TreeStructureError("Node index must be a non-negative int") + if not isinstance(node, SplitNode) and not isinstance(node, LeafNode): + raise TreeStructureError("Node class must be SplitNode or LeafNode") + if index in self.tree_structure: + raise TreeStructureError("Node index already exist in tree") + if self.num_nodes == 0 and index != 0: + raise TreeStructureError("Root node must have index zero") + parent_index = node.get_idx_parent_node() + if self.num_nodes != 0 and parent_index not in self.tree_structure: + raise TreeStructureError("Invalid index, node must have a parent node") + if self.num_nodes != 0 and not isinstance(self.get_node(parent_index), SplitNode): + raise TreeStructureError("Parent node must be of class SplitNode") + if index != node.index: + raise TreeStructureError("Node must have same index as tree index") + self.tree_structure[index] = node + self.num_nodes += 1 + if isinstance(node, LeafNode): + 
self.idx_leaf_nodes.append(index) + + def delete_node(self, index): + if not isinstance(index, int) or index < 0: + raise TreeStructureError("Node index must be a non-negative int") + if index not in self.tree_structure: + raise TreeStructureError("Node missing at index {}".format(index)) + current_node = self.get_node(index) + left_child_idx = current_node.get_idx_left_child() + right_child_idx = current_node.get_idx_right_child() + if left_child_idx in self.tree_structure or right_child_idx in self.tree_structure: + raise TreeStructureError("Invalid removal of node, leaving two orphans nodes") + if isinstance(current_node, LeafNode): + self.idx_leaf_nodes.remove(index) + del self.tree_structure[index] + self.num_nodes -= 1 + + def make_digraph(self, name="Tree"): + """Make graphviz Digraph of the tree. + + Parameters + ---------- + name : str + Name used for the Digraph. Useful to differentiate digraph of trees. + + Returns + ------- + graphviz.Digraph + """ + try: + import graphviz + except ImportError: + raise ImportError( + "This function requires the python library graphviz, along with binaries. " + "The easiest way to install all of this is by running\n\n" + "\tconda install -c conda-forge python-graphviz" + ) + graph = graphviz.Digraph(name) + graph = self._digraph_tree_traversal(0, graph) + return graph + + def _digraph_tree_traversal(self, index, graph): + if index not in self.tree_structure.keys(): + return graph + current_node = self.get_node(index) + style = "" + if isinstance(current_node, SplitNode): + shape = "box" + if index in self.idx_prunable_split_nodes: + style = "filled" + else: + shape = "ellipse" + node_name = "{}_{}".format(graph.name, index) + graph.node(name=node_name, label=str(current_node), shape=shape, style=style) + + parent_index = current_node.get_idx_parent_node() + if parent_index in self.tree_structure: + tail_name = "{}_{}".format(graph.name, parent_index) + head_name = "{}_{}".format(graph.name, index) + if current_node.is_left_child(): + graph.edge(tail_name=tail_name, head_name=head_name, label="T") + else: + graph.edge(tail_name=tail_name, head_name=head_name, label="F") + + left_child = current_node.get_idx_left_child() + right_child = current_node.get_idx_right_child() + graph = self._digraph_tree_traversal(left_child, graph) + graph = self._digraph_tree_traversal(right_child, graph) + + return graph + + def predict_output(self, num_observations): + output = np.zeros(num_observations) + for node_index in self.idx_leaf_nodes: + current_node = self.get_node(node_index) + output[current_node.idx_data_points] = current_node.value + return output + + def out_of_sample_predict(self, x): + """ + Predict output of tree for an unobserved point. + + Parameters + ---------- + x : np.ndarray + + Returns + ------- + float + Value of the leaf value where the unobserved point lies. + """ + leaf_node = self._traverse_tree(x=x, node_index=0) + return leaf_node.value + + def _traverse_tree(self, x, node_index=0): + """ + Traverse the tree starting from a particular node given an unobserved point. 
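+        At each SplitNode the rule ``x[idx_split_variable] <= split_value`` is
+        evaluated: the traversal descends to the left child when the rule holds
+        and to the right child otherwise, until a LeafNode is reached.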
+ + Parameters + ---------- + x : np.ndarray + node_index : int + + Returns + ------- + LeafNode + """ + current_node = self.get_node(node_index) + if isinstance(current_node, SplitNode): + if current_node.evaluate_splitting_rule(x): + left_child = current_node.get_idx_left_child() + final_node = self._traverse_tree(x, left_child) + else: + right_child = current_node.get_idx_right_child() + final_node = self._traverse_tree(x, right_child) + else: + final_node = current_node + return final_node + + def grow_tree(self, index_leaf_node, new_split_node, new_left_node, new_right_node): + """ + Grow the tree from a particular node. + + Parameters + ---------- + index_leaf_node : int + new_split_node : SplitNode + new_left_node : LeafNode + new_right_node : LeafNode + """ + current_node = self.get_node(index_leaf_node) + if not isinstance(current_node, LeafNode): + raise TreeStructureError("The tree grows from the leaves") + if not isinstance(new_split_node, SplitNode): + raise TreeStructureError("The node that replaces the leaf node must be SplitNode") + if not isinstance(new_left_node, LeafNode) or not isinstance(new_right_node, LeafNode): + raise TreeStructureError("The new leaves must be LeafNode") + + self.delete_node(index_leaf_node) + self.set_node(index_leaf_node, new_split_node) + self.set_node(new_left_node.index, new_left_node) + self.set_node(new_right_node.index, new_right_node) + + # The new SplitNode is a prunable node since it has both children. + self.idx_prunable_split_nodes.append(index_leaf_node) + # If the parent of the node from which the tree is growing was a prunable node, + # remove from the list since one of its children is a SplitNode now + parent_index = current_node.get_idx_parent_node() + if parent_index in self.idx_prunable_split_nodes: + self.idx_prunable_split_nodes.remove(parent_index) + + @staticmethod + def init_tree(tree_id, leaf_node_value, idx_data_points): + """ + + Parameters + ---------- + tree_id + leaf_node_value + idx_data_points + + Returns + ------- + + """ + new_tree = Tree(tree_id) + new_tree[0] = LeafNode(index=0, value=leaf_node_value, idx_data_points=idx_data_points) + return new_tree + + def get_tree_depth(self): + """ + + Returns + ------- + + """ + max_depth = 0 + for idx_leaf_node in self.idx_leaf_nodes: + leaf_node = self.get_node(idx_leaf_node) + if max_depth < leaf_node.depth: + max_depth = leaf_node.depth + return max_depth + + +class BaseNode: + def __init__(self, index): + if not isinstance(index, int) or index < 0: + raise TreeNodeError("Node index must be a non-negative int") + self.index = index + self.depth = int(math.floor(math.log(index + 1, 2))) + + def __eq__(self, other): + return self.index == other.index and self.depth == other.depth + + def get_idx_parent_node(self): + return (self.index - 1) // 2 + + def get_idx_left_child(self): + return self.index * 2 + 1 + + def get_idx_right_child(self): + return self.get_idx_left_child() + 1 + + def is_left_child(self): + return bool(self.index % 2) + + def get_idx_sibling(self): + return (self.index + 1) if self.is_left_child() else (self.index - 1) + + +class SplitNode(BaseNode): + def __init__(self, index, idx_split_variable, split_value): + super().__init__(index) + + if not isinstance(idx_split_variable, int) or idx_split_variable < 0: + raise TreeNodeError("Index of split variable must be a non-negative int") + if not isinstance(split_value, float): + raise TreeNodeError("Node split value type must be float") + + self.idx_split_variable = idx_split_variable + self.split_value = 
split_value + + def __repr__(self): + return "SplitNode(index={}, idx_split_variable={}, split_value={})".format( + self.index, self.idx_split_variable, self.split_value + ) + + def __str__(self): + return "x[{}] <= {}".format(self.idx_split_variable, self.split_value) + + def __eq__(self, other): + if isinstance(other, SplitNode): + return ( + super().__eq__(other) + and self.idx_split_variable == other.idx_split_variable + and self.split_value == other.split_value + ) + else: + return NotImplemented + + def evaluate_splitting_rule(self, x): + if x is np.NaN: + return False + else: + return x[self.idx_split_variable] <= self.split_value + + def prior_log_probability_node(self, alpha, beta): + """ + Calculate the log probability of the node being a SplitNode. + Taken from equation 7 in [Chipman2010]. + + Parameters + ---------- + alpha : float + beta : float + + Returns + ------- + float + + References + ---------- + .. [Chipman2010] Chipman, H. A., George, E. I., & McCulloch, R. E. (2010). BART: Bayesian + additive regression trees. The Annals of Applied Statistics, 4(1), 266-298., + `link `__ + """ + return np.log(alpha * np.power(1.0 + self.depth, -beta)) + + +class LeafNode(BaseNode): + def __init__(self, index, value, idx_data_points): + super().__init__(index) + if not isinstance(value, float): + raise TreeNodeError("Leaf node value type must be float") + if ( + not isinstance(idx_data_points, np.ndarray) + or idx_data_points.dtype.type is not np.int32 + ): + raise TreeNodeError("Index of data points must be a numpy.ndarray of np.int32") + if len(idx_data_points) == 0: + raise TreeNodeError("Index of data points can not be empty") + self.value = value + self.idx_data_points = idx_data_points + + def __repr__(self): + return "LeafNode(index={}, value={}, len(idx_data_points)={})".format( + self.index, self.value, len(self.idx_data_points) + ) + + def __str__(self): + return "{}".format(self.value) + + def __eq__(self, other): + if isinstance(other, LeafNode): + return ( + super().__eq__(other) + and self.value == other.value + and np.array_equal(self.idx_data_points, other.idx_data_points) + ) + else: + return NotImplemented + + def prior_log_probability_node(self, alpha, beta): + """ + Calculate the log probability of the node being a LeafNode. + Taken from equation 7 in [Chipman2010]. + + Parameters + ---------- + alpha : float + beta : float + + Returns + ------- + float + + References + ---------- + .. [Chipman2010] Chipman, H. A., George, E. I., & McCulloch, R. E. (2010). BART: Bayesian + additive regression trees. 
The Annals of Applied Statistics, 4(1), 266-298., + `link `__ + """ + return np.log(1.0 - alpha * np.power(1.0 + self.depth, -beta)) diff --git a/pymc3/model.py b/pymc3/model.py index ca18a3278d0..0b9f2e810b5 100644 --- a/pymc3/model.py +++ b/pymc3/model.py @@ -1144,11 +1144,7 @@ def Var(self, name, dist, data=None, total_size=None, dims=None): elif isinstance(data, dict): with self: var = MultiObservedRV( - name=name, - data=data, - distribution=dist, - total_size=total_size, - model=self, + name=name, data=data, distribution=dist, total_size=total_size, model=self, ) self.observed_RVs.append(var) if var.missing_values: @@ -1159,11 +1155,7 @@ def Var(self, name, dist, data=None, total_size=None, dims=None): else: with self: var = ObservedRV( - name=name, - data=data, - distribution=dist, - total_size=total_size, - model=self, + name=name, data=data, distribution=dist, total_size=total_size, model=self, ) self.observed_RVs.append(var) if var.missing_values: @@ -1662,10 +1654,7 @@ def __init__( self.scaling = _get_scaling(total_size, self.shape, self.ndim) incorporate_methods( - source=distribution, - destination=self, - methods=["random"], - wrapper=InstanceMethod, + source=distribution, destination=self, methods=["random"], wrapper=InstanceMethod, ) @property @@ -1721,10 +1710,7 @@ def as_tensor(data, name, model, distribution): testval = np.broadcast_to(distribution.default(), data.shape)[data.mask] fakedist = NoDistribution.dist( - shape=data.mask.sum(), - dtype=dtype, - testval=testval, - parent_dist=distribution, + shape=data.mask.sum(), dtype=dtype, testval=testval, parent_dist=distribution, ) missing_values = FreeRV(name=name + "_missing", distribution=fakedist, model=model) constant = tt.as_tensor_variable(data.filled()) @@ -1977,10 +1963,7 @@ def __init__( self.tag.test_value = normalRV.tag.test_value self.scaling = _get_scaling(total_size, self.shape, self.ndim) incorporate_methods( - source=distribution, - destination=self, - methods=["random"], - wrapper=InstanceMethod, + source=distribution, destination=self, methods=["random"], wrapper=InstanceMethod, ) @property @@ -1997,10 +1980,12 @@ def as_iterargs(data): def all_continuous(vars): - """Check that vars not include discrete variables, excepting - ObservedRVs.""" + """Check that vars not include discrete variables or BART variables, excepting ObservedRVs.""" + vars_ = [var for var in vars if not isinstance(var, pm.model.ObservedRV)] - if any([var.dtype in pm.discrete_types for var in vars_]): + if any( + [(var.dtype in pm.discrete_types or isinstance(var.distribution, pm.BART)) for var in vars_] + ): return False else: return True diff --git a/pymc3/sampling.py b/pymc3/sampling.py index f577a5aa8e5..65126c28911 100644 --- a/pymc3/sampling.py +++ b/pymc3/sampling.py @@ -51,6 +51,7 @@ Slice, CompoundStep, arraystep, + PGBART, ) from .util import ( update_start_vals, @@ -90,6 +91,7 @@ BinaryGibbsMetropolis, Slice, CategoricalGibbsMetropolis, + PGBART, ) ArrayLike = Union[np.ndarray, List[float]] @@ -657,14 +659,7 @@ def _check_start_shape(model, start): def _sample_many( - draws, - chain: int, - chains: int, - start: list, - random_seed: list, - step, - callback=None, - **kwargs, + draws, chain: int, chains: int, start: list, random_seed: list, step, callback=None, **kwargs, ): """Samples all chains sequentially. 
@@ -1001,8 +996,7 @@ def _iter_sample( if callback is not None: warns = getattr(step, "warnings", None) callback( - trace=strace, - draw=Draw(chain, i == draws, i, i < tune, stats, point, warns), + trace=strace, draw=Draw(chain, i == draws, i, i < tune, stats, point, warns), ) yield strace, diverging @@ -1955,13 +1949,7 @@ def sample_prior_predictive( def init_nuts( - init="auto", - chains=1, - n_init=500000, - model=None, - random_seed=None, - progressbar=True, - **kwargs, + init="auto", chains=1, n_init=500000, model=None, random_seed=None, progressbar=True, **kwargs, ): """Set up the mass matrix initialization for NUTS. diff --git a/pymc3/step_methods/__init__.py b/pymc3/step_methods/__init__.py index 5755e99eb51..f2cd57e60c9 100644 --- a/pymc3/step_methods/__init__.py +++ b/pymc3/step_methods/__init__.py @@ -33,3 +33,5 @@ from .slicer import Slice from .elliptical_slice import EllipticalSlice + +from .pgbart import PGBART diff --git a/pymc3/step_methods/hmc/nuts.py b/pymc3/step_methods/hmc/nuts.py index 782a350654c..5d3bcc52870 100644 --- a/pymc3/step_methods/hmc/nuts.py +++ b/pymc3/step_methods/hmc/nuts.py @@ -23,9 +23,9 @@ from pymc3.math import logbern, logdiffexp_numpy from pymc3.theanof import floatX from pymc3.vartypes import continuous_types +from ...distributions import BART - -__all__ = ["NUTS"] +__all__ = ['NUTS'] class NUTS(BaseHMC): @@ -196,7 +196,7 @@ def _hamiltonian_step(self, start, p0, step_size): @staticmethod def competence(var, has_grad): """Check how appropriate this class is for sampling a random variable.""" - if var.dtype in continuous_types and has_grad: + if var.dtype in continuous_types and has_grad and not isinstance(var.distribution, BART): return Competence.IDEAL return Competence.INCOMPATIBLE diff --git a/pymc3/step_methods/pgbart.py b/pymc3/step_methods/pgbart.py new file mode 100644 index 00000000000..f72aedb5160 --- /dev/null +++ b/pymc3/step_methods/pgbart.py @@ -0,0 +1,214 @@ +import numpy as np + +from .arraystep import ArrayStepShared, Competence +from ..distributions import BART +from ..distributions.tree import Tree +from ..model import modelcontext +from ..theanof import inputvars, make_shared_replacements, floatX +from ..smc.smc_utils import logp_forw + + +class PGBART(ArrayStepShared): + """ + """ + + name = "bartsampler" + default_blocked = False + + def __init__(self, vars=None, m=2, num_particles=10, max_stages=5000, model=None): + model = modelcontext(model) + vars = inputvars(vars) + self.bart = vars[0].distribution + + self.num_particles = num_particles + self.max_stages = max_stages + self.previous_trees_particles_list = [] + for i in range(self.bart.m): + p = Particle(self.bart.trees[i]) + self.previous_trees_particles_list.append(p) + + shared = make_shared_replacements(vars, model) + self.likelihood_logp = logp_forw([model.datalogpt], vars, shared) + super().__init__(vars, shared) + + def astep(self, q_0): + # Step 4 of algorithm + bart = self.bart + num_observations = bart.num_observations + likelihood_logp = self.likelihood_logp + output = np.zeros((bart.m, num_observations)) + log_num_particles = np.log(self.num_particles) + for idx, tree in enumerate(bart.trees): + R_j = bart.get_residuals(tree) + bart.Y_shared.set_value(R_j) + list_of_particles = self.init_particles(tree.tree_id, R_j, bart.num_observations) + # Step 5 of algorithm + old_likelihoods = np.array( + [ + likelihood_logp(p.tree.predict_output(num_observations)) + for p in list_of_particles + ] + ) + log_weights = np.array(old_likelihoods) + + list_of_particles[0] = 
self.get_previous_tree_particle(tree.tree_id, 0)
+            log_weights[0] = likelihood_logp(
+                list_of_particles[0].tree.predict_output(num_observations)
+            )
+            old_likelihoods[0] = log_weights[0]
+            log_weights -= log_num_particles
+
+            for p_idx, p in enumerate(list_of_particles):
+                p.weight = log_weights[p_idx]
+
+            for t in range(1, self.max_stages + 2):
+                # Step 7 of algorithm
+                list_of_particles[0] = self.get_previous_tree_particle(tree.tree_id, t)
+                for c in range(
+                    1, self.num_particles
+                ):  # This should be embarrassingly parallelizable
+                    # Step 9 of algorithm
+                    list_of_particles[c].sample_tree_sequential(bart)
+                # for c in range(self.num_particles):
+                #     list_of_particles[c].update_weight()
+
+                # Step 12 of algorithm
+                for p_idx, p in enumerate(list_of_particles):
+                    new_likelihood = likelihood_logp(p.tree.predict_output(num_observations))
+                    # p.weight is actually log_weight
+                    # print(new_likelihood - old_likelihoods[p_idx])
+                    p.weight += new_likelihood - old_likelihoods[p_idx]
+                    old_likelihoods[p_idx] = new_likelihood
+                    log_weights[p_idx] = p.weight
+
+                W, normalized_weights = self._normalize(log_weights)
+
+                # Step 15 of algorithm
+                # list_of_particles = self.resample(list_of_particles, normalized_weights)
+                # resample all but first particle
+                re_n_w = normalized_weights[1:] / normalized_weights[1:].sum()
+                indices = range(1, len(list_of_particles))
+                new_indices = np.random.choice(indices, size=len(list_of_particles) - 1, p=re_n_w)
+                list_of_particles[1:] = np.array(list_of_particles)[new_indices]
+                old_likelihoods[1:] = old_likelihoods[new_indices]
+
+                # Step 16 of algorithm
+                w_t = W - log_num_particles
+                for c in range(self.num_particles):
+                    list_of_particles[c].weight = w_t
+
+                # ### XXX Review this!!!!
+                # # Step 17 of algorithm
+                # non_available_nodes_for_expansion = np.ones(self.num_particles)
+                # for c in range(self.num_particles):
+                #     if len(list_of_particles[c].expansion_nodes) != 0:
+                #         non_available_nodes_for_expansion[c] = 0
+                # if np.all(non_available_nodes_for_expansion):
+                #     break
+                non_available_nodes_for_expansion = True
+                for c in range(self.num_particles):
+                    if len(list_of_particles[c].expansion_nodes) != 0:
+                        non_available_nodes_for_expansion = False
+                        break
+                if non_available_nodes_for_expansion:
+                    break
+
+            # print(idx, t, normalized_weights)
+            new_tree = np.random.choice(list_of_particles, p=normalized_weights)
+            self.previous_trees_particles_list[tree.tree_id] = new_tree
+            bart.trees[idx] = new_tree.tree
+            new_prediction = new_tree.tree.predict_output(num_observations)
+            output[idx] = new_prediction
+            bart.sum_trees_output = bart.Y - R_j + new_prediction
+
+        return np.sum(output, axis=0)
+
+    @staticmethod
+    def competence(var, has_grad):
+        """
+        PGBART is only suitable for BART distributions
+        """
+        if isinstance(var.distribution, BART):
+            return Competence.IDEAL
+        return Competence.INCOMPATIBLE
+
+    def _normalize(self, log_w):  # this function needs a home sweet home
+        """
+        use logsumexp trick to get W and softmax to get normalized_weights
+        """
+        log_w_max = log_w.max()
+        log_w_ = log_w - log_w_max
+        w_ = np.exp(log_w_)
+        w_sum = w_.sum()
+        W = log_w_max + np.log(w_sum)
+        normalized_weights = w_ / w_sum
+        # stabilize weights to avoid assigning exactly zero probability to a particle
+        normalized_weights += 1e-12
+
+        return W, normalized_weights
+
+    def get_previous_tree_particle(self, tree_id, t):
+        previous_tree_particle = self.previous_trees_particles_list[tree_id]
+        previous_tree_particle.set_particle_to_step(t)
+        return previous_tree_particle
+
+    def init_particles(self, tree_id, R_j, num_observations):
+        list_of_particles = []
+        initial_value_leaf_nodes = R_j.mean()
+        initial_idx_data_points_leaf_nodes = np.array(range(num_observations), dtype="int32")
+        new_tree = Tree.init_tree(
+            tree_id=tree_id,
+            leaf_node_value=initial_value_leaf_nodes,
+            idx_data_points=initial_idx_data_points_leaf_nodes,
+        )
+        for _ in range(self.num_particles):
+            new_particle = Particle(new_tree)
+            list_of_particles.append(new_particle)
+        return list_of_particles
+
+    def resample(self, list_of_particles, normalized_weights):
+        list_of_particles = np.random.choice(
+            list_of_particles, size=len(list_of_particles), p=normalized_weights
+        )
+        return list_of_particles
+
+
+class Particle:
+    def __init__(self, tree):
+        self.tree = tree.copy()  # Holds the tree we are currently interested in
+        self.expansion_nodes = self.tree.idx_leaf_nodes.copy()  # This should be the array [0]
+        self.tree_history = [self.tree.copy()]
+        self.expansion_nodes_history = [self.expansion_nodes.copy()]
+        self.weight = 0.0
+
+    def sample_tree_sequential(self, bart):
+        if self.expansion_nodes:
+            index_leaf_node = self.expansion_nodes.pop(0)
+            # Probability that this node will remain a leaf node
+            log_prob = self.tree[index_leaf_node].prior_log_probability_node(bart.alpha, bart.beta)
+
+            if np.exp(log_prob) < np.random.random():
+                self.grow_successful = bart.grow_tree(self.tree, index_leaf_node)
+                # TODO: in case the grow_tree fails, should we try to sample the tree from another leaf node?
+                if self.grow_successful:
+                    # Add new leaf nodes indexes
+                    new_indexes = self.tree.idx_leaf_nodes[-2:]
+                    self.expansion_nodes.extend(new_indexes)
+            self.tree_history.append(self.tree.copy())
+            self.expansion_nodes_history.append(self.expansion_nodes.copy())
+
+    def set_particle_to_step(self, t):
+        if len(self.tree_history) <= t:
+            self.tree = self.tree_history[-1]
+            self.expansion_nodes = self.expansion_nodes_history[-1]
+        else:
+            self.tree = self.tree_history[t]
+            self.expansion_nodes = self.expansion_nodes_history[t]
+
+    def init_weight(self):
+        # TODO
+        return 1.0
+
+    def update_weight(self):
+        # TODO
+        pass

From 9d4f73fc4aa9acd67896897032700f97bc4f8348 Mon Sep 17 00:00:00 2001
From: aloctavodia
Date: Sat, 25 Jan 2020 09:46:09 -0300
Subject: [PATCH 02/24] black

---
 pymc3/step_methods/hmc/nuts.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pymc3/step_methods/hmc/nuts.py b/pymc3/step_methods/hmc/nuts.py
index 5d3bcc52870..5810470f300 100644
--- a/pymc3/step_methods/hmc/nuts.py
+++ b/pymc3/step_methods/hmc/nuts.py
@@ -196,7 +196,7 @@ def _hamiltonian_step(self, start, p0, step_size):
     @staticmethod
     def competence(var, has_grad):
         """Check how appropriate this class is for sampling a random variable."""
-        if var.dtype in continuous_types and has_grad  and not isinstance(var.distribution, BART):
+        if var.dtype in continuous_types and has_grad and not isinstance(var.distribution, BART):
             return Competence.IDEAL
         return Competence.INCOMPATIBLE

From beaf184d5663d02a1d6a312d2b32725392cb42c9 Mon Sep 17 00:00:00 2001
From: aloctavodia
Date: Mon, 27 Jan 2020 13:15:24 -0300
Subject: [PATCH 03/24] minor fix

---
 pymc3/distributions/bart.py  | 6 ++----
 pymc3/smc/smc.py             | 1 +
 pymc3/step_methods/pgbart.py | 2 +-
 3 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/pymc3/distributions/bart.py b/pymc3/distributions/bart.py
index b48e3d5b165..08f61bf1d3f 100644
--- a/pymc3/distributions/bart.py
+++ b/pymc3/distributions/bart.py
@@ -253,11 +253,9 @@ def draw_leaf_value(self, tree, idx_data_points):
         R_j = self.get_residuals(tree)
         node_responses = R_j[idx_data_points]
 
-        data_mean = (
-            node_responses.mean() / self.m
-        )  # for skewed distribution use median or sample from data-points?
+        data_mean = node_responses.mean()  # for skewed distribution use median or sample from data-points?
         data_std_scaled = node_responses.std() / self.m
 
-        draw = data_mean  # (data_mean + self._normal_dist_sampler.sample()) * data_std_scaled
+        draw = (data_mean + self._normal_dist_sampler.sample() * data_std_scaled)
 
         return draw
diff --git a/pymc3/smc/smc.py b/pymc3/smc/smc.py
index d6069122952..69b94054da0 100644
--- a/pymc3/smc/smc.py
+++ b/pymc3/smc/smc.py
@@ -392,3 +392,4 @@ def __call__(self, posterior):
         if self.save_log_pseudolikelihood:
             self.save_lpl(elemwise)
         return elemwise.sum()
+
diff --git a/pymc3/step_methods/pgbart.py b/pymc3/step_methods/pgbart.py
index f72aedb5160..30d422003c9 100644
--- a/pymc3/step_methods/pgbart.py
+++ b/pymc3/step_methods/pgbart.py
@@ -5,7 +5,7 @@
 from ..distributions.tree import Tree
 from ..model import modelcontext
 from ..theanof import inputvars, make_shared_replacements, floatX
-from ..smc.smc_utils import logp_forw
+from ..smc.smc import logp_forw
 
 
 class PGBART(ArrayStepShared):

From f11a57f6e4aabe131c52f142af57c55089666300 Mon Sep 17 00:00:00 2001
From: aloctavodia
Date: Thu, 30 Jan 2020 13:08:12 -0300
Subject: [PATCH 04/24] clean code

---
 pymc3/step_methods/pgbart.py | 62 ++++++++++++++++++++----------------
 1 file changed, 35 insertions(+), 27 deletions(-)

diff --git a/pymc3/step_methods/pgbart.py b/pymc3/step_methods/pgbart.py
index 30d422003c9..2f60325c5f5 100644
--- a/pymc3/step_methods/pgbart.py
+++ b/pymc3/step_methods/pgbart.py
@@ -10,12 +10,26 @@
 
 class PGBART(ArrayStepShared):
     """
+    Particle Gibbs sampling step for Bayesian Additive Regression Trees (BART).
+
+    References
+    ----------
+    .. [Lakshminarayanan2015] Lakshminarayanan, B. and Roy, D.M. and Teh, Y. W., (2015),
+        Particle Gibbs for Bayesian Additive Regression Trees.
+        arXiv,
+        `link `__
+
+    Parameters
+    ----------
+    vars : list
+        List of variables for the sampler; it should contain the model's BART variable.
+    num_particles : int
+        Number of particles for the conditional SMC sampler. Defaults to 10.
+    max_stages : int
+        Maximum number of stages (tree-growing iterations) of the conditional SMC sampler.
+        Defaults to 5000.
+    model : PyMC3 Model
+        Optional model for the sampling step. Defaults to None (taken from the context).
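+
+    Example
+    -------
+    A minimal sketch. ``X`` is an (n, p) float64 design matrix and ``Y`` a
+    shared variable holding the float64 response (the sampler updates it in
+    place); the step is assigned automatically by ``pm.sample`` through the
+    competence mechanism:
+
+    >>> with pm.Model() as model:
+    ...     mu = pm.BART("mu", X, Y, m=20)
+    ...     sigma = pm.HalfNormal("sigma", 1.0)
+    ...     obs = pm.Normal("obs", mu, sigma, observed=Y.eval())
+    ...     trace = pm.sample()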
""" - name = "bartsampler" default_blocked = False - def __init__(self, vars=None, m=2, num_particles=10, max_stages=5000, model=None): + def __init__(self, vars=None, num_particles=10, max_stages=5000, model=None): model = modelcontext(model) vars = inputvars(vars) self.bart = vars[0].distribution @@ -59,32 +73,26 @@ def astep(self, q_0): log_weights -= log_num_particles for p_idx, p in enumerate(list_of_particles): - p.weight = log_weights[p_idx] + p.log_weight = log_weights[p_idx] for t in range(1, self.max_stages + 2): # Step 7 of algorithm list_of_particles[0] = self.get_previous_tree_particle(tree.tree_id, t) - for c in range( - 1, self.num_particles - ): # This should be embarrassingly parallelizable + # This should be embarrassingly parallelizable + for c in range(1, self.num_particles): # Step 9 of algorithm list_of_particles[c].sample_tree_sequential(bart) - # for c in range(self.num_particles): - # list_of_particles[c].update_weight() # Step 12 of algorithm for p_idx, p in enumerate(list_of_particles): new_likelihood = likelihood_logp(p.tree.predict_output(num_observations)) - # p.weight is actually log_weight - # print(new_likelihood - old_likelihoods[p_idx]) - p.weight += new_likelihood - old_likelihoods[p_idx] + p.log_weight += new_likelihood - old_likelihoods[p_idx] old_likelihoods[p_idx] = new_likelihood - log_weights[p_idx] = p.weight + log_weights[p_idx] = p.log_weight W, normalized_weights = self._normalize(log_weights) # Step 15 of algorithm - # list_of_particles = self.resample(list_of_particles, normalized_weights) # resample all but first particle re_n_w = normalized_weights[1:] / normalized_weights[1:].sum() indices = range(1, len(list_of_particles)) @@ -95,16 +103,17 @@ def astep(self, q_0): # Step 16 of algorithm w_t = W - log_num_particles for c in range(self.num_particles): - list_of_particles[c].weight = w_t - - # ### XXX Revisar esto!!!! - # # Step 17 of algorithm - # non_available_nodes_for_expansion = np.ones(self.num_particles) - # for c in range(self.num_particles): - # if len(list_of_particles[c].expansion_nodes) != 0: - # non_available_nodes_for_expansion[c] = 0 - # if np.all(non_available_nodes_for_expansion): - # break + list_of_particles[c].log_weight = w_t + + ### Should we used this one!!! + # Step 17 of algorithm + # non_available_nodes_for_expansion = np.ones(self.num_particles) + # for c in range(self.num_particles): + # if len(list_of_particles[c].expansion_nodes) != 0: + # non_available_nodes_for_expansion[c] = 0 + # if np.all(non_available_nodes_for_expansion): + # break + ### Or this one !!! 
                 non_available_nodes_for_expansion = True
                 for c in range(self.num_particles):
                     if len(list_of_particles[c].expansion_nodes) != 0:
                         non_available_nodes_for_expansion = False
                         break
                 if non_available_nodes_for_expansion:
                     break
 
-            # print(idx, t, normalized_weights)
             new_tree = np.random.choice(list_of_particles, p=normalized_weights)
             self.previous_trees_particles_list[tree.tree_id] = new_tree
             bart.trees[idx] = new_tree.tree
@@ -134,7 +142,7 @@ def competence(var, has_grad):
 
     def _normalize(self, log_w):  # this function needs a home sweet home
         """
-        use logsumexp trick to get W and softmax to get normalized_weights
+        Use logsumexp trick to get W and softmax to get normalized_weights
         """
         log_w_max = log_w.max()
         log_w_ = log_w - log_w_max
         w_ = np.exp(log_w_)
@@ -155,7 +163,7 @@ def get_previous_tree_particle(self, tree_id, t):
     def init_particles(self, tree_id, R_j, num_observations):
         list_of_particles = []
         initial_value_leaf_nodes = R_j.mean()
-        initial_idx_data_points_leaf_nodes = np.array(range(num_observations), dtype="int32")
+        initial_idx_data_points_leaf_nodes = np.arange(num_observations, dtype="int32")
         new_tree = Tree.init_tree(
             tree_id=tree_id,
             leaf_node_value=initial_value_leaf_nodes,
@@ -179,7 +187,7 @@ def __init__(self, tree):
         self.tree = tree.copy()  # Holds the tree we are currently interested in
         self.expansion_nodes = self.tree.idx_leaf_nodes.copy()  # This should be the array [0]
         self.tree_history = [self.tree.copy()]
         self.expansion_nodes_history = [self.expansion_nodes.copy()]
-        self.weight = 0.0
+        self.log_weight = 0.0
 
     def sample_tree_sequential(self, bart):
         if self.expansion_nodes:
             index_leaf_node = self.expansion_nodes.pop(0)

From 43fed87f38710ff6a819912a9814314b5aeab7d6 Mon Sep 17 00:00:00 2001
From: aloctavodia
Date: Fri, 9 Oct 2020 08:39:47 -0300
Subject: [PATCH 05/24] blackify

---
 pymc3/distributions/bart.py    |  6 ++++--
 pymc3/distributions/tree.py    |  2 +-
 pymc3/model.py                 | 27 ++++++++++++++++++++++-----
 pymc3/sampling.py              | 20 +++++++++++++++++---
 pymc3/step_methods/hmc/nuts.py |  2 +-
 pymc3/step_methods/pgbart.py   |  1 +
 6 files changed, 46 insertions(+), 12 deletions(-)

diff --git a/pymc3/distributions/bart.py b/pymc3/distributions/bart.py
index 08f61bf1d3f..23f1a60997a 100644
--- a/pymc3/distributions/bart.py
+++ b/pymc3/distributions/bart.py
@@ -253,9 +253,11 @@ def draw_leaf_value(self, tree, idx_data_points):
         R_j = self.get_residuals(tree)
         node_responses = R_j[idx_data_points]
 
-        data_mean = node_responses.mean()  # for skewed distribution use median or sample from data-points?
+        data_mean = (
+            node_responses.mean()
+        )  # for skewed distribution use median or sample from data-points?
         data_std_scaled = node_responses.std() / self.m
 
-        draw = (data_mean + self._normal_dist_sampler.sample() * data_std_scaled)
+        draw = data_mean + self._normal_dist_sampler.sample() * data_std_scaled
 
         return draw
diff --git a/pymc3/distributions/tree.py b/pymc3/distributions/tree.py
index 8cc6d188227..2ec0cebc738 100644
--- a/pymc3/distributions/tree.py
+++ b/pymc3/distributions/tree.py
@@ -12,7 +12,7 @@
 
 
 class Tree:
-    """ Full binary tree
+    """Full binary tree
     A full binary tree is a tree where each node has exactly zero or two children.
This structure is used as the basic component of the Bayesian Additive Regression Tree (BART) Attributes diff --git a/pymc3/model.py b/pymc3/model.py index 0b9f2e810b5..5d881c172b6 100644 --- a/pymc3/model.py +++ b/pymc3/model.py @@ -1144,7 +1144,11 @@ def Var(self, name, dist, data=None, total_size=None, dims=None): elif isinstance(data, dict): with self: var = MultiObservedRV( - name=name, data=data, distribution=dist, total_size=total_size, model=self, + name=name, + data=data, + distribution=dist, + total_size=total_size, + model=self, ) self.observed_RVs.append(var) if var.missing_values: @@ -1155,7 +1159,11 @@ def Var(self, name, dist, data=None, total_size=None, dims=None): else: with self: var = ObservedRV( - name=name, data=data, distribution=dist, total_size=total_size, model=self, + name=name, + data=data, + distribution=dist, + total_size=total_size, + model=self, ) self.observed_RVs.append(var) if var.missing_values: @@ -1654,7 +1662,10 @@ def __init__( self.scaling = _get_scaling(total_size, self.shape, self.ndim) incorporate_methods( - source=distribution, destination=self, methods=["random"], wrapper=InstanceMethod, + source=distribution, + destination=self, + methods=["random"], + wrapper=InstanceMethod, ) @property @@ -1710,7 +1721,10 @@ def as_tensor(data, name, model, distribution): testval = np.broadcast_to(distribution.default(), data.shape)[data.mask] fakedist = NoDistribution.dist( - shape=data.mask.sum(), dtype=dtype, testval=testval, parent_dist=distribution, + shape=data.mask.sum(), + dtype=dtype, + testval=testval, + parent_dist=distribution, ) missing_values = FreeRV(name=name + "_missing", distribution=fakedist, model=model) constant = tt.as_tensor_variable(data.filled()) @@ -1963,7 +1977,10 @@ def __init__( self.tag.test_value = normalRV.tag.test_value self.scaling = _get_scaling(total_size, self.shape, self.ndim) incorporate_methods( - source=distribution, destination=self, methods=["random"], wrapper=InstanceMethod, + source=distribution, + destination=self, + methods=["random"], + wrapper=InstanceMethod, ) @property diff --git a/pymc3/sampling.py b/pymc3/sampling.py index 65126c28911..c819abaeb05 100644 --- a/pymc3/sampling.py +++ b/pymc3/sampling.py @@ -659,7 +659,14 @@ def _check_start_shape(model, start): def _sample_many( - draws, chain: int, chains: int, start: list, random_seed: list, step, callback=None, **kwargs, + draws, + chain: int, + chains: int, + start: list, + random_seed: list, + step, + callback=None, + **kwargs, ): """Samples all chains sequentially. @@ -996,7 +1003,8 @@ def _iter_sample( if callback is not None: warns = getattr(step, "warnings", None) callback( - trace=strace, draw=Draw(chain, i == draws, i, i < tune, stats, point, warns), + trace=strace, + draw=Draw(chain, i == draws, i, i < tune, stats, point, warns), ) yield strace, diverging @@ -1949,7 +1957,13 @@ def sample_prior_predictive( def init_nuts( - init="auto", chains=1, n_init=500000, model=None, random_seed=None, progressbar=True, **kwargs, + init="auto", + chains=1, + n_init=500000, + model=None, + random_seed=None, + progressbar=True, + **kwargs, ): """Set up the mass matrix initialization for NUTS. 
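# Note on the dispatch that the nuts.py change below relies on (a sketch,
# simplified from pymc3.sampling.assign_step_methods; the exact internals may
# differ): for each free variable, every class in STEP_METHODS reports a
# Competence level and the best one wins. After this series:
#     NUTS.competence(var, has_grad)   -> INCOMPATIBLE if isinstance(var.distribution, BART)
#     PGBART.competence(var, has_grad) -> IDEAL        if isinstance(var.distribution, BART)
# so pm.sample() assigns PGBART to the BART variable and NUTS to the rest.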
diff --git a/pymc3/step_methods/hmc/nuts.py b/pymc3/step_methods/hmc/nuts.py
index 5810470f300..7ffbf106d41 100644
--- a/pymc3/step_methods/hmc/nuts.py
+++ b/pymc3/step_methods/hmc/nuts.py
@@ -25,7 +25,7 @@
 from pymc3.vartypes import continuous_types
 from ...distributions import BART
 
-__all__ = ['NUTS']
+__all__ = ["NUTS"]
 
 
 class NUTS(BaseHMC):
diff --git a/pymc3/step_methods/pgbart.py b/pymc3/step_methods/pgbart.py
index 2f60325c5f5..c7088fcd64c 100644
--- a/pymc3/step_methods/pgbart.py
+++ b/pymc3/step_methods/pgbart.py
@@ -26,6 +26,7 @@ class PGBART(ArrayStepShared):
     ...     trace = pm.sample()
     """
+
     name = "bartsampler"
     default_blocked = False
 
From f847f6605c99d55937a7389f85b6b325f46b60a4 Mon Sep 17 00:00:00 2001
From: aloctavodia
Date: Fri, 9 Oct 2020 16:07:37 -0300
Subject: [PATCH 06/24] fix error residuals

---
 pymc3/distributions/bart.py  | 24 +++++++++++++-----------
 pymc3/distributions/tree.py  |  2 +-
 pymc3/step_methods/pgbart.py | 29 ++++++++++++++++++++++++++---
 3 files changed, 40 insertions(+), 15 deletions(-)

diff --git a/pymc3/distributions/bart.py b/pymc3/distributions/bart.py
index 23f1a60997a..648cf401300 100644
--- a/pymc3/distributions/bart.py
+++ b/pymc3/distributions/bart.py
@@ -1,7 +1,7 @@
 import numpy as np
 from .distribution import NoDistribution
 from .tree import Tree, SplitNode, LeafNode
-from pymc3.util import get_variable_name
+from pymc3.util import get_var_name
 
 # from ..step_methods.bart.exceptions import (
 #     BARTParamsError,
 # )
@@ -89,7 +89,7 @@ def init_list_of_trees(self):
         # Taken from Section 3.1 of Kapelner, A and Bleich, J. bartMachine: A Powerful Tool for Machine Learning in R. ArXiv e-prints, 2013
         # The sum_trees_output will contain the sum of the predicted output for all trees.
         # When R_j is needed we subtract the current predicted output for tree T_j.
-        self.sum_trees_output = np.ones_like(self.Y) * self.Y.mean()
+        self.sum_trees_output = np.full_like(self.Y, self.Y.mean())
 
         return list_of_trees
 
@@ -190,10 +190,14 @@ def get_new_idx_data_points(self, current_split_node, idx_data_points):
 
         return left_node_idx_data_points, right_node_idx_data_points
 
-    def get_residuals(self, tree):
+    def get_residuals_loo(self, tree):
         R_j = self.Y - (self.sum_trees_output - tree.predict_output(self.num_observations))
         return R_j
 
+    def get_residuals(self):
+        R_j = self.Y - self.sum_trees_output
+        return R_j
+
     def draw_leaf_value(self, tree, idx_data_points):
         raise NotImplementedError
 
@@ -250,14 +254,12 @@ def _repr_latex_(self, name=None, dist=None):
         )
 
     def draw_leaf_value(self, tree, idx_data_points):
-        R_j = self.get_residuals(tree)
-        node_responses = R_j[idx_data_points]
-
-        data_mean = (
-            node_responses.mean()
-        )  # for skewed distribution use median or sample from data-points?
-        data_std_scaled = node_responses.std() / self.m
-
-        draw = data_mean + self._normal_dist_sampler.sample() * data_std_scaled
+        R_j = self.get_residuals()[idx_data_points]
+
+        # for skewed distribution use median or sample from data-points?
+        # data_mean = R_j.mean()
+        # data_std_scaled = R_j.std() / self.m
+        # draw = data_mean + self._normal_dist_sampler.sample() * data_std_scaled
+        draw = R_j.mean()
 
         return draw
diff --git a/pymc3/distributions/tree.py b/pymc3/distributions/tree.py
index 2ec0cebc738..b7bfaa8b186 100644
--- a/pymc3/distributions/tree.py
+++ b/pymc3/distributions/tree.py
@@ -487,7 +487,7 @@ def __eq__(self, other):
 
     def prior_log_probability_node(self, alpha, beta):
         """
-        Calculate the log probability of the node being a LeafNode.
+ Calculate the log probability of the node being a LeafNode (1 - p(being SplitNode)). Taken from equation 7 in [Chipman2010]. Parameters diff --git a/pymc3/step_methods/pgbart.py b/pymc3/step_methods/pgbart.py index c7088fcd64c..879f217d476 100644 --- a/pymc3/step_methods/pgbart.py +++ b/pymc3/step_methods/pgbart.py @@ -5,7 +5,8 @@ from ..distributions.tree import Tree from ..model import modelcontext from ..theanof import inputvars, make_shared_replacements, floatX -from ..smc.smc import logp_forw + +# from ..smc.smc import logp_forw class PGBART(ArrayStepShared): @@ -30,7 +31,7 @@ class PGBART(ArrayStepShared): name = "bartsampler" default_blocked = False - def __init__(self, vars=None, num_particles=10, max_stages=5000, model=None): + def __init__(self, vars=None, num_particles=10, max_stages=1, model=None): model = modelcontext(model) vars = inputvars(vars) self.bart = vars[0].distribution @@ -54,7 +55,7 @@ def astep(self, q_0): output = np.zeros((bart.m, num_observations)) log_num_particles = np.log(self.num_particles) for idx, tree in enumerate(bart.trees): - R_j = bart.get_residuals(tree) + R_j = bart.get_residuals_loo(tree) bart.Y_shared.set_value(R_j) list_of_particles = self.init_particles(tree.tree_id, R_j, bart.num_observations) # Step 5 of algorithm @@ -221,3 +222,25 @@ def init_weight(self): def update_weight(self): # TODO pass + + +from theano import function as theano_function +from ..theanof import join_nonshared_inputs + + +def logp_forw(out_vars, vars, shared): + """Compile Theano function of the model and the input and output variables. + + Parameters + ---------- + out_vars: List + containing :class:`pymc3.Distribution` for the output variables + vars: List + containing :class:`pymc3.Distribution` for the input variables + shared: List + containing :class:`theano.tensor.Tensor` for depended shared data + """ + out_list, inarray0 = join_nonshared_inputs(out_vars, vars, shared) + f = theano_function([inarray0], out_list[0]) + f.trust_input = True + return f From 6700a74ca4a6567a58265a4e1b9d9a3f56487c8e Mon Sep 17 00:00:00 2001 From: aloctavodia Date: Fri, 16 Oct 2020 16:02:42 -0300 Subject: [PATCH 07/24] use a low number of max_stages for the first iteration, remove not necessary errors --- pymc3/distributions/bart.py | 5 ---- pymc3/distributions/tree.py | 57 ------------------------------------ pymc3/step_methods/pgbart.py | 19 ++++++++++-- 3 files changed, 16 insertions(+), 65 deletions(-) diff --git a/pymc3/distributions/bart.py b/pymc3/distributions/bart.py index 648cf401300..48c6d25464f 100644 --- a/pymc3/distributions/bart.py +++ b/pymc3/distributions/bart.py @@ -1,11 +1,6 @@ import numpy as np from .distribution import NoDistribution from .tree import Tree, SplitNode, LeafNode -from pymc3.util import get_var_name - -# from ..step_methods.bart.exceptions import ( -# BARTParamsError, -# ) # __all__ = ["BART"] diff --git a/pymc3/distributions/tree.py b/pymc3/distributions/tree.py index b7bfaa8b186..fd534b06cc8 100644 --- a/pymc3/distributions/tree.py +++ b/pymc3/distributions/tree.py @@ -3,14 +3,6 @@ from copy import deepcopy -class TreeStructureError(Exception): - """Base (catch-all) tree structure exception.""" - - -class TreeNodeError(Exception): - """Base (catch-all) tree node exception.""" - - class Tree: """Full binary tree A full binary tree is a tree where each node has exactly zero or two children. 
From 6700a74ca4a6567a58265a4e1b9d9a3f56487c8e Mon Sep 17 00:00:00 2001
From: aloctavodia
Date: Fri, 16 Oct 2020 16:02:42 -0300
Subject: [PATCH 07/24] use a low number of max_stages for the first iteration,
 remove unnecessary errors

---
 pymc3/distributions/bart.py  |  5 ----
 pymc3/distributions/tree.py  | 57 ------------------------------------
 pymc3/step_methods/pgbart.py | 19 ++++++++++--
 3 files changed, 16 insertions(+), 65 deletions(-)

diff --git a/pymc3/distributions/bart.py b/pymc3/distributions/bart.py
index 648cf401300..48c6d25464f 100644
--- a/pymc3/distributions/bart.py
+++ b/pymc3/distributions/bart.py
@@ -1,11 +1,6 @@
 import numpy as np
 from .distribution import NoDistribution
 from .tree import Tree, SplitNode, LeafNode
-from pymc3.util import get_var_name
-
-# from ..step_methods.bart.exceptions import (
-#     BARTParamsError,
-# )

 # __all__ = ["BART"]

diff --git a/pymc3/distributions/tree.py b/pymc3/distributions/tree.py
index b7bfaa8b186..fd534b06cc8 100644
--- a/pymc3/distributions/tree.py
+++ b/pymc3/distributions/tree.py
@@ -3,14 +3,6 @@
 from copy import deepcopy


-class TreeStructureError(Exception):
-    """Base (catch-all) tree structure exception."""
-
-
-class TreeNodeError(Exception):
-    """Base (catch-all) tree node exception."""
-
-
 class Tree:
     """Full binary tree
     A full binary tree is a tree where each node has exactly zero or two children.
@@ -158,43 +150,16 @@ def copy(self):
         return deepcopy(self)

     def get_node(self, index):
-        if not isinstance(index, int) or index < 0:
-            raise TreeStructureError("Node index must be a non-negative int")
-        if index not in self.tree_structure:
-            raise TreeStructureError("Node missing at index {}".format(index))
         return self.tree_structure[index]

     def set_node(self, index, node):
-        if not isinstance(index, int) or index < 0:
-            raise TreeStructureError("Node index must be a non-negative int")
-        if not isinstance(node, SplitNode) and not isinstance(node, LeafNode):
-            raise TreeStructureError("Node class must be SplitNode or LeafNode")
-        if index in self.tree_structure:
-            raise TreeStructureError("Node index already exist in tree")
-        if self.num_nodes == 0 and index != 0:
-            raise TreeStructureError("Root node must have index zero")
-        parent_index = node.get_idx_parent_node()
-        if self.num_nodes != 0 and parent_index not in self.tree_structure:
-            raise TreeStructureError("Invalid index, node must have a parent node")
-        if self.num_nodes != 0 and not isinstance(self.get_node(parent_index), SplitNode):
-            raise TreeStructureError("Parent node must be of class SplitNode")
-        if index != node.index:
-            raise TreeStructureError("Node must have same index as tree index")
         self.tree_structure[index] = node
         self.num_nodes += 1
         if isinstance(node, LeafNode):
             self.idx_leaf_nodes.append(index)

     def delete_node(self, index):
-        if not isinstance(index, int) or index < 0:
-            raise TreeStructureError("Node index must be a non-negative int")
-        if index not in self.tree_structure:
-            raise TreeStructureError("Node missing at index {}".format(index))
         current_node = self.get_node(index)
-        left_child_idx = current_node.get_idx_left_child()
-        right_child_idx = current_node.get_idx_right_child()
-        if left_child_idx in self.tree_structure or right_child_idx in self.tree_structure:
-            raise TreeStructureError("Invalid removal of node, leaving two orphans nodes")
         if isinstance(current_node, LeafNode):
             self.idx_leaf_nodes.remove(index)
         del self.tree_structure[index]
@@ -314,12 +279,6 @@ def grow_tree(self, index_leaf_node, new_split_node, new_left_node, new_right_no
         new_right_node : LeafNode
         """
         current_node = self.get_node(index_leaf_node)
-        if not isinstance(current_node, LeafNode):
-            raise TreeStructureError("The tree grows from the leaves")
-        if not isinstance(new_split_node, SplitNode):
-            raise TreeStructureError("The node that replaces the leaf node must be SplitNode")
-        if not isinstance(new_left_node, LeafNode) or not isinstance(new_right_node, LeafNode):
-            raise TreeStructureError("The new leaves must be LeafNode")

         self.delete_node(index_leaf_node)
         self.set_node(index_leaf_node, new_split_node)
@@ -369,8 +328,6 @@ def get_tree_depth(self):

 class BaseNode:
     def __init__(self, index):
-        if not isinstance(index, int) or index < 0:
-            raise TreeNodeError("Node index must be a non-negative int")
         self.index = index
         self.depth = int(math.floor(math.log(index + 1, 2)))

@@ -397,11 +354,6 @@ class SplitNode(BaseNode):
     def __init__(self, index, idx_split_variable, split_value):
         super().__init__(index)

-        if not isinstance(idx_split_variable, int) or idx_split_variable < 0:
-            raise TreeNodeError("Index of split variable must be a non-negative int")
-        if not isinstance(split_value, float):
-            raise TreeNodeError("Node split value type must be float")
-
         self.idx_split_variable = idx_split_variable
         self.split_value = split_value

@@ -455,15 +407,6 @@ class LeafNode(BaseNode):
     def __init__(self, index, value, idx_data_points):
         super().__init__(index)
-        if not isinstance(value, float):
-            raise TreeNodeError("Leaf node value type must be float")
-        if (
-            not isinstance(idx_data_points, np.ndarray)
-            or idx_data_points.dtype.type is not np.int32
-        ):
-            raise TreeNodeError("Index of data points must be a numpy.ndarray of np.int32")
-        if len(idx_data_points) == 0:
-            raise TreeNodeError("Index of data points can not be empty")
         self.value = value
         self.idx_data_points = idx_data_points
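With the validation stripped out, the node bookkeeping rests entirely on the usual
full-binary-tree (heap) index arithmetic that these classes use. A small self-contained
check of those formulas (helper names here are illustrative):

    import math

    def idx_parent(index):  # as in BaseNode.get_idx_parent_node: (index - 1) // 2
        return (index - 1) // 2

    def idx_left(index):  # as in BaseNode.get_idx_left_child: index * 2 + 1
        return index * 2 + 1

    def node_depth(index):  # as in BaseNode.depth
        return int(math.floor(math.log(index + 1, 2)))

    # Root is 0; children of node i are 2i + 1 and 2i + 2.
    assert idx_parent(idx_left(0)) == 0
    assert node_depth(0) == 0 and node_depth(6) == 2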
diff --git a/pymc3/step_methods/pgbart.py b/pymc3/step_methods/pgbart.py
index 879f217d476..43784b28b8b 100644
--- a/pymc3/step_methods/pgbart.py
+++ b/pymc3/step_methods/pgbart.py
@@ -4,7 +4,7 @@
 from ..distributions import BART
 from ..distributions.tree import Tree
 from ..model import modelcontext
-from ..theanof import inputvars, make_shared_replacements, floatX
+from ..theanof import inputvars, make_shared_replacements

 # from ..smc.smc import logp_forw

@@ -31,13 +31,14 @@ class PGBART(ArrayStepShared):
     name = "bartsampler"
     default_blocked = False

-    def __init__(self, vars=None, num_particles=10, max_stages=1, model=None):
+    def __init__(self, vars=None, num_particles=10, max_stages=5000, model=None):
         model = modelcontext(model)
         vars = inputvars(vars)
         self.bart = vars[0].distribution

         self.num_particles = num_particles
         self.max_stages = max_stages
+        self.first_iteration = True
         self.previous_trees_particles_list = []
         for i in range(self.bart.m):
             p = Particle(self.bart.trees[i])
@@ -48,8 +49,18 @@ def __init__(self, vars=None, num_particles=10, max_stages=1, model=None):
         super().__init__(vars, shared)

     def astep(self, q_0):
+        # For the first iteration we restrict max_stages to a low number, otherwise it is almost sure
+        # we will reach max_stages given that our first set of m trees is not good at all.
+        # maybe we can set max_stages by using some function of the number of variables/dimensions.
+        if self.first_iteration:
+            max_stages = 5
+            self.first_iteration = False
+        else:
+            max_stages = self.max_stages
+
         # Step 4 of algorithm
         bart = self.bart
+
         num_observations = bart.num_observations
         likelihood_logp = self.likelihood_logp
         output = np.zeros((bart.m, num_observations))
@@ -57,6 +68,8 @@ def astep(self, q_0):
         for idx, tree in enumerate(bart.trees):
             R_j = bart.get_residuals_loo(tree)
             bart.Y_shared.set_value(R_j)
+            # generate an initial set of SMC particles
+            # At the end of the algorithm we return one of these particles as a new tree
             list_of_particles = self.init_particles(tree.tree_id, R_j, bart.num_observations)
             # Step 5 of algorithm
             old_likelihoods = np.array(
                 [
                     likelihood_logp(p.tree.predict_output(num_observations))
                     for p in list_of_particles
                 ]
             )
             log_weights = np.array(old_likelihoods)

             list_of_particles[0] = self.get_previous_tree_particle(tree.tree_id, 0)
             log_weights[0] = likelihood_logp(
                 list_of_particles[0].tree.predict_output(num_observations)
             )
             old_likelihoods[0] = log_weights[0]
             log_weights -= log_num_particles

             for p_idx, p in enumerate(list_of_particles):
                 p.log_weight = log_weights[p_idx]

-            for t in range(1, self.max_stages + 2):
+            for t in range(1, max_stages):
                 # Step 7 of algorithm
                 list_of_particles[0] = self.get_previous_tree_particle(tree.tree_id, t)
                 # This should be embarrassingly parallelizable
From ac96b1a38004884090586c6e4aacc0cff11ed821 Mon Sep 17 00:00:00 2001
From: aloctavodia
Date: Fri, 16 Oct 2020 20:21:33 -0300
Subject: [PATCH 08/24] use Rockova prior, refactor prior leaf prob computation

---
 pymc3/distributions/bart.py  | 25 ++++++--------------
 pymc3/distributions/tree.py  | 44 ------------------------------------
 pymc3/step_methods/pgbart.py | 43 +++++++++++++++++++++++++++++++----
 3 files changed, 45 insertions(+), 67 deletions(-)

diff --git a/pymc3/distributions/bart.py b/pymc3/distributions/bart.py
index 48c6d25464f..80aa5200963 100644
--- a/pymc3/distributions/bart.py
+++ b/pymc3/distributions/bart.py
@@ -10,7 +10,7 @@ class BARTParamsError(Exception):


 class BaseBART(NoDistribution):
-    def __init__(self, X, Y, m=200, alpha=0.95, beta=2.0, cache_size=5000, *args, **kwargs):
+    def __init__(self, X, Y, m=200, alpha=0.25, cache_size=5000, *args, **kwargs):

         self.Y_shared = Y
         self.X = X
@@ -45,26 +45,16 @@ def __init__(self, X, Y, m=200, alpha=0.95, beta=2.0, cache_size=5000, *args, **
             raise BARTParamsError(
                 "The type for the alpha parameter for the tree structure must be float"
             )
-        if alpha <= 0 or 1 <= alpha:
+        if alpha <= 0 or 0.5 <= alpha:
             raise BARTParamsError(
                 "The value for the alpha parameter for the tree structure "
-                "must be in the interval (0, 1)"
+                "must be in the interval (0, 0.5)"
             )
-        if not isinstance(beta, float):
-            raise BARTParamsError(
-                "The type for the beta parameter for the tree structure must be float"
-            )
-        if beta < 0:
-            raise BARTParamsError(
-                "The value for the beta parameter for the tree structure "
-                'must be in the interval [0, float("inf"))'
-            )

         self.num_observations = X.shape[0]
         self.number_variates = X.shape[1]
         self.m = m
         self.alpha = alpha
-        self.beta = beta
         self._normal_dist_sampler = NormalDistributionSampler(cache_size)
         self._disc_uniform_dist_sampler = DiscreteUniformDistributionSampler(cache_size)
         self.trees = self.init_list_of_trees()
@@ -231,8 +221,8 @@ def refresh_cache(self):


 class BART(BaseBART):
-    def __init__(self, X, Y, m=200, alpha=0.95, beta=2.0):
-        super().__init__(X, Y, m, alpha, beta)
+    def __init__(self, X, Y, m=200, alpha=0.25):
+        super().__init__(X, Y, m, alpha)

     def _repr_latex_(self, name=None, dist=None):
         if dist is None:
             dist = self
         X = (type(self.X),)
         Y = (type(self.Y),)
         m = (self.m,)
         alpha = self.alpha
-        beta = self.beta
         m = self.m
         name = r"\text{%s}" % name
-        return r"""${} \sim \text{{BART}}(\mathit{{alpha}}={},~\mathit{{beta}}={},~\mathit{{m}}={})$""".format(
-            name, alpha, beta, m
+        return r"""${} \sim \text{{BART}}(\mathit{{alpha}}={},~\mathit{{m}}={})$""".format(
+            name, alpha, m
         )

     def draw_leaf_value(self, tree, idx_data_points):
diff --git a/pymc3/distributions/tree.py b/pymc3/distributions/tree.py
index fd534b06cc8..39fffcc7906 100644
--- a/pymc3/distributions/tree.py
+++ b/pymc3/distributions/tree.py
@@ -381,28 +381,6 @@ def evaluate_splitting_rule(self, x):
         else:
             return x[self.idx_split_variable] <= self.split_value

-    def prior_log_probability_node(self, alpha, beta):
-        """
-        Calculate the log probability of the node being a SplitNode.
-        Taken from equation 7 in [Chipman2010].
-
-        Parameters
-        ----------
-        alpha : float
-        beta : float
-
-        Returns
-        -------
-        float
-
-        References
-        ----------
-        .. [Chipman2010] Chipman, H. A., George, E. I., & McCulloch, R. E. (2010). BART: Bayesian
-            additive regression trees. The Annals of Applied Statistics, 4(1), 266-298.,
-            `link `__
-        """
-        return np.log(alpha * np.power(1.0 + self.depth, -beta))
-

 class LeafNode(BaseNode):
     def __init__(self, index, value, idx_data_points):
@@ -427,25 +405,3 @@ def __eq__(self, other):
             )
         else:
             return NotImplemented
-
-    def prior_log_probability_node(self, alpha, beta):
-        """
-        Calculate the log probability of the node being a LeafNode (1 - p(being SplitNode)).
-        Taken from equation 7 in [Chipman2010].
-
-        Parameters
-        ----------
-        alpha : float
-        beta : float
-
-        Returns
-        -------
-        float
-
-        References
-        ----------
-        .. [Chipman2010] Chipman, H. A., George, E. I., & McCulloch, R. E. (2010). BART: Bayesian
-            additive regression trees. The Annals of Applied Statistics, 4(1), 266-298.,
-            `link `__
-        """
-        return np.log(1.0 - alpha * np.power(1.0 + self.depth, -beta))
diff --git a/pymc3/step_methods/pgbart.py b/pymc3/step_methods/pgbart.py
index 43784b28b8b..fee09b6649c 100644
--- a/pymc3/step_methods/pgbart.py
+++ b/pymc3/step_methods/pgbart.py
@@ -35,13 +35,14 @@ def __init__(self, vars=None, num_particles=10, max_stages=5000, model=None):
         model = modelcontext(model)
         vars = inputvars(vars)
         self.bart = vars[0].distribution
+        self.prior_prob_leaf_node = _compute_prior_probability(self.bart.alpha)

         self.num_particles = num_particles
         self.max_stages = max_stages
         self.first_iteration = True
         self.previous_trees_particles_list = []
         for i in range(self.bart.m):
-            p = Particle(self.bart.trees[i])
+            p = Particle(self.bart.trees[i], self.prior_prob_leaf_node)
             self.previous_trees_particles_list.append(p)
@@ -185,7 +186,7 @@ def init_particles(self, tree_id, R_j, num_observations):
             idx_data_points=initial_idx_data_points_leaf_nodes,
         )
         for _ in range(self.num_particles):
-            new_particle = Particle(new_tree)
+            new_particle = Particle(new_tree, self.prior_prob_leaf_node)
             list_of_particles.append(new_particle)
         return list_of_particles

@@ -197,20 +198,24 @@ def resample(self, list_of_particles, normalized_weights):

 class Particle:
-    def __init__(self, tree):
+    def __init__(self, tree, prior_prob_leaf_node):
         self.tree = tree.copy()  # Mantiene el arbol que nos interesa en este momento
         self.expansion_nodes = self.tree.idx_leaf_nodes.copy()  # This should be the array [0]
         self.tree_history = [self.tree.copy()]
         self.expansion_nodes_history = [self.expansion_nodes.copy()]
         self.log_weight = 0.0
+        self.prior_prob_leaf_node = prior_prob_leaf_node

     def sample_tree_sequential(self, bart):
         if self.expansion_nodes:
             index_leaf_node = self.expansion_nodes.pop(0)
             # Probability that this node will remain a leaf node
-            log_prob = self.tree[index_leaf_node].prior_log_probability_node(bart.alpha, bart.beta)
+            try:
+                prob_leaf = self.prior_prob_leaf_node[self.tree[index_leaf_node].depth]
+            except IndexError:
+                prob_leaf = 1

-            if np.exp(log_prob) < np.random.random():
+            if prob_leaf < np.random.random():
                 self.grow_successful = bart.grow_tree(self.tree, index_leaf_node)
                 # TODO: in case the grow_tree fails, should we try to sample the tree from another leaf node?
                 if self.grow_successful:
@@ -237,6 +242,34 @@ def update_weight(self):
         pass


+def _compute_prior_probability(alpha):
+    """
+    Calculate the probability of the node being a LeafNode (1 - p(being SplitNode)).
+    Taken from equation 19 in [On Theory for BART]. XXX FIX reference below!
+
+    Parameters
+    ----------
+    alpha : float
+
+    Returns
+    -------
+    float
+
+    References
+    ----------
+    .. [Chipman2010] Chipman, H. A., George, E. I., & McCulloch, R. E. (2010). BART: Bayesian
+        additive regression trees. The Annals of Applied Statistics, 4(1), 266-298.,
+        `link `__
+    """
+    prior_leaf_prob = [0]
+    depth = 1
+    prob = 1
+    while prob < 1:
+        prob = 1 - alpha ** depth
+        depth += 1
+    return prior_leaf_prob
+
+
 from theano import function as theano_function
 from ..theanof import join_nonshared_inputs
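For intuition about the Rockova-style prior introduced here: the table that
`_compute_prior_probability` is meant to build assigns a node at depth d the leaf
probability 1 - alpha**d. The version in the diff never fills the list because of its
`prob = 1` initialization; a self-contained sketch of the intended computation (matching
the form the next patch settles on), with the new default alpha:

    alpha = 0.25

    prior_leaf_prob = [0]
    depth = 1
    while prior_leaf_prob[-1] < 1:
        prior_leaf_prob.append(1 - alpha ** depth)
        depth += 1

    # P(leaf) by depth: [0, 0.75, 0.9375, 0.984375, ...] -- deep trees are rare.
    print(prior_leaf_prob[:4])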
From 7d54bfa71985494b514c7ebcc9dd3149d45dd33b Mon Sep 17 00:00:00 2001
From: aloctavodia
Date: Mon, 19 Oct 2020 16:25:07 -0300
Subject: [PATCH 09/24] clean code, add docstring

---
 pymc3/distributions/bart.py  | 42 ++++++++++++++++++++----------------
 pymc3/step_methods/pgbart.py | 14 ++++++------
 2 files changed, 30 insertions(+), 26 deletions(-)

diff --git a/pymc3/distributions/bart.py b/pymc3/distributions/bart.py
index 80aa5200963..16c9a84301c 100644
--- a/pymc3/distributions/bart.py
+++ b/pymc3/distributions/bart.py
@@ -10,11 +10,12 @@ class BARTParamsError(Exception):


 class BaseBART(NoDistribution):
-    def __init__(self, X, Y, m=200, alpha=0.25, cache_size=5000, *args, **kwargs):
+    def __init__(self, X, Y, m=200, alpha=0.25, *args, **kwargs):

         self.Y_shared = Y
         self.X = X
         self.Y = Y.eval()
+        self.cache_size = 1000
         super().__init__(
             shape=X.shape[0], dtype="float64", testval=0, *args, **kwargs
         )  # FIXME dtype and testvalue are nonsensical
@@ -45,18 +46,17 @@ def __init__(self, X, Y, m=200, alpha=0.25, *args, **kwargs):
             raise BARTParamsError(
                 "The type for the alpha parameter for the tree structure must be float"
             )
-        if alpha <= 0 or 0.5 <= alpha:
+        if alpha <= 0 or 1 <= alpha:
             raise BARTParamsError(
                 "The value for the alpha parameter for the tree structure "
-                "must be in the interval (0, 0.5)"
+                "must be in the interval (0, 1)"
             )

         self.num_observations = X.shape[0]
         self.number_variates = X.shape[1]
         self.m = m
         self.alpha = alpha
-        self._normal_dist_sampler = NormalDistributionSampler(cache_size)
-        self._disc_uniform_dist_sampler = DiscreteUniformDistributionSampler(cache_size)
+        self._disc_uniform_dist_sampler = DiscreteUniformDistributionSampler(self.cache_size)
         self.trees = self.init_list_of_trees()

     def init_list_of_trees(self):
@@ -206,21 +206,25 @@ def refresh_cache(self):
         self._cache = list(np.random.random(size=self._cache_size))


-class NormalDistributionSampler:
-    def __init__(self, cache_size=1000):
-        self._cache_size = cache_size
-        self._cache = []
-
-    def sample(self):
-        if len(self._cache) == 0:
-            self.refresh_cache()
-        return self._cache.pop()
-
-    def refresh_cache(self):
-        self._cache = list(np.random.normal(size=self._cache_size))
-
-
 class BART(BaseBART):
+    """
+    BART distribution.
+
+    Distribution representing a sum over trees
+
+    Parameters
+    ----------
+    X :
+        The design matrix.
+    Y :
+        The response vector.
+    m : int
+        Number of trees
+    alpha : float
+        Control the prior probability over the depth of the trees. Must be in the interval (0, 1),
+        although it is recommended to be between in the interval (0, 0.5].
+    """
+
     def __init__(self, X, Y, m=200, alpha=0.25):
         super().__init__(X, Y, m, alpha)

diff --git a/pymc3/step_methods/pgbart.py b/pymc3/step_methods/pgbart.py
index fee09b6649c..e5d21a3dc39 100644
--- a/pymc3/step_methods/pgbart.py
+++ b/pymc3/step_methods/pgbart.py
@@ -210,10 +210,11 @@ def sample_tree_sequential(self, bart):
         if self.expansion_nodes:
             index_leaf_node = self.expansion_nodes.pop(0)
             # Probability that this node will remain a leaf node
-            # try:
-            #     print(self.tree[index_leaf_node].depth)
             prob_leaf = self.prior_prob_leaf_node[self.tree[index_leaf_node].depth]
-            # except IndexError:
-            #     prob_leaf = 1

             if prob_leaf < np.random.random():
@@ -263,9 +264,8 @@ def _compute_prior_probability(alpha):
     """
     prior_leaf_prob = [0]
     depth = 1
-    prob = 1
-    while prob < 1:
-        prob = 1 - alpha ** depth
+    while prior_leaf_prob[-1] < 1:
+        prior_leaf_prob.append(1 - alpha ** depth)
         depth += 1
     return prior_leaf_prob

From b566d50dc5c702a81fb4d87bab51b3f1f4e1c143 Mon Sep 17 00:00:00 2001
From: aloctavodia
Date: Tue, 20 Oct 2020 18:44:34 -0300
Subject: [PATCH 10/24] reduce code

---
 pymc3/distributions/bart.py  |  77 +++--------
 pymc3/distributions/tree.py  | 242 +----------------------------------
 pymc3/step_methods/pgbart.py |  28 ++--
 3 files changed, 30 insertions(+), 317 deletions(-)

diff --git a/pymc3/distributions/bart.py b/pymc3/distributions/bart.py
index 16c9a84301c..40189afa51e 100644
--- a/pymc3/distributions/bart.py
+++ b/pymc3/distributions/bart.py
@@ -2,7 +2,7 @@
 from .distribution import NoDistribution
 from .tree import Tree, SplitNode, LeafNode

-# __all__ = ["BART"]
+__all__ = ["BART"]


 class BARTParamsError(Exception):
@@ -11,10 +11,8 @@ class BARTParamsError(Exception):

 class BaseBART(NoDistribution):
     def __init__(self, X, Y, m=200, alpha=0.25, *args, **kwargs):
-
-        self.Y_shared = Y
         self.X = X
-        self.Y = Y.eval()
+        self.Y = Y
         super().__init__(
             shape=X.shape[0], dtype="float64", testval=0, *args, **kwargs
         )  # FIXME dtype and testvalue are nonsensical
@@ -27,11 +26,11 @@ def __init__(self, X, Y, m=200, alpha=0.25, *args, **kwargs):
             )
         if self.X.ndim != 2:
             raise BARTParamsError("The design matrix X must have two dimensions")
-        if not isinstance(self.Y, np.ndarray) or self.Y.dtype.type is not np.float64:
-            raise BARTParamsError(
-                "The response matrix Y type must be numpy.ndarray where every item"
-                " type is numpy.float64"
-            )
+        # if not isinstance(self.Y, np.ndarray) or self.Y.dtype.type is not np.float64:
+        #     raise BARTParamsError(
+        #         "The response matrix Y type must be numpy.ndarray where every item"
+        #         " type is numpy.float64"
+        #     )
         if self.Y.ndim != 1:
             raise BARTParamsError("The response matrix Y must have one dimension")
         if self.X.shape[0] != self.Y.shape[0]:
@@ -42,10 +41,7 @@ def __init__(self, X, Y, m=200, alpha=0.25, *args, **kwargs):
             raise BARTParamsError("The number of trees m type must be int")
         if m < 1:
             raise BARTParamsError("The number of trees m must be greater than zero")
-        if not isinstance(alpha, float):
-            raise BARTParamsError(
-                "The type for the alpha parameter for the tree structure must be float"
-            )
+
         if alpha <= 0 or 1 <= alpha:
             raise BARTParamsError(
                 "The value for the alpha parameter for the tree structure "
@@ -56,7 +52,6 @@ def __init__(self, X, Y, m=200, alpha=0.25, *args, **kwargs):
         self.number_variates = X.shape[1]
         self.m = m
         self.alpha = alpha
-        self._disc_uniform_dist_sampler = DiscreteUniformDistributionSampler(self.cache_size)
         self.trees = self.init_list_of_trees()

     def init_list_of_trees(self):
@@ -84,18 +79,6 @@ def __iter__(self):
     def __repr_latex(self):
         raise NotImplementedError

-    def prediction_untransformed(self, x):
-        sum_of_trees = 0.0
-        for t in self.trees:
-            sum_of_trees += t.out_of_sample_predict(x=x)
-        return sum_of_trees
-
-    def sample_dist_splitting_variable(self, value):
-        return self._disc_uniform_dist_sampler.sample(0, value)
-
-    def sample_dist_splitting_rule_assignment(self, value):
-        return self._disc_uniform_dist_sampler.sample(0, value)
-
     def get_available_predictors(self, idx_data_points_split_node):
         possible_splitting_variables = []
         for j in range(self.number_variates):
@@ -125,15 +108,13 @@ def grow_tree(self, tree, index_leaf_node):
         if not available_predictors:
             return successful_grow_tree

-        index_selected_predictor = self.sample_dist_splitting_variable(len(available_predictors))
+        index_selected_predictor = discrete_uniform_sampler(len(available_predictors))
         selected_predictor = available_predictors[index_selected_predictor]
         available_splitting_rules, _ = self.get_available_splitting_rules(
             current_node.idx_data_points, selected_predictor
         )
-        index_selected_splitting_rule = self.sample_dist_splitting_rule_assignment(
-            len(available_splitting_rules)
-        )
+        index_selected_splitting_rule = discrete_uniform_sampler(len(available_splitting_rules))
         selected_splitting_rule = available_splitting_rules[index_selected_splitting_rule]

         new_split_node = SplitNode(
@@ -146,8 +127,8 @@ def grow_tree(self, tree, index_leaf_node):
             new_split_node, current_node.idx_data_points
         )

-        left_node_value = self.draw_leaf_value(tree, left_node_idx_data_points)
-        right_node_value = self.draw_leaf_value(tree, right_node_idx_data_points)
+        left_node_value = self.draw_leaf_value(left_node_idx_data_points)
+        right_node_value = self.draw_leaf_value(right_node_idx_data_points)

         new_left_node = LeafNode(
             index=current_node.get_idx_left_child(),
@@ -183,27 +164,16 @@ def get_residuals(self):
         R_j = self.Y - self.sum_trees_output
         return R_j

-    def draw_leaf_value(self, tree, idx_data_points):
-        raise NotImplementedError
-
-
-class DiscreteUniformDistributionSampler:
-    """
-    Draw samples from a discrete uniform distribution.
-    Samples are uniformly distributed over the half-open interval [low, high) (includes low, but excludes high).
-    """
-
-    def __init__(self, cache_size=1000):
-        self._cache_size = cache_size
-        self._cache = []
+    def draw_leaf_value(self, idx_data_points):
+        # draw the residual mean
+        R_j = self.get_residuals()[idx_data_points]
+        draw = R_j.mean()

-    def sample(self, lower_limit, upper_limit):
-        if len(self._cache) == 0:
-            self.refresh_cache()
-        return int(lower_limit + (upper_limit - lower_limit) * self._cache.pop())
+        return draw

-    def refresh_cache(self):
-        self._cache = list(np.random.random(size=self._cache_size))
+
+def discrete_uniform_sampler(upper_value):
+    return int(np.random.random() * upper_value)


 class BART(BaseBART):
diff --git a/pymc3/distributions/tree.py b/pymc3/distributions/tree.py
index 39fffcc7906..5b7d9f2a99f 100644
--- a/pymc3/distributions/tree.py
+++ b/pymc3/distributions/tree.py
@@ -42,110 +42,6 @@ def __getitem__(self, index):
     def __setitem__(self, index, node):
         self.set_node(index, node)

-    def __delitem__(self, index):
-        self.delete_node(index)
-
-    def __iter__(self):
-        return iter(self.tree_structure.values())
-
-    def __eq__(self, other):
-        # The idx_leaf_nodes and idx_prunable_split_nodes are transformed to sets to correctly check for equivalence
-        # in case the values are not ordered in the same way.
-        return (
-            self.tree_structure == other.tree_structure
-            and self.num_nodes == other.num_nodes
-            and set(self.idx_leaf_nodes) == set(other.idx_leaf_nodes)
-            and set(self.idx_prunable_split_nodes) == set(other.idx_prunable_split_nodes)
-            and self.tree_id == other.tree_id
-        )
-
-    def __hash__(self):
-        # Method added to create a set of trees.
-        return 0
-
-    def __len__(self):
-        return len(self.tree_structure)
-
-    def __repr__(self):
-        return "Tree(num_nodes={})".format(self.num_nodes)
-
-    def __str__(self):
-        lines = self._build_tree_string(index=0, show_index=False, delimiter="-")[0]
-        return "\n" + "\n".join((line.rstrip() for line in lines))
-
-    def _build_tree_string(self, index, show_index=False, delimiter="-"):
-        """Recursively walk down the binary tree and build a pretty-print string.
-
-        In each recursive call, a "box" of characters visually representing the
-        current (sub)tree is constructed line by line. Each line is padded with
-        whitespaces to ensure all lines in the box have the same length. Then the
-        box, its width, and start-end positions of its root node value repr string
-        (required for drawing branches) are sent up to the parent call. The parent
-        call then combines its left and right sub-boxes to build a larger box etc.
-        """
-        if index not in self.tree_structure:
-            return [], 0, 0, 0
-
-        line1 = []
-        line2 = []
-        current_node = self.get_node(index)
-        if show_index:
-            node_repr = "{}{}{}".format(index, delimiter, str(current_node))
-        else:
-            node_repr = str(current_node)
-
-        new_root_width = gap_size = len(node_repr)
-
-        left_child = current_node.get_idx_left_child()
-        right_child = current_node.get_idx_right_child()
-
-        # Get the left and right sub-boxes, their widths, and root repr positions
-        l_box, l_box_width, l_root_start, l_root_end = self._build_tree_string(
-            left_child, show_index, delimiter
-        )
-        r_box, r_box_width, r_root_start, r_root_end = self._build_tree_string(
-            right_child, show_index, delimiter
-        )
-
-        # Draw the branch connecting the current root node to the left sub-box
-        # Pad the line with whitespaces where necessary
-        if l_box_width > 0:
-            l_root = (l_root_start + l_root_end) // 2 + 1
-            line1.append(" " * (l_root + 1))
-            line1.append("_" * (l_box_width - l_root))
-            line2.append(" " * l_root + "/")
-            line2.append(" " * (l_box_width - l_root))
-            new_root_start = l_box_width + 1
-            gap_size += 1
-        else:
-            new_root_start = 0
-
-        # Draw the representation of the current root node
-        line1.append(node_repr)
-        line2.append(" " * new_root_width)
-
-        # Draw the branch connecting the current root node to the right sub-box
-        # Pad the line with whitespaces where necessary
-        if r_box_width > 0:
-            r_root = (r_root_start + r_root_end) // 2
-            line1.append("_" * r_root)
-            line1.append(" " * (r_box_width - r_root + 1))
-            line2.append(" " * r_root + "\\")
-            line2.append(" " * (r_box_width - r_root))
-            gap_size += 1
-        new_root_end = new_root_start + new_root_width - 1
-
-        # Combine the left and right sub-boxes with the branches drawn above
-        gap = " " * gap_size
-        new_box = ["".join(line1), "".join(line2)]
-        for i in range(max(len(l_box), len(r_box))):
-            l_line = l_box[i] if i < len(l_box) else " " * l_box_width
-            r_line = r_box[i] if i < len(r_box) else " " * r_box_width
-            new_box.append(l_line + gap + r_line)
-
-        # Return the new box, its width and its root repr positions
-        return new_box, len(new_box[0]), new_root_start, new_root_end
-
     def copy(self):
         return deepcopy(self)

@@ -165,60 +61,6 @@ def delete_node(self, index):
         del self.tree_structure[index]
         self.num_nodes -= 1

-    def make_digraph(self, name="Tree"):
-        """Make graphviz Digraph of the tree.
-
-        Parameters
-        ----------
-        name : str
-            Name used for the Digraph. Useful to differentiate digraph of trees.
-
-        Returns
-        -------
-        graphviz.Digraph
-        """
-        try:
-            import graphviz
-        except ImportError:
-            raise ImportError(
-                "This function requires the python library graphviz, along with binaries. "
-                "The easiest way to install all of this is by running\n\n"
-                "\tconda install -c conda-forge python-graphviz"
-            )
-        graph = graphviz.Digraph(name)
-        graph = self._digraph_tree_traversal(0, graph)
-        return graph
-
-    def _digraph_tree_traversal(self, index, graph):
-        if index not in self.tree_structure.keys():
-            return graph
-        current_node = self.get_node(index)
-        style = ""
-        if isinstance(current_node, SplitNode):
-            shape = "box"
-            if index in self.idx_prunable_split_nodes:
-                style = "filled"
-        else:
-            shape = "ellipse"
-        node_name = "{}_{}".format(graph.name, index)
-        graph.node(name=node_name, label=str(current_node), shape=shape, style=style)
-
-        parent_index = current_node.get_idx_parent_node()
-        if parent_index in self.tree_structure:
-            tail_name = "{}_{}".format(graph.name, parent_index)
-            head_name = "{}_{}".format(graph.name, index)
-            if current_node.is_left_child():
-                graph.edge(tail_name=tail_name, head_name=head_name, label="T")
-            else:
-                graph.edge(tail_name=tail_name, head_name=head_name, label="F")
-
-        left_child = current_node.get_idx_left_child()
-        right_child = current_node.get_idx_right_child()
-        graph = self._digraph_tree_traversal(left_child, graph)
-        graph = self._digraph_tree_traversal(right_child, graph)
-
-        return graph
-
     def predict_output(self, num_observations):
         output = np.zeros(num_observations)
         for node_index in self.idx_leaf_nodes:
             current_node = self.get_node(node_index)
             output[current_node.idx_data_points] = current_node.value
         return output

-    def out_of_sample_predict(self, x):
-        """
-        Predict output of tree for an unobserved point.
-
-        Parameters
-        ----------
-        x : np.ndarray
-
-        Returns
-        -------
-        float
-            Value of the leaf value where the unobserved point lies.
-        """
-        leaf_node = self._traverse_tree(x=x, node_index=0)
-        return leaf_node.value
-
     def _traverse_tree(self, x, node_index=0):
         """
         Traverse the tree starting from a particular node given an unobserved point.
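Among the methods that survive this cleanup, `predict_output` is the one the sampler calls
in its inner loop; it simply scatters each leaf's value onto the data points that leaf owns.
A toy rendering of the same computation (values invented):

    import numpy as np

    leaves = [  # (value, idx_data_points) pairs, as stored on LeafNode
        (0.3, np.array([0, 1, 4])),
        (-0.1, np.array([2, 3])),
    ]

    num_observations = 5
    output = np.zeros(num_observations)
    for value, idx_data_points in leaves:
        output[idx_data_points] = value

    print(output)  # [ 0.3  0.3 -0.1 -0.1  0.3] -- this tree's share of the sum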
@@ -257,7 +83,8 @@ def _traverse_tree(self, x, node_index=0):
         """
         current_node = self.get_node(node_index)
         if isinstance(current_node, SplitNode):
-            if current_node.evaluate_splitting_rule(x):
+            # if current_node.evaluate_splitting_rule(x):
+            if x is not np.NaN:
                 left_child = current_node.get_idx_left_child()
                 final_node = self._traverse_tree(x, left_child)
             else:
@@ -311,29 +138,12 @@ def init_tree(tree_id, leaf_node_value, idx_data_points):
         new_tree[0] = LeafNode(index=0, value=leaf_node_value, idx_data_points=idx_data_points)
         return new_tree

-    def get_tree_depth(self):
-        """
-
-        Returns
-        -------
-
-        """
-        max_depth = 0
-        for idx_leaf_node in self.idx_leaf_nodes:
-            leaf_node = self.get_node(idx_leaf_node)
-            if max_depth < leaf_node.depth:
-                max_depth = leaf_node.depth
-        return max_depth
-

 class BaseNode:
     def __init__(self, index):
         self.index = index
         self.depth = int(math.floor(math.log(index + 1, 2)))

-    def __eq__(self, other):
-        return self.index == other.index and self.depth == other.depth
-
     def get_idx_parent_node(self):
         return (self.index - 1) // 2

@@ -343,12 +153,6 @@ def get_idx_left_child(self):
     def get_idx_right_child(self):
         return self.get_idx_left_child() + 1

-    def is_left_child(self):
-        return bool(self.index % 2)
-
-    def get_idx_sibling(self):
-        return (self.index + 1) if self.is_left_child() else (self.index - 1)
-

 class SplitNode(BaseNode):
     def __init__(self, index, idx_split_variable, split_value):
@@ -357,51 +161,9 @@ def __init__(self, index, idx_split_variable, split_value):
         self.idx_split_variable = idx_split_variable
         self.split_value = split_value

-    def __repr__(self):
-        return "SplitNode(index={}, idx_split_variable={}, split_value={})".format(
-            self.index, self.idx_split_variable, self.split_value
-        )
-
-    def __str__(self):
-        return "x[{}] <= {}".format(self.idx_split_variable, self.split_value)
-
-    def __eq__(self, other):
-        if isinstance(other, SplitNode):
-            return (
-                super().__eq__(other)
-                and self.idx_split_variable == other.idx_split_variable
-                and self.split_value == other.split_value
-            )
-        else:
-            return NotImplemented
-
-    def evaluate_splitting_rule(self, x):
-        if x is np.NaN:
-            return False
-        else:
-            return x[self.idx_split_variable] <= self.split_value
-

 class LeafNode(BaseNode):
     def __init__(self, index, value, idx_data_points):
         super().__init__(index)
         self.value = value
         self.idx_data_points = idx_data_points
-
-    def __repr__(self):
-        return "LeafNode(index={}, value={}, len(idx_data_points)={})".format(
-            self.index, self.value, len(self.idx_data_points)
-        )
-
-    def __str__(self):
-        return "{}".format(self.value)
-
-    def __eq__(self, other):
-        if isinstance(other, LeafNode):
-            return (
-                super().__eq__(other)
-                and self.value == other.value
-                and np.array_equal(self.idx_data_points, other.idx_data_points)
-            )
-        else:
-            return NotImplemented
diff --git a/pymc3/step_methods/pgbart.py b/pymc3/step_methods/pgbart.py
index e5d21a3dc39..4576807c0a7 100644
--- a/pymc3/step_methods/pgbart.py
+++ b/pymc3/step_methods/pgbart.py
@@ -199,10 +199,12 @@ def resample(self, list_of_particles, normalized_weights):

 class Particle:
     def __init__(self, tree, prior_prob_leaf_node):
-        self.tree = tree.copy()  # Mantiene el arbol que nos interesa en este momento
-        self.expansion_nodes = self.tree.idx_leaf_nodes.copy()  # This should be the array [0]
-        self.tree_history = [self.tree.copy()]
-        self.expansion_nodes_history = [self.expansion_nodes.copy()]
+        self.tree = tree.copy()  # keeps the tree we care about at the moment
+        self.expansion_nodes = tree.idx_leaf_nodes.copy()  # This should be the array [0]
+        # self.tree_history = [self.tree.copy()]
+        # self.expansion_nodes_history = [self.expansion_nodes.copy()]
+        self.tree_history = [self.tree]
+        self.expansion_nodes_history = [self.expansion_nodes]
         self.log_weight = 0.0
         self.prior_prob_leaf_node = prior_prob_leaf_node

@@ -210,11 +212,7 @@ def sample_tree_sequential(self, bart):
         if self.expansion_nodes:
             index_leaf_node = self.expansion_nodes.pop(0)
             # Probability that this node will remain a leaf node
-            # try:
-            #     print(self.tree[index_leaf_node].depth)
             prob_leaf = self.prior_prob_leaf_node[self.tree[index_leaf_node].depth]
-            # except IndexError:
-            #     prob_leaf = 1

             if prob_leaf < np.random.random():
                 self.grow_successful = bart.grow_tree(self.tree, index_leaf_node)
@@ -223,8 +221,10 @@ def sample_tree_sequential(self, bart):
                     # Add new leaf nodes indexes
                     new_indexes = self.tree.idx_leaf_nodes[-2:]
                     self.expansion_nodes.extend(new_indexes)
-            self.tree_history.append(self.tree.copy())
-            self.expansion_nodes_history.append(self.expansion_nodes.copy())
+            # self.tree_history.append(self.tree.copy())
+            # self.expansion_nodes_history.append(self.expansion_nodes.copy())
+            self.tree_history.append(self.tree)
+            self.expansion_nodes_history.append(self.expansion_nodes)

     def set_particle_to_step(self, t):
         if len(self.tree_history) <= t:
@@ -234,14 +234,6 @@ def set_particle_to_step(self, t):
         self.tree = self.tree_history[t]
         self.expansion_nodes = self.expansion_nodes_history[t]

-    def init_weight(self):
-        # TODO
-        return 1.0
-
-    def update_weight(self):
-        # TODO
-        pass
-
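These refactors keep the running-sum residual bookkeeping intact, so the leave-one-out
identity behind `get_residuals_loo` is worth spelling out once (toy numbers, m = 3 trees):

    import numpy as np

    predictions = np.array([[1.0, 2.0],   # tree 0
                            [0.5, 0.5],   # tree 1
                            [0.2, 0.1]])  # tree 2
    Y = np.array([2.0, 3.0])
    sum_trees_output = predictions.sum(axis=0)  # cached once, updated in place

    j = 1  # tree being refit
    R_j = Y - (sum_trees_output - predictions[j])
    assert np.allclose(R_j, Y - (predictions[0] + predictions[2]))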
From 0ff5833ddfb9d67b81d1892e52aa1ea8bacb4bac Mon Sep 17 00:00:00 2001
From: aloctavodia
Date: Wed, 21 Oct 2020 02:35:01 -0300
Subject: [PATCH 11/24] speed-up by fitting a subset of trees per step

---
 pymc3/distributions/bart.py  |  5 +----
 pymc3/step_methods/pgbart.py | 31 ++++++++++++++++++-------------
 2 files changed, 19 insertions(+), 17 deletions(-)

diff --git a/pymc3/distributions/bart.py b/pymc3/distributions/bart.py
index 40189afa51e..49232b6146a 100644
--- a/pymc3/distributions/bart.py
+++ b/pymc3/distributions/bart.py
@@ -11,10 +11,8 @@ class BARTParamsError(Exception):

 class BaseBART(NoDistribution):
     def __init__(self, X, Y, m=200, alpha=0.25, *args, **kwargs):
-
-        self.Y_shared = Y
         self.X = X
-        self.Y = Y.eval()
+        self.Y = Y
         super().__init__(
             shape=X.shape[0], dtype="float64", testval=0, *args, **kwargs
         )  # FIXME dtype and testvalue are nonsensical
@@ -168,7 +166,6 @@ def draw_leaf_value(self, idx_data_points):
         # draw the residual mean
         R_j = self.get_residuals()[idx_data_points]
         draw = R_j.mean()
-
         return draw

diff --git a/pymc3/step_methods/pgbart.py b/pymc3/step_methods/pgbart.py
index 4576807c0a7..a664bf72814 100644
--- a/pymc3/step_methods/pgbart.py
+++ b/pymc3/step_methods/pgbart.py
@@ -6,8 +6,6 @@
 from ..model import modelcontext
 from ..theanof import inputvars, make_shared_replacements

-# from ..smc.smc import logp_forw
-

 class PGBART(ArrayStepShared):
@@ -31,15 +29,18 @@ class PGBART(ArrayStepShared):
     name = "bartsampler"
     default_blocked = False

-    def __init__(self, vars=None, num_particles=10, max_stages=5000, model=None):
+    def __init__(self, vars=None, num_particles=10, max_stages=100, chunk="auto", model=None):
         model = modelcontext(model)
         vars = inputvars(vars)
         self.bart = vars[0].distribution
         self.prior_prob_leaf_node = _compute_prior_probability(self.bart.alpha)
+        self.tune = True
+        self.idx = 0
+        if chunk == "auto":
+            self.chunk = min(1, int(self.bart.m * 0.10))
         self.num_particles = num_particles
         self.max_stages = max_stages
-        self.first_iteration = True
         self.previous_trees_particles_list = []
         for i in range(self.bart.m):
             p = Particle(self.bart.trees[i], self.prior_prob_leaf_node)
             self.previous_trees_particles_list.append(p)

         shared = make_shared_replacements(vars, model)
         self.likelihood_logp = logp_forw([model.datalogpt], vars, shared)
         super().__init__(vars, shared)

     def astep(self, q_0):
         # For the first iteration we restrict max_stages to a low number, otherwise it is almost sure
         # we will reach max_stages given that our first set of m trees is not good at all.
         # maybe we can set max_stages by using some function of the number of variables/dimensions.
-        if self.first_iteration:
+        bart = self.bart
+
+        if self.tune:
             max_stages = 5
-            self.first_iteration = False
         else:
             max_stages = self.max_stages

-        # Step 4 of algorithm
-        bart = self.bart
+        if self.idx == bart.m:
+            self.idx = 0

+        # Step 4 of algorithm
         num_observations = bart.num_observations
         likelihood_logp = self.likelihood_logp
-        output = np.zeros((bart.m, num_observations))
         log_num_particles = np.log(self.num_particles)
-        for idx, tree in enumerate(bart.trees):
+
+        for idx in range(self.idx, self.idx + self.chunk):
+            if idx > bart.m:
+                break
+            self.idx += 1
+            tree = bart.trees[idx]
             R_j = bart.get_residuals_loo(tree)
-            bart.Y_shared.set_value(R_j)
             # generate an initial set of SMC particles
             # At the end of the algorithm we return one of these particles as a new tree
             list_of_particles = self.init_particles(tree.tree_id, R_j, bart.num_observations)
@@ -142,10 +148,9 @@ def astep(self, q_0):
             new_tree = np.random.choice(list_of_particles, p=normalized_weights)
             self.previous_trees_particles_list[tree.tree_id] = new_tree
             bart.trees[idx] = new_tree.tree
             new_prediction = new_tree.tree.predict_output(num_observations)
-            output[idx] = new_prediction
             bart.sum_trees_output = bart.Y - R_j + new_prediction

-        return np.sum(output, axis=0)
+        return bart.sum_trees_output

From 3419e70d4da871d540daf6048e94655e12cd3cc4 Mon Sep 17 00:00:00 2001
From: aloctavodia
Date: Wed, 21 Oct 2020 03:13:34 -0300
Subject: [PATCH 12/24] choose max

---
 pymc3/step_methods/pgbart.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pymc3/step_methods/pgbart.py b/pymc3/step_methods/pgbart.py
index a664bf72814..8f19a849e81 100644
--- a/pymc3/step_methods/pgbart.py
+++ b/pymc3/step_methods/pgbart.py
@@ -38,7 +38,7 @@ def __init__(self, vars=None, num_particles=10, max_stages=100, chunk="auto", mo
         self.tune = True
         self.idx = 0
         if chunk == "auto":
-            self.chunk = min(1, int(self.bart.m * 0.10))
+            self.chunk = max(1, int(self.bart.m * 0.1))
         self.num_particles = num_particles
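Patch 12's one-line change matters more than it looks: with the default m = 200 trees,
`min` pinned the per-step chunk to a single tree, while `max` yields the intended 10% and
still guards the small-m case. A quick check:

    m = 200
    assert min(1, int(m * 0.1)) == 1   # old behavior: always one tree per step
    assert max(1, int(m * 0.1)) == 20  # fixed: 10% of the trees

    m = 5
    assert max(1, int(m * 0.1)) == 1   # still at least one tree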
From 51165d4b94d75908f2efcfea3be5bba5379fbd53 Mon Sep 17 00:00:00 2001
From: aloctavodia
Date: Wed, 21 Oct 2020 13:41:30 -0300
Subject: [PATCH 13/24] improve docstrings

---
 pymc3/distributions/bart.py  | 60 ++++++++++++++++----------------
 pymc3/step_methods/pgbart.py | 25 +++++++++------
 2 files changed, 49 insertions(+), 36 deletions(-)

diff --git a/pymc3/distributions/bart.py b/pymc3/distributions/bart.py
index 49232b6146a..d87cbf1511b 100644
--- a/pymc3/distributions/bart.py
+++ b/pymc3/distributions/bart.py
@@ -4,31 +4,17 @@

 __all__ = ["BART"]

+
 class BaseBART(NoDistribution):
     def __init__(self, X, Y, m=200, alpha=0.25, *args, **kwargs):
         self.X = X
         self.Y = Y
-        super().__init__(
-            shape=X.shape[0], dtype="float64", testval=0, *args, **kwargs
-        )  # FIXME dtype and testvalue are nonsensical
+        super().__init__(shape=X.shape[0], dtype="float64", testval=0, *args, **kwargs)

-        if not isinstance(self.X, np.ndarray) or self.X.dtype.type is not np.float64:
-            raise BARTParamsError(
-                "The design matrix X type must be numpy.ndarray where every item"
-                " type is numpy.float64"
-            )
         if self.X.ndim != 2:
             raise BARTParamsError("The design matrix X must have two dimensions")
-        # if not isinstance(self.Y, np.ndarray) or self.Y.dtype.type is not np.float64:
-        #     raise BARTParamsError(
-        #         "The response matrix Y type must be numpy.ndarray where every item"
-        #         " type is numpy.float64"
-        #     )
+
         if self.Y.ndim != 1:
             raise BARTParamsError("The response matrix Y must have one dimension")
         if self.X.shape[0] != self.Y.shape[0]:
@@ -38,6 +37,7 @@ def __init__(self, X, Y, m=200, alpha=0.25, *args, **kwargs):
         self.m = m
         self.alpha = alpha
         self.trees = self.init_list_of_trees()
+        self.mean = fast_mean()

     def init_list_of_trees(self):
         initial_value_leaf_nodes = self.Y.mean() / self.m
@@ -50,8 +50,8 @@ def init_list_of_trees(self):
             idx_data_points=initial_idx_data_points_leaf_nodes,
         )
         list_of_trees.append(new_tree)
-        # Diff trick to speed computation of residuals.
-        # Taken from Section 3.1 of Kapelner, A and Bleich, J. bartMachine: A Powerful Tool for Machine Learning in R. ArXiv e-prints, 2013
+        # Diff trick to speed computation of residuals. From Section 3.1 of Kapelner, A and Bleich, J.
+        # bartMachine: A Powerful Tool for Machine Learning in R. ArXiv e-prints, 2013
         # The sum_trees_output will contain the sum of the predicted output for all trees.
         # When R_j is needed we subtract the current predicted output for tree T_j.
         self.sum_trees_output = np.full_like(self.Y, self.Y.mean())
@@ -141,7 +141,6 @@ def get_new_idx_data_points(self, current_split_node, idx_data_points):

         return left_node_idx_data_points, right_node_idx_data_points

-    def get_residuals_loo(self, tree):
-        R_j = self.Y - (self.sum_trees_output - tree.predict_output(self.num_observations))
-        return R_j

     def get_residuals(self):
+        """Compute the residuals."""
         R_j = self.Y - self.sum_trees_output
         return R_j

+    def get_residuals_loo(self, tree):
+        """Compute the residuals without leaving the passed tree out."""
+        R_j = self.Y - (self.sum_trees_output - tree.predict_output(self.num_observations))
+        return R_j
+
     def draw_leaf_value(self, idx_data_points):
-        # draw the residual mean
+        """Draw the residual mean."""
         R_j = self.get_residuals()[idx_data_points]
-        draw = R_j.mean()
+        draw = self.mean(R_j)
         return draw


+def fast_mean():
+    """If available use Numba to speed up the computation of the mean."""
+    try:
+        from numba import jit
+    except ImportError:
+        return np.mean
+
+    @jit
+    def mean(a):
+        count = a.shape[0]
+        suma = 0
+        for i in range(count):
+            suma += a[i]
+        return suma / count
+
+    return mean
+
+
 def discrete_uniform_sampler(upper_value):
+    """Draw from the uniform distribution with bounds [0, upper_value)."""
     return int(np.random.random() * upper_value)


 class BART(BaseBART):
     """
     BART distribution.

-    Distributon representing a sum over trees
+    Distribution representing a sum over trees

     Parameters
     ----------
     X :
         The design matrix.
     Y :
         The response vector.
     m : int
         Number of trees
     alpha : float
         Control the prior probability over the depth of the trees. Must be in the interval (0, 1),
-        although it is recommended to be between in the interval (0, 0.5].
+        although it is recommended to be in the interval (0, 0.5].
     """

     def __init__(self, X, Y, m=200, alpha=0.25):
         super().__init__(X, Y, m, alpha)

     def _repr_latex_(self, name=None, dist=None):
         if dist is None:
             dist = self
         X = (type(self.X),)
         Y = (type(self.Y),)
-        m = (self.m,)
         alpha = self.alpha
         m = self.m
         name = r"\text{%s}" % name
+def compute_prior_probability(alpha):
+    """
+    Calculate the probability of the node being a LeafNode (1 - p(being SplitNode)).
+    Taken from equation 19 in [Rockova2018].
+
+    Parameters
+    ----------
+    alpha : float
+
+    Returns
+    -------
+    float
+
+    References
+    ----------
+    .. [Rockova2018] Veronika Rockova, Enakshi Saha (2018). On the theory of BART.
+        arXiv, `link `__
+    """
+    prior_leaf_prob = [0]
+    depth = 1
+    while prior_leaf_prob[-1] < 1:
+        prior_leaf_prob.append(1 - alpha ** depth)
+        depth += 1
+    return prior_leaf_prob
+
diff --git a/pymc3/step_methods/pgbart.py b/pymc3/step_methods/pgbart.py
index 8f19a849e81..5edbf6b3044 100644
--- a/pymc3/step_methods/pgbart.py
+++ b/pymc3/step_methods/pgbart.py
@@ -9,21 +9,26 @@ class PGBART(ArrayStepShared):
     """
-    Write me!!!
+    Particle Gibbs BART sampling step
+
+    Parameters
+    ----------)
+    vars: list
+        List of variables for sampler
+    num_particles : int
+        Number of particles for the SMC sampler. Defaults to 10.
+    max_stages : int
+        Maximum number of iterations of the SMC sampler. Defaults to 100.
+    chunk : int
+        Number of trees fitted per step. Defaults to "auto", which is 10% of the `m` trees.
+    model: PyMC Model
+        Optional model for sampling step. Defaults to None (taken from context).

     References
     ----------
     .. [Lakshminarayanan2015] Lakshminarayanan, B. and Roy, D.M. and Teh, Y. W., (2015),
         Particle Gibbs for Bayesian Additive Regression Trees.
-        ArviX,
-        `link `__
-
-    Parameters
-    ----------
-    lower: float
-        Lower limit.
-    upper: float
-        Upper limit.
+        ArXiv, `link `__
     """

     name = "bartsampler"
     default_blocked = False
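With the docstrings in place, the intended user-facing pattern looks roughly like the
following sketch. It is not taken from the diff: the data, the `sigma` prior, and the
model shape are invented, and it assumes `BART` and `PGBART` are reachable from the
`pymc3` namespace and that the sampler assigns `PGBART` to the BART variable through its
`competence` method shown earlier in this series:

    import numpy as np
    import pymc3 as pm

    X = np.random.uniform(size=(100, 2))   # float64 2-D design matrix, as required
    Y = np.random.normal(size=100)         # float64 1-D response

    with pm.Model() as model:
        sigma = pm.HalfNormal("sigma", 1.0)
        mu = pm.BART("mu", X, Y, m=50, alpha=0.25)
        y = pm.Normal("y", mu=mu, sigma=sigma, observed=Y)
        trace = pm.sample()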
From 03758a4758f7065adc524b56f57ffc1f20d1b3c1 Mon Sep 17 00:00:00 2001
From: aloctavodia
Date: Thu, 22 Oct 2020 15:27:51 -0300
Subject: [PATCH 14/24] refactor and clean code

---
 pymc3/distributions/bart.py  |  35 ++++++-
 pymc3/step_methods/pgbart.py | 197 +++++++++++++----------------------
 2 files changed, 103 insertions(+), 129 deletions(-)

diff --git a/pymc3/distributions/bart.py b/pymc3/distributions/bart.py
index d87cbf1511b..e95753f6faf 100644
--- a/pymc3/distributions/bart.py
+++ b/pymc3/distributions/bart.py
@@ -4,13 +4,12 @@

 __all__ = ["BART"]

+
 class BaseBART(NoDistribution):
     def __init__(self, X, Y, m=200, alpha=0.25, *args, **kwargs):
         self.X = X
         self.Y = Y
         super().__init__(shape=X.shape[0], dtype="float64", testval=0, *args, **kwargs)
@@ -38,6 +37,7 @@ def __init__(self, X, Y, m=200, alpha=0.25, *args, **kwargs):
         self.alpha = alpha
         self.trees = self.init_list_of_trees()
         self.mean = fast_mean()
+        self.prior_prob_leaf_node = compute_prior_probability(alpha)

     def init_list_of_trees(self):
diff --git a/pymc3/step_methods/pgbart.py b/pymc3/step_methods/pgbart.py
index 5edbf6b3044..28257afedab 100644
--- a/pymc3/step_methods/pgbart.py
+++ b/pymc3/step_methods/pgbart.py
@@ -1,10 +1,11 @@
 import numpy as np
+from theano import function as theano_function

 from .arraystep import ArrayStepShared, Competence
 from ..distributions import BART
 from ..distributions.tree import Tree
 from ..model import modelcontext
-from ..theanof import inputvars, make_shared_replacements
+from ..theanof import inputvars, make_shared_replacements, join_nonshared_inputs

@@ -34,45 +35,45 @@ class PGBART(ArrayStepShared):
     name = "bartsampler"
     default_blocked = False

-    def __init__(self, vars=None, num_particles=10, max_stages=100, chunk="auto", model=None):
+    def __init__(self, vars=None, num_particles=10, max_stages=5000, chunk="auto", model=None):
         model = modelcontext(model)
         vars = inputvars(vars)
         self.bart = vars[0].distribution
-        self.prior_prob_leaf_node = _compute_prior_probability(self.bart.alpha)
         self.tune = True
         self.idx = 0
         if chunk == "auto":
             self.chunk = max(1, int(self.bart.m * 0.1))
         self.num_particles = num_particles
+        self.log_num_particles = np.log(num_particles)
+        self.indices = list(range(1, num_particles))

         self.max_stages = max_stages
         self.previous_trees_particles_list = []
         for i in range(self.bart.m):
-            p = Particle(self.bart.trees[i], self.prior_prob_leaf_node)
+            p = Particle(self.bart.trees[i], self.bart.prior_prob_leaf_node)
             self.previous_trees_particles_list.append(p)

         shared = make_shared_replacements(vars, model)
-        self.likelihood_logp = logp_forw([model.datalogpt], vars, shared)
+        self.likelihood_logp = logp([model.datalogpt], vars, shared)
         super().__init__(vars, shared)

-    def astep(self, q_0):
+    def astep(self, _):
         bart = self.bart

-        # For the first iteration we restrict max_stages to a low number, otherwise it is almost sure
-        # we will reach max_stages given that our first set of m trees is not good at all.
-        # maybe we can set max_stages by using some function of the number of variables/dimensions.
+        # For the tuning phase we restrict max_stages to a low number, otherwise it is almost sure
+        # we will reach max_stages given that our first set of m trees is not good at all.
+        # Can set max_stages as a function of the number of variables/dimensions?
+        # if self.tune:
+        #     max_stages = 5
+        # else:
+        #     max_stages = self.max_stages

         if self.idx == bart.m:
             self.idx = 0

         # Step 4 of algorithm
         num_observations = bart.num_observations

         for idx in range(self.idx, self.idx + self.chunk):
             if idx > bart.m:
                 break
             self.idx += 1
             tree = bart.trees[idx]
             R_j = bart.get_residuals_loo(tree)
-            # generate an initial set of SMC particles
-            # At the end of the algorithm we return one of these particles as a new tree
+            # Generate an initial set of SMC particles
+            # at the end of the algorithm we return one of these particles as the new tree
             particles = self.init_particles(tree.tree_id, R_j, bart.num_observations)

             for t in range(1, max_stages):
-                # Step 7 of algorithm
-                list_of_particles[0] = self.get_previous_tree_particle(tree.tree_id, t)
-                # This should be embarrassingly parallelizable
+                # get previous particle at stage t
+                particles[0] = self.get_previous_tree_particle(tree.tree_id, t)
+                # sample each particle (try to grow each tree)
                 for c in range(1, self.num_particles):
-                    # Step 9 of algorithm
-                    list_of_particles[c].sample_tree_sequential(bart)
-
-                # Step 12 of algorithm
-                for p_idx, p in enumerate(list_of_particles):
-                    new_likelihood = likelihood_logp(p.tree.predict_output(num_observations))
-                    p.log_weight += new_likelihood - old_likelihoods[p_idx]
-                    old_likelihoods[p_idx] = new_likelihood
-                    log_weights[p_idx] = p.log_weight
+                    particles[c].sample_tree_sequential(bart)

+                # Update weights
+                # Since the prior is used as the proposal, the weights are updated additively
+                # as the ratio of the new and old log_likelihoods
+                for p_idx, p in enumerate(particles):
+                    new_likelihood = self.likelihood_logp(p.tree.predict_output(num_observations))
+                    p.log_weight += new_likelihood - p.old_likelihood_logp
+                    p.old_likelihood_logp = new_likelihood
+
+                # Normalize weights
+                W, normalized_weights = self.normalize(particles)

-                W, normalized_weights = self._normalize(log_weights)
-
-                # Step 15 of algorithm
                 # resample all but first particle
                 re_n_w = normalized_weights[1:] / normalized_weights[1:].sum()
-                indices = range(1, len(list_of_particles))
-                new_indices = np.random.choice(indices, size=len(list_of_particles) - 1, p=re_n_w)
-                list_of_particles[1:] = np.array(list_of_particles)[new_indices]
-                old_likelihoods[1:] = old_likelihoods[new_indices]
-
-                # Step 16 of algorithm
-                w_t = W - log_num_particles
-                for c in range(self.num_particles):
-                    list_of_particles[c].log_weight = w_t
-
-                ### Should we used this one!!!
-                # Step 17 of algorithm
-                # non_available_nodes_for_expansion = np.ones(self.num_particles)
-                # for c in range(self.num_particles):
-                #     if len(list_of_particles[c].expansion_nodes) != 0:
-                #         non_available_nodes_for_expansion[c] = 0
-                # if np.all(non_available_nodes_for_expansion):
-                #     break
-                ### Or this one !!!
-                non_available_nodes_for_expansion = True
-                for c in range(self.num_particles):
-                    if len(list_of_particles[c].expansion_nodes) != 0:
-                        non_available_nodes_for_expansion = False
-                        break
-                if non_available_nodes_for_expansion:
+                new_indices = np.random.choice(self.indices, size=len(self.indices), p=re_n_w)
+                particles[1:] = particles[new_indices]
+
+                # Set the new weights
+                w_t = W - self.log_num_particles
+                for p in particles:
+                    p.log_weight = w_t
+
+                # Check if particles can keep growing, otherwise stop iterating
+                non_available_nodes_for_expansion = np.ones(self.num_particles - 1)
+                for c in range(1, self.num_particles):
+                    if len(particles[c].expansion_nodes) != 0:
+                        non_available_nodes_for_expansion[c - 1] = 0
+                if np.all(non_available_nodes_for_expansion):
                     break

-            new_tree = np.random.choice(list_of_particles, p=normalized_weights)
+            # Get the new tree and update
+            new_tree = np.random.choice(particles, p=normalized_weights)
             self.previous_trees_particles_list[tree.tree_id] = new_tree
             bart.trees[idx] = new_tree.tree
             new_prediction = new_tree.tree.predict_output(num_observations)
             bart.sum_trees_output = bart.Y - R_j + new_prediction

         return bart.sum_trees_output

     @staticmethod
     def competence(var, has_grad):
         if isinstance(var.distribution, BART):
             return Competence.IDEAL
         return Competence.INCOMPATIBLE

-    def _normalize(self, log_w):  # this function needs a home sweet home
+    def normalize(self, particles):
         """
         Use logsumexp trick to get W and softmax to get normalized_weights
         """
+        log_w = np.array([p.log_weight for p in particles])
         log_w_max = log_w.max()
         log_w_ = log_w - log_w_max
         w_ = np.exp(log_w_)
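`normalize` is a standard logsumexp/softmax pair; spelled out with made-up particle
weights:

    import numpy as np

    log_w = np.array([-10.0, -11.0, -9.5])  # particle log-weights (invented)
    log_w_max = log_w.max()
    w_ = np.exp(log_w - log_w_max)
    w_sum = w_.sum()

    W = np.log(w_sum) + log_w_max       # log of the total unnormalized weight
    normalized_weights = w_ / w_sum     # softmax: sums to 1, underflow-safe
    assert np.isclose(normalized_weights.sum(), 1.0)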
 self.tree_history = [self.tree]
 self.expansion_nodes_history = [self.expansion_nodes]
- self.log_weight = 0.0
+ self.log_weight = 0
 self.prior_prob_leaf_node = prior_prob_leaf_node
+ self.old_likelihood_logp = likelihood
 def sample_tree_sequential(self, bart):
 if self.expansion_nodes:
@@ -245,38 +225,7 @@ def set_particle_to_step(self, t):
 self.expansion_nodes = self.expansion_nodes_history[t]
-def _compute_prior_probability(alpha):
- """
- Calculate the probability of the node being a LeafNode (1 - p(being SplitNode)).
- Taken from equation 19 in [On Theory for BART]. XXX FIX reference below!
-
- Parameters
- ----------
- alpha : float
-
- Returns
- -------
- float
-
- References
- ----------
- .. [Chipman2010] Chipman, H. A., George, E. I., & McCulloch, R. E. (2010). BART: Bayesian
- additive regression trees. The Annals of Applied Statistics, 4(1), 266-298.,
- `link `__
- """
- prior_leaf_prob = [0]
- depth = 1
- while prior_leaf_prob[-1] < 1:
- prior_leaf_prob.append(1 - alpha ** depth)
- depth += 1
- return prior_leaf_prob
-
-
-from theano import function as theano_function
-from ..theanof import join_nonshared_inputs
-
-
-def logp_forw(out_vars, vars, shared):
+def logp(out_vars, vars, shared):
 """Compile Theano function of the model and the input and output variables.
 Parameters

From c3c3929349ec9fb90794994f2ac49c1ab56c7a93 Mon Sep 17 00:00:00 2001
From: aloctavodia
Date: Thu, 22 Oct 2020 16:07:02 -0300
Subject: [PATCH 15/24] clean docstrings

---
 pymc3/distributions/tree.py | 1 -
 pymc3/step_methods/pgbart.py | 66 ++++++++++++++++++++----------------
 2 files changed, 36 insertions(+), 31 deletions(-)

diff --git a/pymc3/distributions/tree.py b/pymc3/distributions/tree.py
index 5b7d9f2a99f..35413e2f6d3 100644
--- a/pymc3/distributions/tree.py
+++ b/pymc3/distributions/tree.py
@@ -83,7 +83,6 @@ def _traverse_tree(self, x, node_index=0):
 """
 current_node = self.get_node(node_index)
 if isinstance(current_node, SplitNode):
- # if current_node.evaluate_splitting_rule(x):
 if x is not np.NaN:
 left_child = current_node.get_idx_left_child()
 final_node = self._traverse_tree(x, left_child)
diff --git a/pymc3/step_methods/pgbart.py b/pymc3/step_methods/pgbart.py
index 28257afedab..850d12281b7 100644
--- a/pymc3/step_methods/pgbart.py
+++ b/pymc3/step_methods/pgbart.py
@@ -13,7 +13,7 @@ class PGBART(ArrayStepShared):
 Particle Gibbs BART sampling step
 Parameters
- ----------)
+ ----------
 vars: list
 List of variables for sampler
 num_particles : int
@@ -49,10 +49,10 @@ def __init__(self, vars=None, num_particles=10, max_stages=5000, chunk="auto", m
 self.indices = list(range(1, num_particles))
 self.max_stages = max_stages
- self.previous_trees_particles_list = []
+ self.old_trees_particles_list = []
 for i in range(self.bart.m):
 p = Particle(self.bart.trees[i], self.bart.prior_prob_leaf_node)
- self.previous_trees_particles_list.append(p)
+ self.old_trees_particles_list.append(p)
 shared = make_shared_replacements(vars, model)
 self.likelihood_logp = logp([model.datalogpt], vars, shared)
@@ -60,40 +60,37 @@ def __init__(self, vars=None, num_particles=10, max_stages=5000, chunk="auto", m
 def astep(self, _):
 bart = self.bart
+ num_observations = bart.num_observations
 # For the tuning phase we restrict max_stages to a low number, otherwise it is almost certain
 # we will reach max_stages given that our first set of m trees is not good at all.
 # Can set max_stages as a function of the number of variables/dimensions?
- # if self.tune:
- # max_stages = 5
- # else:
- # max_stages = self.max_stages
+ if self.tune:
+ max_stages = 5
+ else:
+ max_stages = self.max_stages
 if self.idx == bart.m:
 self.idx = 0
- # Step 4 of algorithm
- num_observations = bart.num_observations
-
 for idx in range(self.idx, self.idx + self.chunk):
 if idx > bart.m:
 break
 self.idx += 1
 tree = bart.trees[idx]
 R_j = bart.get_residuals_loo(tree)
- # generate an initial set of SMC particles
- # at the end of the algorithm we return one of these particles as a new tree
+ # Generate an initial set of SMC particles
+ # at the end of the algorithm we return one of these particles as the new tree
 particles = self.init_particles(tree.tree_id, R_j, bart.num_observations)
 for t in range(1, max_stages):
- # get previous particle at stage t
- particles[0] = self.get_previous_tree_particle(tree.tree_id, t)
+ # Get old particle at stage t
+ particles[0] = self.get_old_tree_particle(tree.tree_id, t)
 # sample each particle (try to grow each tree)
 for c in range(1, self.num_particles):
 particles[c].sample_tree_sequential(bart)
- # Update weights
- # Since the prior is used as the proposal, the weights are updated additively
- # by the difference between the new and old log-likelihoods
+ # Update weights. Since the prior is used as the proposal, the weights
+ # are updated additively by the difference between the new and old log-likelihoods
 for p_idx, p in enumerate(particles):
 new_likelihood = self.likelihood_logp(p.tree.predict_output(num_observations))
 p.log_weight += new_likelihood - p.old_likelihood_logp
@@ -102,7 +99,7 @@ def astep(self, _):
 # Normalize weights
 W, normalized_weights = self.normalize(particles)
- # resample all but first particle
+ # Resample all but first particle
 re_n_w = normalized_weights[1:] / normalized_weights[1:].sum()
 new_indices = np.random.choice(self.indices, size=len(self.indices), p=re_n_w)
 particles[1:] = particles[new_indices]
@@ -122,7 +119,7 @@ def astep(self, _):
 # Get the new tree and update
 new_tree = np.random.choice(particles, p=normalized_weights)
- self.previous_trees_particles_list[tree.tree_id] = new_tree
+ self.old_trees_particles_list[tree.tree_id] = new_tree
 bart.trees[idx] = new_tree.tree
 new_prediction = new_tree.tree.predict_output(num_observations)
 bart.sum_trees_output = bart.Y - R_j + new_prediction
@@ -154,19 +151,23 @@ def normalize(self, particles):
 return W, normalized_weights
- def get_previous_tree_particle(self, tree_id, t):
- previous_tree_particle = self.previous_trees_particles_list[tree_id]
- previous_tree_particle.set_particle_to_step(t)
- return previous_tree_particle
+ def get_old_tree_particle(self, tree_id, t):
+ old_tree_particle = self.old_trees_particles_list[tree_id]
+ old_tree_particle.set_particle_to_step(t)
+ return old_tree_particle
 def init_particles(self, tree_id, R_j, num_observations):
-
- prev_tree = self.get_previous_tree_particle(tree_id, 0)
+ """
+ Initialize particles
+ """
+ # The first particle is from the tree we are trying to replace
+ prev_tree = self.get_old_tree_particle(tree_id, 0)
 likelihood = self.likelihood_logp(prev_tree.tree.predict_output(num_observations))
 prev_tree.old_likelihood_logp = likelihood
 prev_tree.log_weight = likelihood - self.log_num_particles
 particles = [prev_tree]
+ # The rest of the particles are identically initialized
 initial_value_leaf_nodes = R_j.mean()
 initial_idx_data_points_leaf_nodes = np.arange(num_observations, dtype="int32")
 new_tree = Tree.init_tree(
@@ -183,12 +184,19 @@ def init_particles(self, tree_id, R_j, num_observations):
 return np.array(particles)
- def resample(self, particles, normalized_weights):
- particles = np.random.choice(particles, size=len(particles), p=normalized_weights)
+ def resample(self, particles, weights):
+ """
+ Resample a set of particles given their weights
+ """
+ particles = np.random.choice(particles, size=len(particles), p=weights)
 return particles
 class Particle:
+ """
+ Particle tree
+ """
+
 def __init__(self, tree, prior_prob_leaf_node, log_weight=0, likelihood=0):
 self.tree = tree.copy() # keeps the tree that we care about at the moment
 self.expansion_nodes = tree.idx_leaf_nodes.copy() # This should be the array [0]
@@ -206,13 +214,11 @@ def sample_tree_sequential(self, bart):
 if prob_leaf < np.random.random():
 self.grow_successful = bart.grow_tree(self.tree, index_leaf_node)
- # TODO: in case the grow_tree fails, should we try to sample the tree from another leaf node?
 if self.grow_successful:
 # Add new leaf nodes indexes
 new_indexes = self.tree.idx_leaf_nodes[-2:]
 self.expansion_nodes.extend(new_indexes)
- # self.tree_history.append(self.tree.copy())
- # self.expansion_nodes_history.append(self.expansion_nodes.copy())
+ self.tree_history.append(self.tree)
 self.expansion_nodes_history.append(self.expansion_nodes)

From 6a58daaeaf02ae8a67ff74b608db4b1844bf7e92 Mon Sep 17 00:00:00 2001
From: aloctavodia
Date: Fri, 23 Oct 2020 10:43:07 -0300
Subject: [PATCH 16/24] add tests and minor fixes.

Co-authored-by: aloctavodia
Co-authored-by: jmloyola
---
 pymc3/distributions/bart.py | 33 ++++++++++++++++++++++---------
 pymc3/distributions/tree.py | 14 ++++++++++++++
 pymc3/step_methods/pgbart.py | 14 ++++++++++++++
 3 files changed, 52 insertions(+), 9 deletions(-)

diff --git a/pymc3/distributions/bart.py b/pymc3/distributions/bart.py
index e95753f6faf..0896617d028 100644
--- a/pymc3/distributions/bart.py
+++ b/pymc3/distributions/bart.py
@@ -1,3 +1,17 @@
+# Copyright 2020 The PyMC Developers
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import numpy as np
 from .distribution import NoDistribution
 from .tree import Tree, SplitNode, LeafNode
@@ -32,7 +46,7 @@ def __init__(self, X, Y, m=200, alpha=0.25, *args, **kwargs):
 )
 self.num_observations = X.shape[0]
- self.number_variates = X.shape[1]
+ self.num_variates = X.shape[1]
 self.m = m
 self.alpha = alpha
 self.trees = self.init_list_of_trees()
@@ -66,7 +80,7 @@ def __repr_latex(self):
 def get_available_predictors(self, idx_data_points_split_node):
 possible_splitting_variables = []
- for j in range(self.number_variates):
+ for j in range(self.num_variates):
 x_j = self.X[idx_data_points_split_node, j]
 x_j = x_j[~np.isnan(x_j)]
 for i in range(1, len(x_j)):
@@ -169,7 +183,7 @@ def compute_prior_probability(alpha):
 Returns
 -------
- float
+ list with probabilities for leaf nodes
 References
 ----------
@@ -203,7 +217,7 @@ def mean(a):
 def discrete_uniform_sampler(upper_value):
- """Draw from the uniform distribuion with bounds [0, upper_value)."""
+ """Draw from the uniform distribution with bounds [0, upper_value)."""
 return int(np.random.random() * upper_value)
@@ -229,14 +243,15 @@ class BART(BaseBART):
 def __init__(self, X, Y, m=200, alpha=0.25):
 super().__init__(X, Y, m, alpha)
- def _repr_latex_(self, name=None, dist=None):
+ def _str_repr(self, name=None, dist=None, formatting="plain"):
 if dist is None:
 dist = self
 X = (type(self.X),)
 Y = (type(self.Y),)
 alpha = self.alpha
 m = self.m
- name = r"\text{%s}" % name
- return r"""${} \sim \text{{BART}}(\mathit{{alpha}}={},~\mathit{{m}}={})$""".format(
- name, alpha, m
- )
+
+ if formatting == "latex":
+ return f"$\\text{{{name}}} \\sim \\text{{BART}}(\\text{{alpha = }}\\text{{{alpha}}}, \\text{{m = }}\\text{{{m}}})$"
+ else:
+ return f"{name} ~ BART(alpha = {alpha}, m = {m})"
diff --git a/pymc3/distributions/tree.py b/pymc3/distributions/tree.py
index 35413e2f6d3..3b2c098e896 100644
--- a/pymc3/distributions/tree.py
+++ b/pymc3/distributions/tree.py
@@ -1,3 +1,17 @@
+# Copyright 2020 The PyMC Developers
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import numpy as np
 import math
 from copy import deepcopy
diff --git a/pymc3/step_methods/pgbart.py b/pymc3/step_methods/pgbart.py
index 850d12281b7..726fea4cf83 100644
--- a/pymc3/step_methods/pgbart.py
+++ b/pymc3/step_methods/pgbart.py
@@ -1,3 +1,17 @@
+# Copyright 2020 The PyMC Developers
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import numpy as np
 from theano import function as theano_function

From 9050469110b9ce00548cabbaf4c0a16e1bdca13f Mon Sep 17 00:00:00 2001
From: aloctavodia
Date: Fri, 23 Oct 2020 11:46:48 -0300
Subject: [PATCH 17/24] remove space.

Co-authored-by: aloctavodia
Co-authored-by: jmloyola
---
 pymc3/smc/smc.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/pymc3/smc/smc.py b/pymc3/smc/smc.py
index 69b94054da0..d6069122952 100644
--- a/pymc3/smc/smc.py
+++ b/pymc3/smc/smc.py
@@ -392,4 +392,3 @@ def __call__(self, posterior):
 if self.save_log_pseudolikelihood:
 self.save_lpl(elemwise)
 return elemwise.sum()
-

From 5fdd999adb1e8ae5a7ebf778f242343c08af36ee Mon Sep 17 00:00:00 2001
From: aloctavodia
Date: Thu, 29 Oct 2020 19:55:35 -0300
Subject: [PATCH 18/24] add variable importance report

---
 pymc3/distributions/bart.py | 8 ++------
 pymc3/sampling.py | 4 ++++
 pymc3/step_methods/pgbart.py | 30 +++++++++++++++++++++---------
 3 files changed, 27 insertions(+), 15 deletions(-)

diff --git a/pymc3/distributions/bart.py b/pymc3/distributions/bart.py
index 0896617d028..f20f5bc0da3 100644
--- a/pymc3/distributions/bart.py
+++ b/pymc3/distributions/bart.py
@@ -99,23 +99,20 @@ def get_available_splitting_rules(self, idx_data_points_split_node, idx_split_va
 def grow_tree(self, tree, index_leaf_node):
 # This can be unsuccessful when there are not available predictors
- successful_grow_tree = False
 current_node = tree.get_node(index_leaf_node)
 available_predictors = self.get_available_predictors(current_node.idx_data_points)
 if not available_predictors:
- return successful_grow_tree
+ return False, None
 index_selected_predictor = discrete_uniform_sampler(len(available_predictors))
 selected_predictor = available_predictors[index_selected_predictor]
-
 available_splitting_rules, _ = self.get_available_splitting_rules(
 current_node.idx_data_points, selected_predictor
 )
 index_selected_splitting_rule = discrete_uniform_sampler(len(available_splitting_rules))
 selected_splitting_rule = available_splitting_rules[index_selected_splitting_rule]
-
 new_split_node = SplitNode(
 index=index_leaf_node,
 idx_split_variable=selected_predictor,
@@ -140,9 +137,8 @@ def grow_tree(self, tree, index_leaf_node):
 idx_data_points=right_node_idx_data_points,
 )
 tree.grow_tree(index_leaf_node, new_split_node, new_left_node, new_right_node)
- successful_grow_tree = True
- return successful_grow_tree
+ return True, index_selected_predictor
 def get_new_idx_data_points(self, current_split_node, idx_data_points):
 idx_split_variable = current_split_node.idx_split_variable
diff --git a/pymc3/sampling.py b/pymc3/sampling.py
index c819abaeb05..5004d412362 100644
--- a/pymc3/sampling.py
+++ b/pymc3/sampling.py
@@ -606,6 +606,10 @@ def sample(
 trace.report._n_draws = n_draws
 trace.report._t_sampling = t_sampling
+ if "variable_inclusion" in trace.stat_names:
+ variable_inclusion = trace.get_sampler_stats("variable_inclusion")[::-draws]
+ trace.report.variable_importance = np.mean([vi / vi.sum() for vi in variable_inclusion], 0)
+
 n_chains = len(trace.chains)
 _log.info(
 f'Sampling {n_chains} chain{"s" if n_chains > 1 else ""} for {n_tune:_d} tune and {n_draws:_d} draw iterations '
diff --git a/pymc3/step_methods/pgbart.py b/pymc3/step_methods/pgbart.py
index 726fea4cf83..8a2744ead2d 100644
--- a/pymc3/step_methods/pgbart.py
+++ b/pymc3/step_methods/pgbart.py
@@ -31,9 +31,9 @@ class PGBART(ArrayStepShared):
 vars: list
 List of variables for sampler
 num_particles : int
- Number of particles for the SMC sampler. Defaults to 10
+ Number of particles for the conditional SMC sampler. Defaults to 10.
 max_stages : int
- Maximum number of Iterations of the SMC sampler Defaults to 100.
+ Maximum number of iterations of the conditional SMC sampler. Defaults to 100.
 chunk : int
 Number of trees fitted per step. Defaults to "auto", which is 10% of the `m` trees.
 model: PyMC Model
@@ -48,6 +48,8 @@ class PGBART(ArrayStepShared):
 name = "bartsampler"
 default_blocked = False
+ generates_stats = True
+ stats_dtypes = [{"variable_inclusion": np.ndarray}]
 def __init__(self, vars=None, num_particles=10, max_stages=5000, chunk="auto", model=None):
 model = modelcontext(model)
@@ -58,14 +60,14 @@ def __init__(self, vars=None, num_particles=10, max_stages=5000, chunk="auto", m
 self.idx = 0
 if chunk == "auto":
 self.chunk = max(1, int(self.bart.m * 0.1))
+ self.variable_inclusion = np.zeros(self.bart.num_variates)
 self.num_particles = num_particles
 self.log_num_particles = np.log(num_particles)
 self.indices = list(range(1, num_particles))
- self.max_stages = max_stages
 self.old_trees_particles_list = []
 for i in range(self.bart.m):
- p = Particle(self.bart.trees[i], self.bart.prior_prob_leaf_node)
+ p = ParticleTree(self.bart.trees[i], self.bart.prior_prob_leaf_node)
 self.old_trees_particles_list.append(p)
 shared = make_shared_replacements(vars, model)
 self.likelihood_logp = logp([model.datalogpt], vars, shared)
@@ -138,7 +140,13 @@ def astep(self, _):
 new_prediction = new_tree.tree.predict_output(num_observations)
 bart.sum_trees_output = bart.Y - R_j + new_prediction
- return bart.sum_trees_output
+ if not self.tune:
+ for index in new_tree.used_variates:
+ self.variable_inclusion[index] += 1
+
+ stats = {"variable_inclusion": self.variable_inclusion}
+
+ return bart.sum_trees_output, [stats]
 @staticmethod
 def competence(var, has_grad):
@@ -193,7 +201,7 @@ def init_particles(self, tree_id, R_j, num_observations):
 log_weight = likelihood_logp - self.log_num_particles
 for i in range(1, self.num_particles):
 particles.append(
- Particle(new_tree, self.bart.prior_prob_leaf_node, log_weight, likelihood_logp)
+ ParticleTree(new_tree, self.bart.prior_prob_leaf_node, log_weight, likelihood_logp)
 )
 return np.array(particles)
@@ -206,7 +214,7 @@ def resample(self, particles, weights):
 return particles
-class Particle:
+class ParticleTree:
 """
 Particle tree
 """
@@ -219,6 +227,7 @@ def __init__(self, tree, prior_prob_leaf_node, log_weight=0, likelihood=0):
 self.log_weight = 0
 self.prior_prob_leaf_node = prior_prob_leaf_node
 self.old_likelihood_logp = likelihood
+ self.used_variates = []
 def sample_tree_sequential(self, bart):
 if self.expansion_nodes:
@@ -227,11 +236,14 @@ def sample_tree_sequential(self, bart):
 prob_leaf = self.prior_prob_leaf_node[self.tree[index_leaf_node].depth]
 if prob_leaf < np.random.random():
- self.grow_successful = bart.grow_tree(self.tree, index_leaf_node)
- if self.grow_successful:
+ grow_successful, index_selected_predictor = bart.grow_tree(
+ self.tree, index_leaf_node
+ )
+ if grow_successful:
 # Add new leaf nodes indexes
 new_indexes = self.tree.idx_leaf_nodes[-2:]
 self.expansion_nodes.extend(new_indexes)
+ self.used_variates.append(index_selected_predictor)
 self.tree_history.append(self.tree)
 self.expansion_nodes_history.append(self.expansion_nodes)

From acc5290dda071fbc459e84743c2e7cb04ba7d3dc Mon Sep 17 00:00:00 2001
From: aloctavodia
Date: Wed, 4 Nov 2020 11:58:17 -0300
Subject: [PATCH 19/24] use ValueError

---
 pymc3/distributions/bart.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/pymc3/distributions/bart.py b/pymc3/distributions/bart.py
index f20f5bc0da3..cbffcda918d 100644
--- a/pymc3/distributions/bart.py
+++ b/pymc3/distributions/bart.py
@@ -26,21 +26,21 @@ def __init__(self, X, Y, m=200, alpha=0.25, *args, **kwargs):
 super().__init__(shape=X.shape[0], dtype="float64", testval=0, *args, **kwargs)
 if self.X.ndim != 2:
- raise BARTParamsError("The design matrix X must have two dimensions")
+ raise ValueError("The design matrix X must have two dimensions")
 if self.Y.ndim != 1:
- raise BARTParamsError("The response matrix Y must have one dimension")
+ raise ValueError("The response matrix Y must have one dimension")
 if self.X.shape[0] != self.Y.shape[0]:
- raise BARTParamsError(
+ raise ValueError(
 "The design matrix X and the response matrix Y must have the same number of elements"
 )
 if not isinstance(m, int):
- raise BARTParamsError("The number of trees m type must be int")
+ raise ValueError("The number of trees m type must be int")
 if m < 1:
- raise BARTParamsError("The number of trees m must be greater than zero")
+ raise ValueError("The number of trees m must be greater than zero")
 if alpha <= 0 or 1 <= alpha:
- raise BARTParamsError(
+ raise ValueError(
 "The value for the alpha parameter for the tree structure "
 "must be in the interval (0, 1)"
 )

From 7ac976b47722a6e561d3eda1155ee34b1ef67283 Mon Sep 17 00:00:00 2001
From: aloctavodia
Date: Fri, 6 Nov 2020 16:05:30 -0300
Subject: [PATCH 20/24] wip return mean and std variable importance

---
 pymc3/distributions/bart.py | 5 ++---
 pymc3/sampling.py | 7 +++++--
 pymc3/step_methods/pgbart.py | 8 +++++---
 3 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/pymc3/distributions/bart.py b/pymc3/distributions/bart.py
index cbffcda918d..bba94f8d22b 100644
--- a/pymc3/distributions/bart.py
+++ b/pymc3/distributions/bart.py
@@ -144,10 +144,9 @@ def get_new_idx_data_points(self, current_split_node, idx_data_points):
 idx_split_variable = current_split_node.idx_split_variable
 split_value = current_split_node.split_value
- left_idx = np.nonzero(self.X[idx_data_points, idx_split_variable] <= split_value)
+ left_idx = self.X[idx_data_points, idx_split_variable] <= split_value
 left_node_idx_data_points = idx_data_points[left_idx]
- right_idx = np.nonzero(~(self.X[idx_data_points, idx_split_variable] <= split_value))
- right_node_idx_data_points = idx_data_points[right_idx]
+ right_node_idx_data_points = idx_data_points[~left_idx]
 return left_node_idx_data_points, right_node_idx_data_points
diff --git a/pymc3/sampling.py b/pymc3/sampling.py
index 5004d412362..a93a8aea079 100644
--- a/pymc3/sampling.py
+++ b/pymc3/sampling.py
@@ -607,8 +607,11 @@ def sample(
 trace.report._t_sampling = t_sampling
 if "variable_inclusion" in trace.stat_names:
- variable_inclusion = trace.get_sampler_stats("variable_inclusion")[::-draws]
- trace.report.variable_importance = np.mean([vi / vi.sum() for vi in variable_inclusion], 0)
+ variable_inclusion = np.vstack(trace.get_sampler_stats("variable_inclusion"))
+ variable_inclusion = np.split(variable_inclusion, 50)
+ dada = np.vstack([v.sum(0) / v.sum() for v in variable_inclusion])
+ trace.report.variable_importance_m = dada.mean(0)
+ trace.report.variable_importance_s = dada.std(0)
 n_chains = len(trace.chains)
diff --git a/pymc3/step_methods/pgbart.py b/pymc3/step_methods/pgbart.py
index 8a2744ead2d..b53f4e8ecdb 100644
--- a/pymc3/step_methods/pgbart.py
+++ b/pymc3/step_methods/pgbart.py
@@ -60,7 +60,7 @@ def __init__(self, vars=None, num_particles=10, max_stages=5000, chunk="auto", m
 self.idx = 0
 if chunk == "auto":
 self.chunk = max(1, int(self.bart.m * 0.1))
- self.variable_inclusion = np.zeros(self.bart.num_variates)
+ self.variable_inclusion = np.zeros(self.bart.num_variates, dtype="int")
 self.num_particles = num_particles
@@ -77,6 +77,7 @@ def astep(self, _):
 bart = self.bart
 num_observations = bart.num_observations
+ variable_inclusion = self.variable_inclusion
 # For the tuning phase we restrict max_stages to a low number, otherwise it is almost certain
 # we will reach max_stages given that our first set of m trees is not good at all.
@@ -142,7 +141,6 @@ def astep(self, _):
 bart.sum_trees_output = bart.Y - R_j + new_prediction
 if not self.tune:
- variable_inclusion = self.variable_inclusion
 for index in new_tree.used_variates:
 variable_inclusion[index] += 1

From 2dda3b020ab4da3ab259ec1cb8136bef0e1abfc4 Mon Sep 17 00:00:00 2001
From: aloctavodia
Date: Sat, 14 Nov 2020 09:39:12 -0300
Subject: [PATCH 22/24] update release notes, remove vi hdi report

---
 RELEASE-NOTES.md | 3 ++-
 pymc3/sampling.py | 9 ++-------
 pymc3/step_methods/pgbart.py | 5 +++++
 3 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/RELEASE-NOTES.md b/RELEASE-NOTES.md
index 1640bf52514..0eb41630a74 100644
--- a/RELEASE-NOTES.md
+++ b/RELEASE-NOTES.md
@@ -14,8 +14,9 @@
 ### New features
 - `sample_posterior_predictive_w` can now feed on `xarray.Dataset` - e.g. from `InferenceData.posterior`. (see [#4042](https://github.com/pymc-devs/pymc3/pull/4042))
 - Add MLDA, a new stepper for multilevel sampling. MLDA can be used when a hierarchy of approximate posteriors of varying accuracy is available, offering improved sampling efficiency especially in high-dimensional problems and/or where gradients are not available (see [#3926](https://github.com/pymc-devs/pymc3/pull/3926))
-- Change SMC metropolis kernel to independent metropolis kernel [#4115](https://github.com/pymc-devs/pymc3/pull/3926))
+- Change SMC metropolis kernel to independent metropolis kernel [#4115](https://github.com/pymc-devs/pymc3/pull/4115))
 - Add alternative parametrization to NegativeBinomial distribution in terms of n and p (see [#4126](https://github.com/pymc-devs/pymc3/issues/4126))
+- Add Bayesian Additive Regression Trees (BARTs) [#4183](https://github.com/pymc-devs/pymc3/pull/4183))

 ## PyMC3 3.9.3 (11 August 2020)

diff --git a/pymc3/sampling.py b/pymc3/sampling.py
index 1fdff6a3438..6a44f11cb94 100644
--- a/pymc3/sampling.py
+++ b/pymc3/sampling.py
@@ -607,13 +607,8 @@ def sample(
 trace.report._t_sampling = t_sampling
 if "variable_inclusion" in trace.stat_names:
- variable_inclusion = np.vstack(trace.get_sampler_stats("variable_inclusion"))
- variable_inclusion = np.vstack(
- [v.sum(0) / v.sum() for v in np.array_split(variable_inclusion, 50)]
- )
- trace.report.variable_importance = np.empty((variable_inclusion.shape[1], 3))
- trace.report.variable_importance[:, 0] = variable_inclusion.mean(0)
- trace.report.variable_importance[:, 1:3] = arviz.hdi(variable_inclusion, hdi_prob=0.68)
+ variable_inclusion = np.stack(trace.get_sampler_stats("variable_inclusion")).mean(0)
+ trace.report.variable_importance = variable_inclusion / variable_inclusion.sum()
 n_chains = len(trace.chains)
diff --git a/pymc3/step_methods/pgbart.py b/pymc3/step_methods/pgbart.py
index e3ed5503bc9..1d6a503c4c7 100644
--- a/pymc3/step_methods/pgbart.py
+++ b/pymc3/step_methods/pgbart.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import logging
+
 import numpy as np
 from theano import function as theano_function
@@ -21,6 +23,8 @@
 from ..model import modelcontext
 from ..theanof import inputvars, make_shared_replacements, join_nonshared_inputs
+_log = logging.getLogger("pymc3")
+
 class PGBART(ArrayStepShared):
 """
@@ -52,6 +56,7 @@ class PGBART(ArrayStepShared):
 stats_dtypes = [{"variable_inclusion": np.ndarray}]
 def __init__(self, vars=None, num_particles=10, max_stages=5000, chunk="auto", model=None):
+ _log.warning("The BART model is experimental. Use with caution.")
 model = modelcontext(model)
 vars = inputvars(vars)
 self.bart = vars[0].distribution

From bb69a7642a697402f50ee30600d39a0abe2b5f98 Mon Sep 17 00:00:00 2001
From: aloctavodia
Date: Sat, 14 Nov 2020 12:26:50 -0300
Subject: [PATCH 23/24] test variable importance

---
 pymc3/tests/test_sampling.py | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/pymc3/tests/test_sampling.py b/pymc3/tests/test_sampling.py
index 389b3eb4520..cb7cb5ab88d 100644
--- a/pymc3/tests/test_sampling.py
+++ b/pymc3/tests/test_sampling.py
@@ -166,6 +166,20 @@ def test_trace_report(self, step_cls, discard):
 assert isinstance(trace.report.t_sampling, float)
 pass
+ def test_trace_report_bart():
+ X = np.random.normal(0, 1, size=(3, 250)).T
+ Y = np.random.normal(0, 1, size=250)
+ X[:, 0] = np.random.normal(Y, 0.1)
+
+ with pm.Model() as model:
+ mu = pm.BART("mu", X, Y, m=20)
+ sigma = pm.HalfNormal("sigma", 1)
+ y = pm.Normal("y", mu, sigma, observed=Y)
+ trace = pm.sample(500, tune=100, chains=1, random_seed=3415)
+ var_imp = trace.report.variable_importance
+ assert var_imp[0] > var_imp[1:].sum()
+ npt.assert_almost_equal(var_imp.sum(), 1)
+
 def test_return_inferencedata(self):
 with self.model:
 kwargs = dict(draws=100, tune=50, cores=1, chains=2, step=pm.Metropolis())

From 5a7b55284d97b3b72db56eab80139fb814350c10 Mon Sep 17 00:00:00 2001
From: aloctavodia
Date: Sat, 14 Nov 2020 13:29:18 -0300
Subject: [PATCH 24/24] fix test

---
 pymc3/tests/test_sampling.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pymc3/tests/test_sampling.py b/pymc3/tests/test_sampling.py
index cb7cb5ab88d..a415f648ea4 100644
--- a/pymc3/tests/test_sampling.py
+++ b/pymc3/tests/test_sampling.py
@@ -166,7 +166,7 @@ def test_trace_report(self, step_cls, discard):
 assert isinstance(trace.report.t_sampling, float)
 pass
- def test_trace_report_bart():
+ def test_trace_report_bart(self):
 X = np.random.normal(0, 1, size=(3, 250)).T
 Y = np.random.normal(0, 1, size=250)
 X[:, 0] = np.random.normal(Y, 0.1)
@@ -175,7 +175,7 @@ def test_trace_report_bart(self):
 mu = pm.BART("mu", X, Y, m=20)
 sigma = pm.HalfNormal("sigma", 1)
 y = pm.Normal("y", mu, sigma, observed=Y)
- trace = pm.sample(500, tune=100, chains=1, random_seed=3415)
+ trace = pm.sample(500, tune=100, random_seed=3415)
 var_imp = trace.report.variable_importance
 assert var_imp[0] > var_imp[1:].sum()
 npt.assert_almost_equal(var_imp.sum(), 1)