diff --git a/tests/tensor/random/test_opt.py b/tests/tensor/random/test_opt.py
index d51e39e919..16e45028c9 100644
--- a/tests/tensor/random/test_opt.py
+++ b/tests/tensor/random/test_opt.py
@@ -10,8 +10,20 @@
 from theano.gof.opt import EquilibriumOptimizer
 from theano.gof.optdb import Query
 from theano.tensor.elemwise import DimShuffle
-from theano.tensor.random.basic import dirichlet, multivariate_normal, normal, poisson
-from theano.tensor.random.opt import lift_rv_shapes, local_dimshuffle_rv_lift
+from theano.tensor.random.basic import (
+    dirichlet,
+    multivariate_normal,
+    normal,
+    poisson,
+    uniform,
+)
+from theano.tensor.random.op import RandomVariable
+from theano.tensor.random.opt import (
+    lift_rv_shapes,
+    local_dimshuffle_rv_lift,
+    local_subtensor_rv_lift,
+)
+from theano.tensor.subtensor import (
+    AdvancedSubtensor,
+    AdvancedSubtensor1,
+    Subtensor,
+    as_index_constant,
+)
 
 
 inplace_mode = Mode("py", Query(include=["random_make_inplace"], exclude=[]))
@@ -284,6 +296,187 @@ def test_DimShuffle_lift(ds_order, lifted, dist_op, dist_params, size, rtol):
     np.testing.assert_allclose(res_base, res_opt, rtol=rtol)
 
 
+@pytest.mark.parametrize(
+    "indices, lifted, dist_op, dist_params, size",
+    [
+        (
+            # `size`-less advanced boolean indexing
+            (np.r_[True, False, False, True],),
+            True,
+            uniform,
+            (
+                (0.1 - 1e-5) * np.arange(4).astype(dtype=config.floatX),
+                0.1 * np.arange(4).astype(dtype=config.floatX),
+            ),
+            (),
+        ),
+        (
+            # `size`-only advanced boolean indexing
+            (np.r_[True, False, False, True],),
+            True,
+            uniform,
+            (
+                np.array(0.9 - 1e-5, dtype=config.floatX),
+                np.array(0.9, dtype=config.floatX),
+            ),
+            (4,),
+        ),
+        (
+            # `size`-only slice
+            (slice(4, -6, -1),),
+            True,
+            uniform,
+            (
+                np.array(0.9 - 1e-5, dtype=config.floatX),
+                np.array(0.9, dtype=config.floatX),
+            ),
+            (5, 2),
+        ),
+        (
+            (slice(1, None), [0, 2]),
+            True,
+            normal,
+            (
+                np.array([1, 10, 100], dtype=config.floatX),
+                np.array([1e-5, 2e-5, 3e-5], dtype=config.floatX),
+            ),
+            (4, 3),
+        ),
+        (
+            (np.array([1]), 0),
+            True,
+            normal,
+            (
+                np.array([[-1, 20], [300, -4000]], dtype=config.floatX),
+                np.array([[1e-6, 2e-6]], dtype=config.floatX),
+            ),
+            (3, 2, 2),
+        ),
+        # A multi-dimensional case
+        (
+            (np.array([1]), 0),
+            False,
+            multivariate_normal,
+            (
+                np.array([[-1, 20], [300, -4000]], dtype=config.floatX),
+                np.eye(2).astype(config.floatX) * 1e-6,
+            ),
+            (),
+        ),
+        # Only one distribution parameter
+        (
+            (0,),
+            True,
+            poisson,
+            (np.array([[1, 2], [3, 4]], dtype=config.floatX),),
+            (3, 2, 2),
+        ),
+    ],
+)
+@change_flags(compute_test_value_opt="raise", compute_test_value="raise")
+def test_Subtensor_lift(indices, lifted, dist_op, dist_params, size):
+    rng = shared(np.random.RandomState(1233532), borrow=False)
+
+    dist_params_tt = []
+    for p in dist_params:
+        p_tt = tt.as_tensor(p).type()
+        p_tt.tag.test_value = p
+        dist_params_tt.append(p_tt)
+
+    size_tt = []
+    for s in size:
+        s_tt = tt.iscalar()
+        s_tt.tag.test_value = s
+        size_tt.append(s_tt)
+
+    indices_tt = ()
+    for i in indices:
+        i_tt = as_index_constant(i)
+        if not isinstance(i_tt, slice):
+            i_tt.tag.test_value = i
+        indices_tt += (i_tt,)
+
+    dist_st = dist_op(*dist_params_tt, size=size_tt, rng=rng)[indices_tt]
+
+    f_inputs = [
+        p
+        for p in dist_params_tt + size_tt + list(indices_tt)
+        if not isinstance(p, (slice, Constant))
+    ]
+
+    mode = Mode(
+        "py", EquilibriumOptimizer([local_subtensor_rv_lift], max_use_ratio=100)
+    )
+
+    f_opt = function(
+        f_inputs,
+        dist_st,
+        mode=mode,
+    )
+
+    (new_out,) = f_opt.maker.fgraph.outputs
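+
+    # If the lift succeeded, the graph's output should now be the
+    # `RandomVariable` itself, with any remaining `*Subtensor`s applied to its
+    # parameters; otherwise, a `*Subtensor` should still be the output node.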
+    if lifted:
+        assert isinstance(new_out.owner.op, RandomVariable)
+        assert all(
+            isinstance(i.owner.op, (AdvancedSubtensor, AdvancedSubtensor1, Subtensor))
+            for i in new_out.owner.inputs[3:]
+            if i.owner
+        )
+    else:
+        assert isinstance(
+            new_out.owner.op, (AdvancedSubtensor, AdvancedSubtensor1, Subtensor)
+        )
+        return
+
+    f_base = function(
+        f_inputs,
+        dist_st,
+        mode=no_mode,
+    )
+
+    arg_values = [p.get_test_value() for p in f_inputs]
+    res_base = f_base(*arg_values)
+    res_opt = f_opt(*arg_values)
+
+    np.testing.assert_allclose(res_base, res_opt, rtol=1e-3)
+
+
+def test_Subtensor_lift_restrictions():
+    rng = shared(np.random.RandomState(1233532), borrow=False)
+
+    x = normal(tt.arange(2), tt.ones(2), rng=rng)
+    y = x[1]
+    # The non-`Subtensor` client depends on the RNG state, so we can't perform
+    # the lift
+    z = x - y
+
+    fg = FunctionGraph([rng], [z], clone=False)
+    _ = EquilibriumOptimizer([local_subtensor_rv_lift], max_use_ratio=100).apply(fg)
+
+    subtensor_node = fg.outputs[0].owner.inputs[1].owner.inputs[0].owner
+    assert subtensor_node == y.owner
+    assert isinstance(subtensor_node.op, Subtensor)
+    assert subtensor_node.inputs[0].owner.op == normal
+
+    # The non-`Subtensor` client doesn't depend on the RNG state, so we can
+    # perform the lift
+    z = tt.ones(x.shape) - x[1]
+
+    fg = FunctionGraph([rng], [z], clone=False)
+    EquilibriumOptimizer([local_subtensor_rv_lift], max_use_ratio=100).apply(fg)
+
+    rv_node = fg.outputs[0].owner.inputs[1].owner.inputs[0].owner
+    assert rv_node.op == normal
+    assert isinstance(rv_node.inputs[-1].owner.op, Subtensor)
+    assert isinstance(rv_node.inputs[-2].owner.op, Subtensor)
+
+
 def test_Dimshuffle_lift_restrictions():
     rng = shared(np.random.RandomState(1233532), borrow=False)
 
diff --git a/theano/tensor/random/opt.py b/theano/tensor/random/opt.py
index 26d2b763de..5bdcbfb90b 100644
--- a/theano/tensor/random/opt.py
+++ b/theano/tensor/random/opt.py
@@ -9,6 +9,14 @@
 from theano.tensor.opt import in2out
 from theano.tensor.random.op import RandomVariable
 from theano.tensor.random.utils import broadcast_params
+from theano.tensor.subtensor import (
+    AdvancedSubtensor,
+    AdvancedSubtensor1,
+    Subtensor,
+    as_index_variable,
+    get_idx_list,
+    indexed_result_shape,
+)
 
 
 @local_optimizer([RandomVariable])
@@ -197,3 +205,152 @@ def local_dimshuffle_rv_lift(fgraph, node):
         return [new_node.outputs[1]]
 
     return False
+
+
+@local_optimizer([Subtensor, AdvancedSubtensor1, AdvancedSubtensor])
+def local_subtensor_rv_lift(fgraph, node):
+    """Lift ``*Subtensor`` `Op`s up to a `RandomVariable`'s parameters.
+
+    In a fashion similar to `local_dimshuffle_rv_lift`, the indexed dimensions
+    need to be separated into distinct replication-space and (independent)
+    parameter-space ``*Subtensor``s.
+
+    The replication-space ``*Subtensor`` can be used to determine a
+    sub/super-set of the replication-space and, thus, a "smaller"/"larger"
+    ``size`` tuple. The parameter-space ``*Subtensor`` is simply lifted and
+    applied to the `RandomVariable`'s distribution parameters.
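+
+    For example, with ``mu.shape == (3,)`` and scalar ``std``, a rewrite like
+    the one sketched below could take place (``std_b`` stands for ``std``
+    broadcast to ``mu``'s shape; the names are illustrative only)::
+
+        # before: the indices are applied to the RV's output
+        normal(mu, std, size=(2, 3))[idx1, idx2]
+        # after: ``idx2`` falls in parameter space and is lifted, while
+        # ``idx1`` only reshapes the replication space via ``size``
+        normal(mu[idx2], std_b[idx2], size=indexed_result_shape((2, 3), (idx1, idx2)))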
+
+    Consider the following example graph:
+    ``normal(mu, std, size=(d1, d2, d3))[idx1, idx2, idx3]``. The
+    ``*Subtensor`` `Op` requests indices ``idx1``, ``idx2``, and ``idx3``,
+    which correspond to all three ``size`` dimensions. Now, depending on the
+    broadcasted dimensions of ``mu`` and ``std``, this ``*Subtensor`` `Op`
+    could be reducing the ``size`` parameter and/or subsetting the independent
+    ``mu`` and ``std`` parameters. Only once the dimensions are properly
+    separated into the two replication/parameter subspaces can we determine how
+    the ``*Subtensor`` indices are distributed. For instance,
+    ``normal(mu, std, size=(d1, d2, d3))[idx1, idx2, idx3]`` could become
+    ``normal(mu, std, size=np.shape(idx1) + np.shape(idx2) + np.shape(idx3))``
+    if ``mu.shape == std.shape == ()``, since all three indices then fall in
+    the replication space.
+
+    ``normal`` is a rather simple case, because it's univariate. Multivariate
+    cases require a mapping between the parameter space and the image of the
+    random variable. This may not always be possible, but for many common
+    distributions it is. For example, the dimensions of the multivariate
+    normal's image can be mapped directly to each dimension of its parameters.
+    We use these mappings to change a graph like ``multivariate_normal(mu, Sigma)[idx1]``
+    into ``multivariate_normal(mu[idx1], Sigma[idx1, idx1])``. Notice how the
+    same index is applied to both dimensions of ``Sigma``.
+
+    Also, there's the important matter of "advanced" indexing, which may not
+    only subset an array, but also broadcast it to a larger size.
+
+    """
+
+    st_op = node.op
+
+    if not isinstance(st_op, (AdvancedSubtensor, AdvancedSubtensor1, Subtensor)):
+        return False
+
+    base_rv = node.inputs[0]
+
+    rv_node = base_rv.owner
+    if not (rv_node and isinstance(rv_node.op, RandomVariable)):
+        return False
+
+    # If no one else is using the underlying `RandomVariable`, then we can
+    # do this; otherwise, the graph would be internally inconsistent.
+    if not all(
+        (n == node or isinstance(n.op, Shape)) for n, i in fgraph.clients[base_rv]
+    ):
+        return False
+
+    rv_op = rv_node.op
+    rng, size, dtype, *dist_params = rv_node.inputs
+
+    # TODO: Remove this once the multi-dimensional changes described below are
+    # in place.
+    if rv_op.ndim_supp > 0:
+        return False
+
+    idx_list = getattr(st_op, "idx_list", None)
+    if idx_list:
+        cdata = get_idx_list(node.inputs, idx_list)
+    else:
+        cdata = node.inputs[1:]
+
+    st_indices, st_is_bool = zip(
+        *tuple(
+            (as_index_variable(i), getattr(i, "dtype", None) == "bool") for i in cdata
+        )
+    )
+
+    # We need to separate dimensions into replications and independents
+    if len(dist_params) == 1:
+        num_ind_dims = dist_params[0].ndim
+    else:
+        # When there is more than one distribution parameter, assume that all
+        # of them will broadcast to the maximum number of dimensions
+        num_ind_dims = max(d.ndim for d in dist_params)
+
+    reps_ind_split_idx = base_rv.ndim - (num_ind_dims + rv_op.ndim_supp)
+
+    if len(st_indices) > reps_ind_split_idx:
+        # These are the indices that need to be applied to the parameters
+        ind_indices = tuple(st_indices[reps_ind_split_idx:])
+
+        # We need to broadcast the parameters before applying the `*Subtensor`
+        # with these indices, because the indices could be referencing broadcast
+        # dimensions that don't exist (yet)
+        bcast_dist_params = broadcast_params(dist_params, rv_op.ndims_params)
+
+        # TODO: For multidimensional distributions, we need a map that tells us
+        # which dimensions of the parameters need to be indexed.
+        #
+        # For example, `multivariate_normal` would have the following:
+        # `RandomVariable.param_to_image_dims = ((0,), (0, 1))`
+        #
+        # I.e. the first parameter's (i.e. the mean's) first dimension maps
+        # directly to the dimension of the RV's image, and the second
+        # parameter's (i.e. the covariance's) first and second dimensions both
+        # map to that same image dimension.
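+        #
+        # A rough sketch of how such a map could drive the lift (hypothetical;
+        # neither `param_to_image_dims` nor this loop exists yet):
+        #
+        #     args_lifted = tuple(
+        #         p[tuple(ind_indices[d] for d in dim_map)]
+        #         for p, dim_map in zip(bcast_dist_params, rv_op.param_to_image_dims)
+        #     )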
+
+        args_lifted = tuple(p[ind_indices] for p in bcast_dist_params)
+    else:
+        # In this case, no indexing is applied to the parameters; only the
+        # `size` parameter is affected.
+        args_lifted = dist_params
+
+    # TODO: Could use `ShapeFeature` info. We would need to be sure that
+    # `node` isn't in the results, though.
+    # if hasattr(fgraph, "shape_feature"):
+    #     output_shape = fgraph.shape_feature.shape_of(node.outputs[0])
+    # else:
+    output_shape = indexed_result_shape(base_rv.shape, st_indices)
+
+    size_lifted = (
+        output_shape if rv_op.ndim_supp == 0 else output_shape[: -rv_op.ndim_supp]
+    )
+
+    # Boolean indices can actually change the `size` values (not just *which*
+    # dimensions of `size` are used).
+    if any(st_is_bool):
+        size_lifted = tuple(
+            tt.sum(idx) if is_bool else s
+            for s, is_bool, idx in zip(
+                size_lifted, st_is_bool, st_indices[: (reps_ind_split_idx + 1)]
+            )
+        )
+
+    new_node = rv_op.make_node(rng, size_lifted, dtype, *args_lifted)
+    _, new_rv = new_node.outputs
+
+    # Calling `Op.make_node` directly circumvents test value computations, so
+    # we need to compute the test values manually
+    if config.compute_test_value != "off":
+        compute_test_value(new_node)
+
+    return [new_rv]
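+
+
+# A sketch of how this rewrite can be applied by hand (the tests above use
+# the same pattern); nothing here registers it in an optimization database:
+#
+#     from theano.gof.opt import EquilibriumOptimizer
+#
+#     opt = EquilibriumOptimizer([local_subtensor_rv_lift], max_use_ratio=100)
+#     opt.apply(fgraph)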