DiscreteDP: Allow beta=1 #244

Merged: 9 commits, Apr 6, 2016
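
In brief, this PR relaxes the DiscreteDP discount factor check to accept beta=1, disables the infinite horizon solution methods in that case, and adds a backward_induction routine for finite horizon problems. A minimal usage sketch of the new API (the R and Q arrays below are hypothetical, not taken from this PR):

import numpy as np
from quantecon.markov import DiscreteDP, backward_induction

# Hypothetical 2-state, 2-action problem: R has shape (n, m), Q shape (n, m, n).
R = np.array([[5.0, 10.0],
              [-1.0, 0.0]])
Q = np.array([[[0.5, 0.5], [0.0, 1.0]],
              [[1.0, 0.0], [0.5, 0.5]]])
beta = 1  # allowed after this change; a warning notes that the
          # infinite horizon solvers are disabled

ddp = DiscreteDP(R, Q, beta)
vs, sigmas = backward_induction(ddp, T=10)  # vs: (11, 2), sigmas: (10, 2)
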
2 changes: 1 addition & 1 deletion quantecon/markov/__init__.py
@@ -8,4 +8,4 @@
from .random import random_markov_chain, random_stochastic_matrix, \
random_discrete_dp
from .approximation import tauchen
from .ddp import DiscreteDP
from .ddp import DiscreteDP, backward_induction
114 changes: 99 additions & 15 deletions quantecon/markov/ddp.py
@@ -40,8 +40,8 @@

The *optimal value function* :math:`v^*` is the function such that
:math:`v^*(s) = \max_{\sigma} v_{\sigma}(s)` for all :math:`s \in S`. A
policy function :math:`\sigma^*` is *optimal* if :math:`v_{\sigma}(s) =
v(s)` for all :math:`s \in S`.
policy function :math:`\sigma^*` is *optimal* if :math:`v_{\sigma^*}(s)
= v^*(s)` for all :math:`s \in S`.

The *Bellman equation* is written as
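
For reference, consistent with the finite horizon recursion used by backward_induction later in this diff:

.. math::

v^*(s) = \max_{a \in A(s)} r(s, a) +
\beta \sum_{s' \in S} q(s'|s, a) v^*(s')
\quad (s \in S)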

@@ -109,11 +109,12 @@

"""
from __future__ import division
import warnings
import numpy as np
import scipy.sparse as sp
from numba import jit

from .core import MarkovChain
from ..util import numba_installed, jit


class DiscreteDP(object):
@@ -165,7 +166,7 @@ class DiscreteDP(object):
Transition probability array.

beta : scalar(float)
Discount factor. Must be 0 <= beta < 1.
Discount factor. Must be in [0, 1].

s_indices : array_like(int, ndim=1), optional(default=None)
Array containing the indices of the states.
@@ -190,6 +191,13 @@ class DiscreteDP(object):
max_iter : scalar(int), default=250
Default value for the maximum number of iterations.

Notes
-----
DiscreteDP accepts beta=1 for convenience. In this case, infinite
horizon solution methods are disabled, and the instance is then seen
as merely an object carrying the Bellman operator, which may be used
for backward induction for finite horizon problems.

Examples
--------
Consider the following example, taken from Puterman (2005), Section
@@ -401,8 +409,12 @@ def s_wise_max(vals, out=None, out_argmax=None):
# Check that for every state, at least one action is feasible
self._check_action_feasibility()

if not (0 <= beta < 1):
raise ValueError('beta must be in [0, 1)')
if not (0 <= beta <= 1):
raise ValueError('beta must be in [0, 1]')
if beta == 1:
msg = 'infinite horizon solution methods are disabled with beta=1'
warnings.warn(msg)
Member: This warning is currently being ignored. I suspect this is because we have turned off warnings somewhere else, but thought I'd point it out.

Member Author: I believe this will be resolved by #231.

self._error_msg_no_discounting = 'method invalid for beta=1'
self.beta = beta

self.epsilon = 1e-3
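
As noted in the review thread above, this warning can be easy to miss if warnings are filtered elsewhere. A minimal sketch for surfacing it explicitly, using only the standard library (the R and Q arrays here are hypothetical):

import warnings
import numpy as np
from quantecon.markov import DiscreteDP

R = np.zeros((2, 2))          # hypothetical rewards, shape (n, m)
Q = np.full((2, 2, 2), 0.5)   # hypothetical transitions, each row sums to 1

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    DiscreteDP(R, Q, 1)
    assert any("beta=1" in str(w.message) for w in caught)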
@@ -561,6 +573,9 @@ def evaluate_policy(self, sigma):
Value vector of `sigma`, of length n.

"""
if self.beta == 1:
raise NotImplementedError(self._error_msg_no_discounting)

# Solve (I - beta * Q_sigma) v = R_sigma for v
R_sigma, Q_sigma = self.RQ_sigma(sigma)
b = R_sigma
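
This linear system is the reason policy evaluation must be disabled at beta=1: when Q_sigma is a stochastic matrix, I - Q_sigma has zero row sums and is therefore singular, so the system has no unique solution. A quick numerical illustration with a hypothetical 2x2 Q_sigma:

import numpy as np

Q_sigma = np.array([[0.9, 0.1],
                    [0.2, 0.8]])
print(np.linalg.det(np.eye(2) - 0.95 * Q_sigma))  # nonzero: unique solution
print(np.linalg.det(np.eye(2) - 1.00 * Q_sigma))  # 0.0: singular at beta=1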
@@ -678,6 +693,9 @@ def value_iteration(self, v_init=None, epsilon=None, max_iter=None):
`solve` method.

"""
if self.beta == 1:
raise NotImplementedError(self._error_msg_no_discounting)

if max_iter is None:
max_iter = self.max_iter
if epsilon is None:
@@ -718,6 +736,9 @@ def policy_iteration(self, v_init=None, max_iter=None):
`solve` method.

"""
if self.beta == 1:
raise NotImplementedError(self._error_msg_no_discounting)

if max_iter is None:
max_iter = self.max_iter

@@ -755,6 +776,9 @@ def modified_policy_iteration(self, v_init=None, epsilon=None,
the `solve` method.

"""
if self.beta == 1:
raise NotImplementedError(self._error_msg_no_discounting)

if max_iter is None:
max_iter = self.max_iter
if epsilon is None:
@@ -873,6 +897,73 @@ def __dir__(self):
return self.keys()


def backward_induction(ddp, T, v_term=None):
r"""
Solve by backward induction a :math:`T`-period finite horizon
discrete dynamic program with stationary reward and transition
probability functions :math:`r` and :math:`q` and discount factor
:math:`\beta \in [0, 1]`.

The optimal value functions :math:`v^*_0, \ldots, v^*_T` and policy
functions :math:`\sigma^*_0, \ldots, \sigma^*_{T-1}` are obtained by
:math:`v^*_T = v_T`, and

.. math::

v^*_{t-1}(s) = \max_{a \in A(s)} r(s, a) +
\beta \sum_{s' \in S} q(s'|s, a) v^*_t(s')
\quad (s \in S)

and

.. math::

\sigma^*_{t-1}(s) \in \operatorname*{arg\,max}_{a \in A(s)}
r(s, a) + \beta \sum_{s' \in S} q(s'|s, a) v^*_t(s')
\quad (s \in S)

for :math:`t = T, \ldots, 1`, where the terminal value function
:math:`v_T` is exogenously given.

Parameters
----------
ddp : DiscreteDP
DiscreteDP instance storing reward array `R`, transition
probability array `Q`, and discount factor `beta`.

T : scalar(int)
Number of decision periods.

v_term : array_like(float, ndim=1), optional(default=None)
Terminal value function, of length equal to n (the number of
states). If None, it defaults to the vector of zeros.

Returns
-------
vs : ndarray(float, ndim=2)
Array of shape (T+1, n) where `vs[t]` contains the optimal
value function at period `t = 0, ..., T`.

sigmas : ndarray(int, ndim=2)
Array of shape (T, n) where `sigmas[t]` contains the optimal
policy function at period `t = 0, ..., T-1`.

"""
n = ddp.num_states
vs = np.empty((T+1, n))
sigmas = np.empty((T, n), dtype=int)

if v_term is None:
v_term = np.zeros(n)
vs[T, :] = v_term

for t in range(T, 0, -1):
ddp.bellman_operator(vs[t, :], Tv=vs[t-1, :], sigma=sigmas[t-1, :])

return vs, sigmas
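
For intuition, one backward step of this loop, for the dense representation where R has shape (n, m) and Q has shape (n, m, n), amounts to the following sketch (the real code delegates to ddp.bellman_operator, which also handles the sa-pair and sparse formulations):

import numpy as np

def one_backward_step(R, Q, beta, v_next):
    # Value of every (state, action) pair given next-period values v_next.
    vals = R + beta * (Q @ v_next)   # shape (n, m)
    return vals.max(axis=1), vals.argmax(axis=1)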


@jit(nopython=True)
def _s_wise_max_argmax(a_indices, a_indptr, vals, out_max, out_argmax):
n = len(out_max)
for i in range(n):
@@ -884,10 +975,8 @@ def _s_wise_max_argmax(a_indices, a_indptr, vals, out_max, out_argmax):
out_max[i] = vals[m]
out_argmax[i] = a_indices[m]

if numba_installed:
_s_wise_max_argmax = jit(nopython=True)(_s_wise_max_argmax)


@jit(nopython=True)
def _s_wise_max(a_indices, a_indptr, vals, out_max):
n = len(out_max)
for i in range(n):
@@ -898,20 +987,15 @@ def _s_wise_max(a_indices, a_indptr, vals, out_max):
m = j
out_max[i] = vals[m]

if numba_installed:
_s_wise_max = jit(nopython=True)(_s_wise_max)


@jit(nopython=True)
def _find_indices(a_indices, a_indptr, sigma, out):
n = len(sigma)
for i in range(n):
for j in range(a_indptr[i], a_indptr[i+1]):
if sigma[i] == a_indices[j]:
out[i] = j

if numba_installed:
_find_indices = jit(nopython=True)(_find_indices)


@jit(nopython=True)
def _has_sorted_sa_indices(s_indices, a_indices):
63 changes: 62 additions & 1 deletion quantecon/markov/tests/test_ddp.py
@@ -12,7 +12,7 @@
from numpy.testing import assert_array_equal, assert_allclose, assert_raises
from nose.tools import eq_, ok_

from quantecon.markov import DiscreteDP
from quantecon.markov import DiscreteDP, backward_induction


class TestDiscreteDP:
@@ -177,6 +177,43 @@ def test_ddp_sorting():
assert_array_equal(ddp_Q, Q)


class TestFiniteHorizon:
def setUp(self):
# From Puterman 2005, Section 3.2, Section 4.6.1
# "single-product stochastic inventory control"
s_indices = [0, 0, 0, 0, 1, 1, 1, 2, 2, 3]
a_indices = [0, 1, 2, 3, 0, 1, 2, 0, 1, 0]
R = [ 0., -1., -2., -5., 5., 0., -3., 6., -1., 5.]
Q = [[ 1. , 0. , 0. , 0. ],
[ 0.75, 0.25, 0. , 0. ],
[ 0.25, 0.5 , 0.25, 0. ],
[ 0. , 0.25, 0.5 , 0.25],
[ 0.75, 0.25, 0. , 0. ],
[ 0.25, 0.5 , 0.25, 0. ],
[ 0. , 0.25, 0.5 , 0.25],
[ 0.25, 0.5 , 0.25, 0. ],
[ 0. , 0.25, 0.5 , 0.25],
[ 0. , 0.25, 0.5 , 0.25]]
beta = 1
self.ddp = DiscreteDP(R, Q, beta, s_indices, a_indices)

def test_backward_induction(self):
T = 3
# v_T = np.zeros(self.ddp.num_states)
vs_expected = [[67/16, 129/16, 194/16, 227/16],
[2, 25/4, 10, 21/2],
[0, 5, 6, 5],
[0, 0, 0, 0]]
sigmas_expected = [[3, 0, 0, 0],
[2, 0, 0, 0],
[0, 0, 0, 0]]

vs, sigmas = backward_induction(self.ddp, T)

assert_allclose(vs, vs_expected)
assert_array_equal(sigmas, sigmas_expected)
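
As a sanity check on the expected values above: with v_T = 0, the period T-1 values reduce to the best one-period reward in each state, which can be confirmed directly from the sa-pair data (a sketch reusing the arrays from setUp):

import numpy as np

s_indices = np.array([0, 0, 0, 0, 1, 1, 1, 2, 2, 3])
R = np.array([0., -1., -2., -5., 5., 0., -3., 6., -1., 5.])

v_T_minus_1 = np.array([R[s_indices == s].max() for s in range(4)])
print(v_T_minus_1)   # [0. 5. 6. 5.] -- matches vs_expected[2]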


def test_ddp_negative_inf_error():
n, m = 3, 2
R = np.array([[0, 1], [0, -np.inf], [-np.inf, -np.inf]])
@@ -213,6 +250,30 @@ def test_ddp_no_feasibile_action_error():
assert_raises(ValueError, DiscreteDP, R, Q, beta, s_indices, a_indices)


def test_ddp_beta_1_not_implemented_error():
n, m = 3, 2
R = np.array([[0, 1], [1, 0], [0, 1]])
Q = np.empty((n, m, n))
Q[:] = 1/n
beta = 1

ddp0 = DiscreteDP(R, Q, beta)
s_indices, a_indices = np.where(R > -np.inf)
R_sa = R[s_indices, a_indices]
Q_sa = Q[s_indices, a_indices]
ddp1 = DiscreteDP(R_sa, Q_sa, beta, s_indices, a_indices)
Q_sa_sp = sparse.csr_matrix(Q_sa)
ddp2 = DiscreteDP(R_sa, Q_sa_sp, beta, s_indices, a_indices)

solution_methods = \
['value_iteration', 'policy_iteration', 'modified_policy_iteration']

for ddp in [ddp0, ddp1, ddp2]:
assert_raises(NotImplementedError, ddp.evaluate_policy, np.zeros(n))
for method in solution_methods:
assert_raises(NotImplementedError, getattr(ddp, method))


if __name__ == '__main__':
import sys
import nose