DiscreteDP: Allow beta=1 #244

Merged: 9 commits, Apr 6, 2016
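
In brief, this PR relaxes the DiscreteDP discount factor check to accept beta=1, disables the infinite horizon solution methods in that case, and adds a backward_induction routine for finite horizon problems. A minimal usage sketch of the new API (the R and Q arrays below are hypothetical, not taken from this PR):

import numpy as np
from quantecon.markov import DiscreteDP, backward_induction

# Hypothetical 2-state, 2-action problem: R has shape (n, m), Q shape (n, m, n).
R = np.array([[5.0, 10.0],
              [-1.0, 0.0]])
Q = np.array([[[0.5, 0.5], [0.0, 1.0]],
              [[1.0, 0.0], [0.5, 0.5]]])
beta = 1  # allowed after this change; a warning notes that the
          # infinite horizon solvers are disabled

ddp = DiscreteDP(R, Q, beta)
vs, sigmas = backward_induction(ddp, T=10)  # vs: (11, 2), sigmas: (10, 2)
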
2 changes: 1 addition & 1 deletion quantecon/markov/__init__.py
@@ -8,4 +8,4 @@
from .random import random_markov_chain, random_stochastic_matrix, \
random_discrete_dp
from .approximation import tauchen
from .ddp import DiscreteDP
from .ddp import DiscreteDP, backward_induction
114 changes: 99 additions & 15 deletions quantecon/markov/ddp.py
@@ -40,8 +40,8 @@

The *optimal value function* :math:`v^*` is the function such that
:math:`v^*(s) = \max_{\sigma} v_{\sigma}(s)` for all :math:`s \in S`. A
policy function :math:`\sigma^*` is *optimal* if :math:`v_{\sigma}(s) =
v(s)` for all :math:`s \in S`.
policy function :math:`\sigma^*` is *optimal* if :math:`v_{\sigma^*}(s)
= v^*(s)` for all :math:`s \in S`.

The *Bellman equation* is written as
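
For reference, consistent with the finite horizon recursion used by backward_induction later in this diff:

.. math::

v^*(s) = \max_{a \in A(s)} r(s, a) +
\beta \sum_{s' \in S} q(s'|s, a) v^*(s')
\quad (s \in S)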

@@ -109,11 +109,12 @@

"""
from __future__ import division
import warnings
import numpy as np
import scipy.sparse as sp
from numba import jit

from .core import MarkovChain
from ..util import numba_installed, jit


class DiscreteDP(object):
@@ -165,7 +166,7 @@ class DiscreteDP(object):
Transition probability array.

beta : scalar(float)
Discount factor. Must be 0 <= beta < 1.
Discount factor. Must be in [0, 1].

s_indices : array_like(int, ndim=1), optional(default=None)
Array containing the indices of the states.
@@ -190,6 +191,13 @@ class DiscreteDP(object):
max_iter : scalar(int), default=250
Default value for the maximum number of iterations.

Notes
-----
DiscreteDP accepts beta=1 for convenience. In this case, infinite
horizon solution methods are disabled, and the instance is then seen
as merely an object carrying the Bellman operator, which may be used
for backward induction for finite horizon problems.

Examples
--------
Consider the following example, taken from Puterman (2005), Section
@@ -401,8 +409,12 @@ def s_wise_max(vals, out=None, out_argmax=None):
# Check that for every state, at least one action is feasible
self._check_action_feasibility()

if not (0 <= beta < 1):
raise ValueError('beta must be in [0, 1)')
if not (0 <= beta <= 1):
raise ValueError('beta must be in [0, 1]')
if beta == 1:
msg = 'infinite horizon solution methods are disabled with beta=1'
warnings.warn(msg)
Member: This warning is currently being ignored. I suspect this is because we have turned off warnings somewhere else, but thought I'd point it out.

Member Author: I believe this will be resolved by #231.

self._error_msg_no_discounting = 'method invalid for beta=1'
self.beta = beta

self.epsilon = 1e-3
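
As noted in the review thread above, this warning can be easy to miss if warnings are filtered elsewhere. A minimal sketch for surfacing it explicitly, using only the standard library (the R and Q arrays here are hypothetical):

import warnings
import numpy as np
from quantecon.markov import DiscreteDP

R = np.zeros((2, 2))          # hypothetical rewards, shape (n, m)
Q = np.full((2, 2, 2), 0.5)   # hypothetical transitions, each row sums to 1

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    DiscreteDP(R, Q, 1)
    assert any("beta=1" in str(w.message) for w in caught)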
@@ -561,6 +573,9 @@ def evaluate_policy(self, sigma):
Value vector of `sigma`, of length n.

"""
if self.beta == 1:
raise NotImplementedError(self._error_msg_no_discounting)

# Solve (I - beta * Q_sigma) v = R_sigma for v
R_sigma, Q_sigma = self.RQ_sigma(sigma)
b = R_sigma
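
This linear system is the reason policy evaluation must be disabled at beta=1: when Q_sigma is a stochastic matrix, I - Q_sigma has zero row sums and is therefore singular, so the system has no unique solution. A quick numerical illustration with a hypothetical 2x2 Q_sigma:

import numpy as np

Q_sigma = np.array([[0.9, 0.1],
                    [0.2, 0.8]])
print(np.linalg.det(np.eye(2) - 0.95 * Q_sigma))  # nonzero: unique solution
print(np.linalg.det(np.eye(2) - 1.00 * Q_sigma))  # 0.0: singular at beta=1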
@@ -678,6 +693,9 @@ def value_iteration(self, v_init=None, epsilon=None, max_iter=None):
`solve` method.

"""
if self.beta == 1:
raise NotImplementedError(self._error_msg_no_discounting)

if max_iter is None:
max_iter = self.max_iter
if epsilon is None:
@@ -718,6 +736,9 @@ def policy_iteration(self, v_init=None, max_iter=None):
`solve` method.

"""
if self.beta == 1:
raise NotImplementedError(self._error_msg_no_discounting)

if max_iter is None:
max_iter = self.max_iter

@@ -755,6 +776,9 @@ def modified_policy_iteration(self, v_init=None, epsilon=None,
the `solve` method.

"""
if self.beta == 1:
raise NotImplementedError(self._error_msg_no_discounting)

if max_iter is None:
max_iter = self.max_iter
if epsilon is None:
@@ -873,6 +897,73 @@ def __dir__(self):
return self.keys()


def backward_induction(ddp, T, v_term=None):
r"""
Solve by backward induction a :math:`T`-period finite horizon
discrete dynamic program with stationary reward and transition
probability functions :math:`r` and :math:`q` and discount factor
:math:`\beta \in [0, 1]`.

The optimal value functions :math:`v^*_0, \ldots, v^*_T` and policy
functions :math:`\sigma^*_0, \ldots, \sigma^*_{T-1}` are obtained by
:math:`v^*_T = v_T`, and

.. math::

v^*_{t-1}(s) = \max_{a \in A(s)} r(s, a) +
\beta \sum_{s' \in S} q(s'|s, a) v^*_t(s')
\quad (s \in S)

and

.. math::

\sigma^*_{t-1}(s) \in \operatorname*{arg\,max}_{a \in A(s)}
r(s, a) + \beta \sum_{s' \in S} q(s'|s, a) v^*_t(s')
\quad (s \in S)

for :math:`t = T, \ldots, 1`, where the terminal value function
:math:`v_T` is exogenously given.

Parameters
----------
ddp : DiscreteDP
DiscreteDP instance storing reward array `R`, transition
probability array `Q`, and discount factor `beta`.

T : scalar(int)
Number of decision periods.

v_term : array_like(float, ndim=1), optional(default=None)
Terminal value function, of length equal to n (the number of
states). If None, it defaults to the vector of zeros.

Returns
-------
vs : ndarray(float, ndim=2)
Array of shape (T+1, n) where `vs[t]` contains the optimal
value function at period `t = 0, ..., T`.

sigmas : ndarray(int, ndim=2)
Array of shape (T, n) where `sigmas[t]` contains the optimal
policy function at period `t = 0, ..., T-1`.

"""
n = ddp.num_states
vs = np.empty((T+1, n))
sigmas = np.empty((T, n), dtype=int)

if v_term is None:
v_term = np.zeros(n)
vs[T, :] = v_term

for t in range(T, 0, -1):
ddp.bellman_operator(vs[t, :], Tv=vs[t-1, :], sigma=sigmas[t-1, :])

return vs, sigmas
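
For intuition, one backward step of this loop, for the dense representation where R has shape (n, m) and Q has shape (n, m, n), amounts to the following sketch (the real code delegates to ddp.bellman_operator, which also handles the sa-pair and sparse formulations):

import numpy as np

def one_backward_step(R, Q, beta, v_next):
    # Value of every (state, action) pair given next-period values v_next.
    vals = R + beta * (Q @ v_next)   # shape (n, m)
    return vals.max(axis=1), vals.argmax(axis=1)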


@jit(nopython=True)
def _s_wise_max_argmax(a_indices, a_indptr, vals, out_max, out_argmax):
n = len(out_max)
for i in range(n):
@@ -884,10 +975,8 @@ def _s_wise_max_argmax(a_indices, a_indptr, vals, out_max, out_argmax):
out_max[i] = vals[m]
out_argmax[i] = a_indices[m]

if numba_installed:
_s_wise_max_argmax = jit(nopython=True)(_s_wise_max_argmax)


@jit(nopython=True)
def _s_wise_max(a_indices, a_indptr, vals, out_max):
n = len(out_max)
for i in range(n):
@@ -898,20 +987,15 @@ def _s_wise_max(a_indices, a_indptr, vals, out_max):
m = j
out_max[i] = vals[m]

if numba_installed:
_s_wise_max = jit(nopython=True)(_s_wise_max)


@jit(nopython=True)
def _find_indices(a_indices, a_indptr, sigma, out):
n = len(sigma)
for i in range(n):
for j in range(a_indptr[i], a_indptr[i+1]):
if sigma[i] == a_indices[j]:
out[i] = j

if numba_installed:
_find_indices = jit(nopython=True)(_find_indices)


@jit(nopython=True)
def _has_sorted_sa_indices(s_indices, a_indices):
63 changes: 62 additions & 1 deletion quantecon/markov/tests/test_ddp.py
@@ -12,7 +12,7 @@
from numpy.testing import assert_array_equal, assert_allclose, assert_raises
from nose.tools import eq_, ok_

from quantecon.markov import DiscreteDP
from quantecon.markov import DiscreteDP, backward_induction


class TestDiscreteDP:
@@ -177,6 +177,43 @@ def test_ddp_sorting():
assert_array_equal(ddp_Q, Q)


class TestFiniteHorizon:
def setUp(self):
# From Puterman 2005, Section 3.2, Section 4.6.1
# "single-product stochastic inventory control"
s_indices = [0, 0, 0, 0, 1, 1, 1, 2, 2, 3]
a_indices = [0, 1, 2, 3, 0, 1, 2, 0, 1, 0]
R = [ 0., -1., -2., -5., 5., 0., -3., 6., -1., 5.]
Q = [[ 1. , 0. , 0. , 0. ],
[ 0.75, 0.25, 0. , 0. ],
[ 0.25, 0.5 , 0.25, 0. ],
[ 0. , 0.25, 0.5 , 0.25],
[ 0.75, 0.25, 0. , 0. ],
[ 0.25, 0.5 , 0.25, 0. ],
[ 0. , 0.25, 0.5 , 0.25],
[ 0.25, 0.5 , 0.25, 0. ],
[ 0. , 0.25, 0.5 , 0.25],
[ 0. , 0.25, 0.5 , 0.25]]
beta = 1
self.ddp = DiscreteDP(R, Q, beta, s_indices, a_indices)

def test_backward_induction(self):
T = 3
# v_T = np.zeros(self.ddp.num_states)
vs_expected = [[67/16, 129/16, 194/16, 227/16],
[2, 25/4, 10, 21/2],
[0, 5, 6, 5],
[0, 0, 0, 0]]
sigmas_expected = [[3, 0, 0, 0],
[2, 0, 0, 0],
[0, 0, 0, 0]]

vs, sigmas = backward_induction(self.ddp, T)

assert_allclose(vs, vs_expected)
assert_array_equal(sigmas, sigmas_expected)
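
As a sanity check on the expected values above: with v_T = 0, the period T-1 values reduce to the best one-period reward in each state, which can be confirmed directly from the sa-pair data (a sketch reusing the arrays from setUp):

import numpy as np

s_indices = np.array([0, 0, 0, 0, 1, 1, 1, 2, 2, 3])
R = np.array([0., -1., -2., -5., 5., 0., -3., 6., -1., 5.])

v_T_minus_1 = np.array([R[s_indices == s].max() for s in range(4)])
print(v_T_minus_1)   # [0. 5. 6. 5.] -- matches vs_expected[2]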


def test_ddp_negative_inf_error():
n, m = 3, 2
R = np.array([[0, 1], [0, -np.inf], [-np.inf, -np.inf]])
@@ -213,6 +250,30 @@ def test_ddp_no_feasibile_action_error():
assert_raises(ValueError, DiscreteDP, R, Q, beta, s_indices, a_indices)


def test_ddp_beta_1_not_implemented_error():
n, m = 3, 2
R = np.array([[0, 1], [1, 0], [0, 1]])
Q = np.empty((n, m, n))
Q[:] = 1/n
beta = 1

ddp0 = DiscreteDP(R, Q, beta)
s_indices, a_indices = np.where(R > -np.inf)
R_sa = R[s_indices, a_indices]
Q_sa = Q[s_indices, a_indices]
ddp1 = DiscreteDP(R_sa, Q_sa, beta, s_indices, a_indices)
Q_sa_sp = sparse.csr_matrix(Q_sa)
ddp2 = DiscreteDP(R_sa, Q_sa_sp, beta, s_indices, a_indices)

solution_methods = \
['value_iteration', 'policy_iteration', 'modified_policy_iteration']

for ddp in [ddp0, ddp1, ddp2]:
assert_raises(NotImplementedError, ddp.evaluate_policy, np.zeros(n))
for method in solution_methods:
assert_raises(NotImplementedError, getattr(ddp, method))


if __name__ == '__main__':
import sys
import nose