\n",
+ "text/plain": "Sampling 4 chains, 0 divergences \u001b[32m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[35m100%\u001b[0m \u001b[36m0:00:00\u001b[0m / \u001b[33m0:02:55\u001b[0m\n"
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "tabbable": null,
+ "tooltip": null
+ }
+ }
+ },
+ "version_major": 2,
+ "version_minor": 0
+ }
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
\ No newline at end of file
diff --git a/.jupyter_cache/global.db b/.jupyter_cache/global.db
index c7ca428..0496385 100644
Binary files a/.jupyter_cache/global.db and b/.jupyter_cache/global.db differ
diff --git a/cjs.qmd b/cjs.qmd
index e8aee74..d1479fd 100644
--- a/cjs.qmd
+++ b/cjs.qmd
@@ -30,18 +30,25 @@ The goal of the CJS model is to estimate survival, accounting for capture probab
: An example of the M-array from a four-year capture-recapture survey. The number recaptured, $m_{i,j}$, refers to the number of individuals released at time $i$ who were first recaptured at time $j$.
```{python}
+
+from pymc.distributions.dist_math import factln
+from scipy.linalg import circulant
+
+import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import arviz as az
import pymc as pm
import pytensor.tensor as pt
-from pymc.distributions.dist_math import factln
-from scipy.linalg import circulant
-
plt.style.use('fivethirtyeight')
plt.rcParams['axes.facecolor'] = 'white'
plt.rcParams['figure.facecolor'] = 'white'
+plt.rcParams['axes.spines.left'] = False
+plt.rcParams['axes.spines.right'] = False
+plt.rcParams['axes.spines.top'] = False
+plt.rcParams['axes.spines.bottom'] = False
+sns.set_palette("tab10")
def create_recapture_array(history):
"""Create the recapture array from a capture history."""
diff --git a/closed_cmr.qmd b/closed_cmr.qmd
index b521a45..7f04cf1 100644
--- a/closed_cmr.qmd
+++ b/closed_cmr.qmd
@@ -32,17 +32,25 @@ This produces a capture history for each individual, which allows us to estimate
```{python}
# libraries
+from pymc.distributions.dist_math import binomln, logpow
+
+import vapeplot
+import seaborn as sns
import numpy as np
import pandas as pd
import pymc as pm
import arviz as az
import matplotlib.pyplot as plt
-from pymc.distributions.dist_math import binomln, logpow
# plotting parameters
plt.style.use('fivethirtyeight')
plt.rcParams['axes.facecolor'] = 'white'
plt.rcParams['figure.facecolor'] = 'white'
+plt.rcParams['axes.spines.left'] = False
+plt.rcParams['axes.spines.right'] = False
+plt.rcParams['axes.spines.top'] = False
+plt.rcParams['axes.spines.bottom'] = False
+sns.set_palette("tab10")
# hyperparameters
SEED = 808
@@ -126,7 +134,7 @@ coords = {'species': ['pygmy', 'red_cheeked']}
with pm.Model(coords=coords) as M0:
# priors for the capture and inclusion probabilities
- psi = pm.Uniform('psi', 0, 1, dims='species')
+ psi = pm.Beta('psi', 0.001, 1, dims='species')
p = pm.Uniform('p', 0, 1, dims='species')
# likelihood for the summarized data
@@ -280,7 +288,7 @@ coords = {'alpha_coeffs': ['Intercept', 'B_Response']}
with pm.Model(coords=coords) as mb:
# priors for the capture and inclusion probabilities
- psi = pm.Uniform('psi', 0, 1)
+ psi = pm.Beta('psi', 0.001, 1)
Alpha = pm.Normal('Alpha', 0, 2, dims='alpha_coeffs')
nu = pm.math.dot(X, Alpha)
diff --git a/distance.qmd b/distance.qmd
index b298ddb..3ebaf39 100644
--- a/distance.qmd
+++ b/distance.qmd
@@ -20,6 +20,8 @@ Following @hooten2019, Chapter 24 and @royle2008, Chapter 7, I use the impala da
```{python}
#| fig-cap: Histogram of the number of detected impalas at varying distances.
#| label: fig-hist
+
+import seaborn as sns
import pymc as pm
import pytensor.tensor as pt
import matplotlib.pyplot as plt
@@ -30,6 +32,11 @@ import numpy as np
plt.style.use('fivethirtyeight')
plt.rcParams['axes.facecolor'] = 'white'
plt.rcParams['figure.facecolor'] = 'white'
+plt.rcParams['axes.spines.left'] = False
+plt.rcParams['axes.spines.right'] = False
+plt.rcParams['axes.spines.top'] = False
+plt.rcParams['axes.spines.bottom'] = False
+sns.set_palette("tab10")
# hyper parameters
M = 500
@@ -84,7 +91,7 @@ The issue is that $x$ is unobserved for the undetected individuals. To work arou
with pm.Model() as distance:
- psi = pm.Uniform('psi', 0, 1)
+ psi = pm.Beta('psi', 0.001, 1)
sigma = pm.Uniform('sigma', 0, U_SIGMA)
x_unobserved = pm.Uniform('x_unobserved', 0, U_X, shape=unobserved_count)
@@ -145,15 +152,15 @@ ax1.hist(sigma_samples, edgecolor='white', bins=30)
# ax0.set_xlim((100, M))
# axes labels
-ax0.set_xlabel('Abundance $N$')
+ax0.set_xlabel(r'Abundance $N$')
ax0.set_ylabel('Number of samples')
-ax1.set_xlabel('Detection range $\sigma$')
+ax1.set_xlabel(r'Detection range $\sigma$')
# add the point estimates
N_hat = N_samples.mean()
sigma_hat = sigma_samples.mean()
-ax0.text(200, 350, f'$\hat{{N}}$={N_hat:.1f}', ha='left', va='center')
-ax1.text(205, 350, f'$\hat{{\sigma}}$={sigma_hat:.1f}', ha='left', va='center')
+ax0.text(200, 350, rf'$\hat{{N}}$={N_hat:.1f}', ha='left', va='center')
+ax1.text(205, 350, rf'$\hat{{\sigma}}$={sigma_hat:.1f}', ha='left', va='center')
# the results from royle and dorazio (2008) for comparison
N_hat_royle = 179.9
diff --git a/docs/cjs.html b/docs/cjs.html
index 8c01d39..9b01364 100644
--- a/docs/cjs.html
+++ b/docs/cjs.html
@@ -128,6 +128,7 @@
+
@@ -404,72 +405,78 @@
Cormack-Jolly-Seber
-
-
import numpy as np
-import matplotlib.pyplot as plt
-import arviz as az
-import pymc as pm
-import pytensor.tensor as pt
-
-from pymc.distributions.dist_math import factln
-from scipy.linalg import circulant
-
-plt.style.use('fivethirtyeight')
-plt.rcParams['axes.facecolor'] = 'white'
-plt.rcParams['figure.facecolor'] = 'white'
-
-def create_recapture_array(history):
-    """Create the recapture array from a capture history."""
-    _, occasion_count = history.shape
-    interval_count = occasion_count - 1
-
-    recapture_array = np.zeros((interval_count, interval_count), int)
-    for occasion in range(occasion_count - 1):
-
-        # which individuals, captured at t, were later recaptured?
-        captured_this_time = history[:, occasion] == 1
-        captured_later = (history[:, (occasion + 1):] > 0).any(axis=1)
-        now_and_later = captured_this_time & captured_later
-
-        # when were they next recaptured?
-        remaining_history = history[now_and_later, (occasion + 1):]
-        next_capture_occasion = (remaining_history.argmax(axis=1)) + occasion
-
-        # how many of them were there?
-        ind, count = np.unique(next_capture_occasion, return_counts=True)
-        recapture_array[occasion, ind] = count
-
-    return recapture_array.astype(int)
+
+
from pymc.distributions.dist_math import factln
+from scipy.linalg import circulant
+
+import seaborn as sns
+import numpy as np
+import matplotlib.pyplot as plt
+import arviz as az
+import pymc as pm
+import pytensor.tensor as pt
+
+plt.style.use('fivethirtyeight')
+plt.rcParams['axes.facecolor'] = 'white'
+plt.rcParams['figure.facecolor'] = 'white'
+plt.rcParams['axes.spines.left'] = False
+plt.rcParams['axes.spines.right'] = False
+plt.rcParams['axes.spines.top'] = False
+plt.rcParams['axes.spines.bottom'] = False
+sns.set_palette("tab10")
+
+def create_recapture_array(history):
+    """Create the recapture array from a capture history."""
+    _, occasion_count = history.shape
+    interval_count = occasion_count - 1
+
+    recapture_array = np.zeros((interval_count, interval_count), int)
+    for occasion in range(occasion_count - 1):
+
+        # which individuals, captured at t, were later recaptured?
+        captured_this_time = history[:, occasion] == 1
+        captured_later = (history[:, (occasion + 1):] > 0).any(axis=1)
+        now_and_later = captured_this_time & captured_later
+
+        # when were they next recaptured?
+        remaining_history = history[now_and_later, (occasion + 1):]
+        next_capture_occasion = (remaining_history.argmax(axis=1)) + occasion
-def create_m_array(history):
-    '''Create the m-array from a capture history.'''
-
-    # leftmost column of the m-array
-    number_released = history.sum(axis=0)
+        # how many of them were there?
+        ind, count = np.unique(next_capture_occasion, return_counts=True)
+        recapture_array[occasion, ind] = count
+
+    return recapture_array.astype(int)
-    # core of the m-array
-    recapture_array = create_recapture_array(history)
-    number_recaptured = recapture_array.sum(axis=1)
-
-    # no animals that were released on the last occasion are recaptured
-    number_recaptured = np.append(number_recaptured, 0)
-    never_recaptured = number_released - number_recaptured
-
-    # add a dummy row at the end to make everything stack
-    zeros = np.zeros(recapture_array.shape[1])
-    recapture_array = np.row_stack((recapture_array, zeros))
-
-    # stack the relevant values into the m-array
-    m_array = np.column_stack((number_released, recapture_array, never_recaptured))
-
-    return m_array.astype(int)
-
-def fill_lower_diag_ones(x):
-    '''Fill the lower diagonal of a matrix with ones.'''
-    return pt.triu(x) + pt.tril(pt.ones_like(x), k=-1)
+def create_m_array(history):
+    '''Create the m-array from a capture history.'''
+
+    # leftmost column of the m-array
+    number_released = history.sum(axis=0)
+
+    # core of the m-array
+    recapture_array = create_recapture_array(history)
+    number_recaptured = recapture_array.sum(axis=1)
+
+    # no animals that were released on the last occasion are recaptured
+    number_recaptured = np.append(number_recaptured, 0)
+    never_recaptured = number_released - number_recaptured
+
+    # add a dummy row at the end to make everything stack
+    zeros = np.zeros(recapture_array.shape[1])
+    recapture_array = np.row_stack((recapture_array, zeros))
+
+    # stack the relevant values into the m-array
+    m_array = np.column_stack((number_released, recapture_array, never_recaptured))
+
+    return m_array.astype(int)
+
+def fill_lower_diag_ones(x):
+    '''Fill the lower diagonal of a matrix with ones.'''
+    return pt.triu(x) + pt.tril(pt.ones_like(x), k=-1)
As an example, I use the cormorant data from McCrea and Morgan (2014), Table 4.6. These data come from an eleven-year capture-recapture study between 1982 and 1993. These were breeding cormorants of unknown age. The data are summarized in the \(M\)-array below. The last column is the number that were never recaptured. The number released can be calculated from the array.
interval_count, T = cormorant.shape
number_recaptured = cormorant[:, :-1]
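As the passage notes, the number released can be recovered from the \(M\)-array itself. A minimal numpy sketch of that bookkeeping, using made-up counts rather than the cormorant values:

```python
import numpy as np

# hypothetical two-row m-array: recapture counts plus a final never-recaptured column
m_array = np.array([
    [4, 2, 1, 10],
    [0, 6, 3, 20],
])
number_recaptured = m_array[:, :-1]      # core of the m-array
number_released = m_array.sum(axis=1)    # recaptures plus never recaptured
```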
@@ -535,7 +542,7 @@
Cormack-Jolly-Seber
)
pm.model_to_graphviz(phit)
-
+
-
+
with phit: cjs_idata = pm.sample()
@@ -556,33 +563,18 @@
Cormack-Jolly-Seber
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [p, phi]
-Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 1 seconds.
+Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 3 seconds.
McCrea, Rachel S, and Byron JT Morgan. 2014. Analysis of Capture-Recapture Data. CRC Press.
+
+
@@ -394,26 +395,34 @@
Closed capture-recapture
This produces a capture history for each individual, which allows us to estimate the probability of capture and the number of individuals in the population \(N\).
Model \(M_0\)
-
+
# libraries
-import numpy as np
-import pandas as pd
-import pymc as pm
-import arviz as az
-import matplotlib.pyplot as plt
-from pymc.distributions.dist_math import binomln, logpow
-
-# plotting parameters
-plt.style.use('fivethirtyeight')
-plt.rcParams['axes.facecolor'] = 'white'
-plt.rcParams['figure.facecolor'] = 'white'
-
-# hyperparameters
-SEED = 808
-RNG = np.random.default_rng(SEED)
+from pymc.distributions.dist_math import binomln, logpow
+
+import vapeplot
+import seaborn as sns
+import numpy as np
+import pandas as pd
+import pymc as pm
+import arviz as az
+import matplotlib.pyplot as plt
+
+# plotting parameters
+plt.style.use('fivethirtyeight')
+plt.rcParams['axes.facecolor'] = 'white'
+plt.rcParams['figure.facecolor'] = 'white'
+plt.rcParams['axes.spines.left'] = False
+plt.rcParams['axes.spines.right'] = False
+plt.rcParams['axes.spines.top'] = False
+plt.rcParams['axes.spines.bottom'] = False
+sns.set_palette("tab10")
+
+# hyperparameters
+SEED = 808
+RNG = np.random.default_rng(SEED)
I explore fitting the simplest closed capture-recapture model, Model \(M_0,\) through parameter-expanded data-augmentation (PX-DA, Royle and Dorazio 2008). The idea with PX-DA is to augment the capture histories with \(M-n\) all-zero capture histories, where \(M\) is a hyperparameter that should be much greater than the true population size \(N,\) and \(n\) is the total number of individuals that were captured during the study. This allows us to treat the data as a zero-inflated binomial distribution (see the sketch below).
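A minimal sketch of what that zero-inflated binomial likelihood can look like, written with the `binomln` and `logpow` helpers imported in the setup above. This is an illustration of the idea, not necessarily the exact logp used in the notebook (PyMC's built-in `ZeroInflatedBinomial` is another route):

```python
import pytensor.tensor as pt
from pymc.distributions.dist_math import binomln, logpow

def zib_logp(value, n, p, psi):
    '''Zero-inflated binomial: a real individual (probability psi) with a
    Binomial(n, p) capture count, or a structural zero from the augmentation.'''
    binom = binomln(n, value) + logpow(p, value) + logpow(1 - p, n - value)
    zero_case = pt.logaddexp(pt.log(psi) + binom, pt.log1p(-psi))
    return pt.switch(pt.eq(value, 0), zero_case, pt.log(psi) + binom)
```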
-
+
def augment_history(history, M):
    '''Augment a capture history with all-zero histories.'''
@@ -429,7 +438,7 @@
Model \(M_0\)
return augmented
To demonstrate this approach, I use the salamander dataset from Bailey, Simons, and Pollock (2004), as demonstrated in Hooten and Hefley (2019), Chapter 24. These data were collected on two salamander species, the red-cheeked salamander (Plethodon jordani) and the pygmy salamander (Desmognathus wrighti), in Great Smoky Mountains National Park. The salamanders were counted in 15m by 15m square plots. In this case, we augment the history by setting \(M=1500\). There were \(n=92\) individual red-cheeked and \(n=132\) pygmy salamanders captured during the course of the survey.
-
+
def get_histories():
    '''Read, augment, and recombine the salamander histories.'''
@@ -480,7 +489,7 @@
Model \(M_0\)
with pm.Model(coords=coords) as M0:
    # priors for the capture and inclusion probabilities
-    psi = pm.Uniform('psi', 0, 1, dims='species')
+    psi = pm.Beta('psi', 0.001, 1, dims='species')
    p = pm.Uniform('p', 0, 1, dims='species')

    # likelihood for the summarized data
@@ -506,7 +515,7 @@
Model \(M_0\)
-
+
with M0: M0_idata = pm.sample()
@@ -514,33 +523,18 @@
Model \(M_0\)
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [psi, p]
-Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 8 seconds.
+Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 18 seconds.
For faster sampling, it’s better to separate the two species into two separate models. On my machine, the individual-species models finish sampling in 2-3 seconds, compared to 15-20 seconds for the two-species model. That said, the two-species model is somewhat more convenient.
Of course, the trace plots lack our true parameter of interest: the population size \(N.\) We can simulate the posterior of \(N\) as a derived quantity, using \(M\) and the posterior distribution of \(\psi\).
Figure 4: Posterior draws of \(N\) and \(p\) for both species of salamander.
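One way to carry out that derived-quantity simulation, sketched for a single species with stand-in posterior draws. The values of \(M\), \(K\), and \(n\) below follow the text but are placeholders here, and the real draws of \(\psi\) and \(p\) would come from the fitted model:

```python
import numpy as np

rng = np.random.default_rng(808)

# stand-ins for the posterior samples of psi and p (one species)
psi_samps = rng.beta(2, 20, size=4000)
p_samps = rng.beta(5, 15, size=4000)
M, K, n = 1500, 4, 92        # augmented size, capture occasions (stand-in), detected individuals

# probability that an augmented all-zero history belongs to a real individual
p_zero = (1 - p_samps) ** K
p_included = psi_samps * p_zero / (psi_samps * p_zero + 1 - psi_samps)

# posterior draws of N: detected individuals plus the included pseudo-individuals
N_samps = n + rng.binomial(M - n, p_included)
```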
@@ -645,7 +639,7 @@
Model \(M_0\)
Model \(M_b\)
Next, I fit model \(M_b,\) which accounts for the possibility that the capture probability changes after the animal is first caught. This could be from trap happiness, whereby animals are more likely to be trapped after their first capture. Conversely, it could be from subsequent trap avoidance.
# read in the microtus data
microtus = np.loadtxt('microtus.data.txt').astype(int)
@@ -689,7 +683,7 @@
Model \(M_b\)
with pm.Model(coords=coords) as mb:
    # priors for the capture and inclusion probabilities
-    psi = pm.Uniform('psi', 0, 1)
+    psi = pm.Beta('psi', 0.001, 1)
    Alpha = pm.Normal('Alpha', 0, 2, dims='alpha_coeffs')
    nu = pm.math.dot(X, Alpha)
@@ -719,7 +713,7 @@
Model \(M_b\)
-
+
with mb: mb_idata = pm.sample()
@@ -727,36 +721,21 @@
Model \(M_b\)
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [psi, Alpha]
-Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 1 seconds.
+Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 3 seconds.
The idea with distance sampling, also known as line-transect sampling, is that a surveyor traverses a transect, typically in a boat or a plane. As they survey, they note when they detect an individual, or a group, from the species of interest, along with the distance from the transect to the animal. They also note the angle to the animal(s), so that they can calculate the perpendicular distance from the animal to the transect. We assume that the probability of detecting an animal \(p\) decreases monotonically as the distance from the transect grows, e.g., \(p=\exp(-x^2/\sigma^2),\) where \(x\) is the distance and \(\sigma\) is a scale parameter to be estimated. These simple assumptions permit the estimation of the population size \(N\) as well as the density \(D.\)
Following Hooten and Hefley (2019), Chapter 24 and Royle and Dorazio (2008), Chapter 7, I use the impala data from Burnham, Anderson, and Laake (1980), who credits P. Hemingway with the dataset. In this dataset, 73 impalas were observed along a 60km transect. The distance values below are the perpendicular distances, in meters, from the transect.
Again, we treat this as a zero-inflated binomial model using PX-DA. The trick for doing so is to create a binary vector of length \(M\), \(y,\) that represents whether the individual was detected during the study. Then, combine the indicator with the distance vector \(x\) to create the full dataset \((x,y).\)
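A small numpy sketch of those two pieces, the detection indicator and the detection function from the text. The distances and the value of \(\sigma\) below are stand-ins for illustration; \(M=500\) and \(n=73\) follow the notebook:

```python
import numpy as np

rng = np.random.default_rng(808)

# stand-in perpendicular distances (m) for the n = 73 detected impalas
x_observed = rng.uniform(0, 200, size=73)
M = 500                                  # augmented population size

# detection indicator y: 1 for detected animals, 0 for the M - n pseudo-animals
n = len(x_observed)
y = np.concatenate([np.ones(n), np.zeros(M - n)])

# detection function from the text: p = exp(-x^2 / sigma^2)
def detection_prob(x, sigma):
    return np.exp(-x ** 2 / sigma ** 2)

p_at_observed = detection_prob(x_observed, sigma=130.0)   # sigma value is illustrative
```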
-
+
n = len(x_observed)
unobserved_count = M - n
zeros = np.zeros(unobserved_count)
@@ -412,7 +419,7 @@
This model samples more slowly than the models in the other notebooks, presumably because of the unobserved \(x.\) As in the closed capture-recapture notebook, we will have to simulate the posterior for \(N\) using the posterior distribution of \(\psi\) and \(M.\)
# ax0.set_xlim((100, M))

# axes labels
-ax0.set_xlabel('Abundance $N$')
+ax0.set_xlabel(r'Abundance $N$')
ax0.set_ylabel('Number of samples')
-ax1.set_xlabel('Detection range $\sigma$')
+ax1.set_xlabel(r'Detection range $\sigma$')

# add the point estimates
N_hat = N_samples.mean()
sigma_hat = sigma_samples.mean()
-ax0.text(200, 350, f'$\hat{{N}}$={N_hat:.1f}', ha='left', va='center')
-ax1.text(205, 350, f'$\hat{{\sigma}}$={sigma_hat:.1f}', ha='left', va='center')
+ax0.text(200, 350, rf'$\hat{{N}}$={N_hat:.1f}', ha='left', va='center')
+ax1.text(205, 350, rf'$\hat{{\sigma}}$={sigma_hat:.1f}', ha='left', va='center')

# the results from royle and dorazio (2008) for comparison
N_hat_royle = 179.9
@@ -541,7 +533,7 @@
Distance sampling
+
+
@@ -341,73 +342,79 @@
Jolly-Seber-Schwarz-Arnason
In this notebook, I explore the Jolly-Seber-Schwarz-Arnason (JSSA) model for estimating survival and abundance using capture-recapture data. JSSA is very similar to the CJS framework, except that it also models entry into the population, permitting estimation of the superpopulation size. Like the CJS notebook, I have drawn considerable inspiration from Austin Rochford’s notebook on capture-recapture in PyMC, the second chapter of my dissertation (a work in progress), and McCrea and Morgan (2014).
As a demonstration of the JSSA framework, I use the classic European dipper data of Lebreton et al. (1992). I first convert the dataset into the \(M\)-array, since the data are in capture-history format.
-
-
import numpy as np
-import matplotlib.pyplot as plt
-import arviz as az
-import pymc as pm
-import pytensor.tensor as pt
-
-from pymc.distributions.dist_math import factln
-from scipy.linalg import circulant
-
-plt.style.use('fivethirtyeight')
-plt.rcParams['axes.facecolor'] = 'white'
-plt.rcParams['figure.facecolor'] = 'white'
-
-def create_recapture_array(history):
-    """Create the recapture array from a capture history."""
-    _, occasion_count = history.shape
-    interval_count = occasion_count - 1
-
-    recapture_array = np.zeros((interval_count, interval_count), int)
-    for occasion in range(occasion_count - 1):
-
-        # which individuals, captured at t, were later recaptured?
-        captured_this_time = history[:, occasion] == 1
-        captured_later = (history[:, (occasion + 1):] > 0).any(axis=1)
-        now_and_later = captured_this_time & captured_later
-
-        # when were they next recaptured?
-        remaining_history = history[now_and_later, (occasion + 1):]
-        next_capture_occasion = (remaining_history.argmax(axis=1)) + occasion
-
-        # how many of them were there?
-        ind, count = np.unique(next_capture_occasion, return_counts=True)
-        recapture_array[occasion, ind] = count
-
-    return recapture_array.astype(int)
+
+
from pymc.distributions.dist_math import factln
+from scipy.linalg import circulant
+
+import seaborn as sns
+import numpy as np
+import matplotlib.pyplot as plt
+import arviz as az
+import pymc as pm
+import pytensor.tensor as pt
+
+plt.style.use('fivethirtyeight')
+plt.rcParams['axes.facecolor'] = 'white'
+plt.rcParams['figure.facecolor'] = 'white'
+plt.rcParams['axes.spines.left'] = False
+plt.rcParams['axes.spines.right'] = False
+plt.rcParams['axes.spines.top'] = False
+plt.rcParams['axes.spines.bottom'] = False
+sns.set_palette("tab10")
+
+def create_recapture_array(history):
+    """Create the recapture array from a capture history."""
+    _, occasion_count = history.shape
+    interval_count = occasion_count - 1
+
+    recapture_array = np.zeros((interval_count, interval_count), int)
+    for occasion in range(occasion_count - 1):
+
+        # which individuals, captured at t, were later recaptured?
+        captured_this_time = history[:, occasion] == 1
+        captured_later = (history[:, (occasion + 1):] > 0).any(axis=1)
+        now_and_later = captured_this_time & captured_later
+
+        # when were they next recaptured?
+        remaining_history = history[now_and_later, (occasion + 1):]
+        next_capture_occasion = (remaining_history.argmax(axis=1)) + occasion
-def create_m_array(history):
-    '''Create the m-array from a capture history.'''
-
-    # leftmost column of the m-array
-    number_released = history.sum(axis=0)
+        # how many of them were there?
+        ind, count = np.unique(next_capture_occasion, return_counts=True)
+        recapture_array[occasion, ind] = count
+
+    return recapture_array.astype(int)
-    # core of the m-array
-    recapture_array = create_recapture_array(history)
-    number_recaptured = recapture_array.sum(axis=1)
-
-    # no animals that were released on the last occasion are recaptured
-    number_recaptured = np.append(number_recaptured, 0)
-    never_recaptured = number_released - number_recaptured
-
-    # add a dummy row at the end to make everything stack
-    zeros = np.zeros(recapture_array.shape[1])
-    recapture_array = np.row_stack((recapture_array, zeros))
-
-    # stack the relevant values into the m-array
-    m_array = np.column_stack((number_released, recapture_array, never_recaptured))
-
-    return m_array.astype(int)
-
-def fill_lower_diag_ones(x):
-    '''Fill the lower diagonal of a matrix with ones.'''
-    return pt.triu(x) + pt.tril(pt.ones_like(x), k=-1)
+def create_m_array(history):
+    '''Create the m-array from a capture history.'''
+
+    # leftmost column of the m-array
+    number_released = history.sum(axis=0)
+
+    # core of the m-array
+    recapture_array = create_recapture_array(history)
+    number_recaptured = recapture_array.sum(axis=1)
+
+    # no animals that were released on the last occasion are recaptured
+    number_recaptured = np.append(number_recaptured, 0)
+    never_recaptured = number_released - number_recaptured
+
+    # add a dummy row at the end to make everything stack
+    zeros = np.zeros(recapture_array.shape[1])
+    recapture_array = np.row_stack((recapture_array, zeros))
+
+    # stack the relevant values into the m-array
+    m_array = np.column_stack((number_released, recapture_array, never_recaptured))
-dipper = np.loadtxt('dipper.csv', delimiter=',').astype(int)
-dipper[:5]
-
+    return m_array.astype(int)
+
+def fill_lower_diag_ones(x):
+    '''Fill the lower diagonal of a matrix with ones.'''
+    return pt.triu(x) + pt.tril(pt.ones_like(x), k=-1)
+
+dipper = np.loadtxt('dipper.csv', delimiter=',').astype(int)
+dipper[:5]
The JSSA model requires modeling the number of unmarked animals that were released during an occasion. We can calculate this using the \(m\)-array by subtracting the number of marked animals who were released from the total number of released animals.
Similar to the CJS model, this model requires a number of tricks to vectorize the operations. Many pertain to the distribution of the unmarked individuals. Similar to the occupancy notebook, I use a custom distribution to model the entrants into the population. Austin Rochford refers to this as an incomplete multinomial distribution.
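A small sketch of the bookkeeping described above, computed straight from a 0/1 capture history rather than from the \(m\)-array (toy values, not the dipper data):

```python
import numpy as np

# toy capture history: rows are individuals, columns are occasions
history = np.array([
    [0, 1, 1, 0],
    [1, 0, 1, 1],
    [0, 0, 1, 0],
])

total_released = history.sum(axis=0)                    # everyone captured at occasion t
first_capture = history.argmax(axis=1)                  # occasion of first capture
unmarked_released = np.bincount(first_capture, minlength=history.shape[1])
marked_released = total_released - unmarked_released    # previously marked recaptures
```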
As a motivating example, I use the breeding bird survey (BBS) data used by Dorazio and Royle (2005) and Royle and Dorazio (2008), Chapter 12. This is an \((n, J)\) matrix with the number of times each species was detected over \(K\) surveys, where \(n\) is the number of detected species and \(J\) is the number of surveyed sites. In this example, \(n=99\) species were detected at the \(J=50\) sites over the \(K=11\) surveys in New Hampshire. The BBS occurs on routes across the US. This dataset represents one route.
-
-
import numpy as np
-import pandas as pd
-import pymc as pm
-import arviz as az
-import pandas as pd
-import matplotlib.pyplot as plt
-from matplotlib.patches import Patch
-
-SEED = 808
-RNG = np.random.default_rng(SEED)
-
-plt.style.use('fivethirtyeight')
-plt.rcParams['axes.facecolor'] = 'white'
-plt.rcParams['figure.facecolor'] = 'white'
-
-def invlogit(x):
-    return 1 / (1 + np.exp(-x))
-
-# read in the detection data
-nh17 = pd.read_csv('detectionFreq.NH17.csv')
-Y = nh17.to_numpy()
-n, J = Y.shape
-K = Y.max()
+
+
import seaborn as sns
+import numpy as np
+import pandas as pd
+import pymc as pm
+import arviz as az
+import pandas as pd
+import matplotlib.pyplot as plt
+from matplotlib.patches import Patch
+
+SEED = 808
+RNG = np.random.default_rng(SEED)
+
+plt.style.use('fivethirtyeight')
+plt.rcParams['axes.facecolor'] = 'white'
+plt.rcParams['figure.facecolor'] = 'white'
+plt.rcParams['axes.spines.left'] = False
+plt.rcParams['axes.spines.right'] = False
+plt.rcParams['axes.spines.top'] = False
+plt.rcParams['axes.spines.bottom'] = False
+sns.set_palette("tab10")
+
+def invlogit(x):
+    return 1 / (1 + np.exp(-x))
-# convert the species names to ints
-species_idx, lookup = nh17.index.factorize() # lookup[int] returns the actual name
-
-# plot the detection frequencies
-fig, ax = plt.subplots(figsize=(4, 6))
-im = ax.imshow(Y[np.argsort(Y.sum(axis=1))], aspect='auto')
-ax.set_ylabel('Species')
-ax.set_xlabel('Site')
+# read in the detection data
+nh17 = pd.read_csv('detectionFreq.NH17.csv')
+Y = nh17.to_numpy()
+n, J = Y.shape
+K = Y.max()
+
+# convert the species names to ints
+species_idx, lookup = nh17.index.factorize() # lookup[int] returns the actual name
-# add a legend
-values = np.unique(Y.ravel())[1::2]
-colors = [ im.cmap(im.norm(value)) for value in values]
-patches = [ Patch(color=colors[i], label=f'{v}') for i, v in enumerate(values) ]
-plt.legend(title='Detections', handles=patches, bbox_to_anchor=(1, 1), loc=2, borderaxespad=0.)
-ax.grid(False)
-plt.show()
+# plot the detection frequencies
+fig, ax = plt.subplots(figsize=(4, 6))
+im = ax.imshow(Y[np.argsort(Y.sum(axis=1))], aspect='auto')
+ax.set_ylabel('Species')
+ax.set_xlabel('Site')
+
+# add a legend
+values = np.unique(Y.ravel())[1::2]
+colors = [ im.cmap(im.norm(value)) for value in values]
+patches = [ Patch(color=colors[i], label=f'{v}') for i, v in enumerate(values) ]
+plt.legend(title='Detections', handles=patches, bbox_to_anchor=(1, 1), loc=2, borderaxespad=0.)
+ax.grid(False)
+plt.show()
@@ -414,7 +421,7 @@
US Breeding Bird Survey
Known \(N\)
First, I fit the known \(N\) version of the model. The goal of this version is to estimate occurrence and detection for each species, without estimating species richness.
This notebook makes extensive use of the coords feature in PyMC. Coords makes it easier to incorporate the species-level effects via the multivariate normal. I use a \(\text{Normal}(0, 2)\) prior for both \(\mu\) parameters, and an LKJ Cholesky covariance prior for \(\mathbf{\Sigma}.\)
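A hedged sketch of that prior structure. The parameter names (`mu`, `chol`, `ab`) match the sampler output shown further down, but the coords, priors, and dimensions here are assumptions based on the description rather than a copy of the notebook's model block:

```python
import numpy as np
import pymc as pm

n_species = 99
coords = {'process': ['occurrence', 'detection'], 'species': np.arange(n_species)}

with pm.Model(coords=coords) as known_sketch:
    # community-level means for occurrence and detection (logit scale)
    mu = pm.Normal('mu', 0, 2, dims='process')

    # LKJ Cholesky prior for the 2x2 covariance of the species-level effects
    chol, corr, stds = pm.LKJCholeskyCov(
        'chol', n=2, eta=2, sd_dist=pm.Exponential.dist(1.0), compute_corr=True
    )

    # correlated occurrence/detection effects for each species
    ab = pm.MvNormal('ab', mu=mu, chol=chol, dims=('species', 'process'))
```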
Known \(
    pm.ZeroInflatedBinomial('Y', p=p, psi=psi, n=K, observed=Y)

pm.model_to_graphviz(known)
-
+
@@ -458,7 +465,7 @@
Known \(
-
+
with known: known_idata = pm.sample()
@@ -466,36 +473,23 @@
Known \(
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [mu, chol, ab]
-Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 43 seconds.
+/Users/philtpatton/miniforge3/envs/pymc/lib/python3.12/site-packages/pytensor/compile/function/types.py:959: RuntimeWarning: invalid value encountered in accumulate
+ self.vm()
+Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 47 seconds.
Next, I train the unknown \(N\) version of the model. Like many other notebooks in this series, it relies on augmenting the detection histories with all-zero histories. These represent the detection histories for species that may use the study site, but were not detected over the \(K=11\) surveys. I also augment the species names in the coords dict, such that we can still use the dims argument in the multivariate normal. Mirroring Royle and Dorazio (2008), I augment the history with \(M - n\) all-zero histories, where \(M=250\) and \(n\) is the number of species detected during the survey.
Similar to the occupancy notebook, I use a CustomDist to model the augmented history. This accounts for the “row-level” zero-inflation, whereby we know that the species is included in the super community if it was detected along the BBS route. The only difference with this logp is that it uses a ZeroInflatedBinomial distribution under the hood, rather than a Bernoulli, and uses the parameter \(\Omega\) to account for the row-level inflation.
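A rough sketch, under the assumptions above, of what such a row-level logp can look like. It is an illustration of the idea rather than the notebook's exact `CustomDist`; the helper name and signature are hypothetical:

```python
import pymc as pm
import pytensor.tensor as pt

def community_logp(value, omega, psi, p, n_trials):
    '''Row-level zero inflation on top of a site-level zero-inflated binomial.'''
    # per-site ZIB logp, summed over the J sites in each (possibly augmented) row
    row_logp = pm.logp(
        pm.ZeroInflatedBinomial.dist(psi=psi, n=n_trials, p=p), value
    ).sum(axis=-1)
    all_zero = pt.eq(value.sum(axis=-1), 0)
    included = pt.log(omega) + row_logp
    # an all-zero row may also come from a species outside the supercommunity
    return pt.switch(all_zero, pt.logaddexp(included, pt.log1p(-omega)), included)

# used roughly as:
# pm.CustomDist('Y', omega, psi, p, K, logp=community_logp, observed=Y_augmented)
```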
-
+
M = 250
all_zero_history = np.zeros((M - n, J))
Y_augmented = np.row_stack((Y, all_zero_history))
@@ -572,7 +566,7 @@
Unknown
with pm.Model(coords=coords) as unknown:
    # priors for inclusion
-    omega = pm.Uniform('omega', 0, 1)
+    omega = pm.Beta('omega', 0.001, 1)

    # priors for community-level means for detection and occurrence
    mu = pm.Normal('mu', 0, 2, dims='process')
@@ -606,7 +600,7 @@
Unknown )pm.model_to_graphviz(unknown)
-
+
@@ -619,7 +613,7 @@
Unknown
+
with unknown: unknown_idata = pm.sample()
@@ -627,41 +621,143 @@
Unknown
-
-
+
+
+
+
+
+
+
+
We see some warnings about the effective sample size and the \(\hat{R}\) statistic. Some of these warnings may just relate to the individual random effects.
Unknown \(N.\) This is slightly more complicated than before since there is an additional level of zero-inflation (included and never detected, or not included) in this model compared to the occupancy model (present and never detected, or not present).
-
# relevant posterior samples
-post = az.extract(unknown_idata)
-o_samps = post.omega.to_numpy()
-psi_samps = post.psi.to_numpy()[n:, :]
-p_samps = post.p.to_numpy()[n:, :]
-
-# probability that the animal was never detected during the survey if present
-p_not_detected = (1 - p_samps) ** K
-
-# probability of a zero detection history
-p_zero_hist = psi_samps * p_not_detected + (1 - psi_samps)
-
-# probability that the species was included given the all-zero history
-p_included = (o_samps * p_zero_hist ** J) / (o_samps * p_zero_hist ** J + (1 - o_samps))
-
-# posterior samples of N
-number_undetected = RNG.binomial(1, p_included).sum(axis=0)
-N_samps = n + number_undetected
-
-# posterior distribution
-N_hat_royle = 138
-fig, ax = plt.subplots(figsize=(6, 4))
-ax.hist(N_samps, edgecolor='white', bins=25)
-ax.set_xlabel('Species richness $N$')
-ax.set_ylabel('Posterior samples')
-ax.axvline(N_hat_royle, linestyle='--', color='C1')
-ax.axvline(N_samps.mean(), linestyle='--', color='C2')
-plt.show()
+
# relevant posterior samples
+post = az.extract(unknown_idata)
+o_samps = post.omega.to_numpy()
+psi_samps = post.psi.to_numpy()[n:, :]
+p_samps = post.p.to_numpy()[n:, :]
+
+# probability that the animal was never detected during the survey if present
+p_not_detected = (1 - p_samps) ** K
+
+# probability of a zero detection history
+p_zero_hist = psi_samps * p_not_detected + (1 - psi_samps)
+
+# probability that the species was included given the all-zero history
+p_included = (o_samps * p_zero_hist ** J) / (o_samps * p_zero_hist ** J + (1 - o_samps))
+
+# posterior samples of N
+number_undetected = RNG.binomial(1, p_included).sum(axis=0)
+N_samps = n + number_undetected
+
+# posterior distribution
+N_hat_royle = 138
+fig, ax = plt.subplots(figsize=(6, 4))
+ax.hist(N_samps, edgecolor='white', bins=25)
+ax.set_xlabel('Species richness $N$')
+ax.set_ylabel('Posterior samples')
+ax.axvline(N_hat_royle, linestyle='--', color='C1')
+ax.axvline(N_samps.mean(), linestyle='--', color='C2')
+plt.show()
In this notebook, I train the simplest possible SCR model, SCR0 (Royle et al. 2013, chap. 5), where the goal is estimating the true population size \(N\). Similar to the other closed population notebooks, I do so using parameter-expanded data-augmentation (PX-DA). I also borrow the concept of the detection function from the distance sampling notebook.
As a motivating example, I use the ovenbird mist netting dataset provided by Murray Efford via the secr package in R. The design of the study is outlined in Efford, Dawson, and Robbins (2004) and Borchers and Efford (2008). In this dataset, ovenbirds were trapped in 44 mist nets over 8 to 10 consecutive days during the summers of 2005 to 2009.
-
import numpy as np
-import pandas as pd
-import matplotlib.pyplot as plt
-import pytensor.tensor as pt
-import pymc as pm
-import arviz as az
-from pymc.distributions.dist_math import binomln, logpow
-
-# hyper parameters
-SEED = 42
-RNG = np.random.default_rng(SEED)
-BUFFER = 100
-M = 200
-
-# plotting defaults
-plt.style.use('fivethirtyeight')
-plt.rcParams['axes.facecolor'] = 'white'
-plt.rcParams['figure.facecolor'] = 'white'
-
-def invlogit(x):
-    '''Inverse logit function'''
-    return 1 / (1 + np.exp(-x))
-
-def euclid_dist(X, S, library='np'):
-    '''Pairwise euclidian distance between points in (M, 2) and (N, 2) arrays'''
-    diff = X[np.newaxis, :, :] - S[:, np.newaxis, :]
-
-    if library == 'np':
-        return np.sqrt(np.sum(diff ** 2, axis=-1))
-
-    elif library == 'pm':
-        return pm.math.sqrt(pm.math.sum(diff ** 2, axis=-1))
-
-def half_normal(d, s, library='np'):
-    '''Half normal detection function.'''
-    if library == 'np':
-        return np.exp(-(d ** 2) / (2 * s ** 2))
-
-    elif library == 'pm':
-        return pm.math.exp(-(d ** 2) / (2 * s ** 2))
-
-def exponential(d, s, library='np'):
-    '''Negative exponential detection function.'''
-    if library == 'np':
-        return np.exp(-d / s)
-
-    elif library == 'pm':
-        return pm.math.exp(-d / s)
-
-# coordinates for each trap
-ovenbird_trap = pd.read_csv('ovenbirdtrap.txt', delimiter=' ')
-trap_count, _ = ovenbird_trap.shape
-
-# information about each trap
-trap_x = ovenbird_trap.x
-trap_y = ovenbird_trap.y
-X = ovenbird_trap[['x', 'y']].to_numpy()
-
-# define the state space around the traps
-x_max = trap_x.max() + BUFFER
-y_max = trap_y.max() + BUFFER
-x_min = trap_x.min() - BUFFER
-y_min = trap_y.min() - BUFFER
+
One difference between spatial and traditional (non-spatial) capture-recapture is the addition of the trap identifier in the capture history. Whereas a traditional capture history is [individual, occasion], a spatial capture history might be [individual, occasion, trap].
In the ovenbird example, I ignore the year dimension, pooling parameters across years, which allows for better estimation of the detection parameters. My hack for doing so is treating every band/year combination as a unique individual in a combined-year capture history. This is easy to implement, but it creates an awkward interpretation of \(N\) (see below).
Before estimating the parameters, I perform a small simulation. The simulation starts with a core idea of SCR: the activity center. The activity center \(\mathbf{s}_i\) is the most likely place that you’d find an individual \(i\) over the course of the trapping study. In this case, I assume that activity centers are uniformly distributed across the sample space.
I compute the probability of detection for individual \(i\) at trap \(j\) as \(p_{i,j}=g_0 \exp(-d_{i,j}^2/(2\sigma^2)),\) where \(g_0\) is the probability of detecting an individual when its activity center is at the trap, \(d_{i,j}\) is the euclidean distance between the trap and the activity center, and \(\sigma\) is the detection range parameter.
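A compact simulation sketch of those two ingredients, uniform activity centers and the half-normal detection probability. The trap coordinates, \(g_0\), and \(\sigma\) here are placeholders, not the ovenbird data:

```python
import numpy as np

rng = np.random.default_rng(42)
N, g0, sigma, buffer = 150, 0.05, 75.0, 100.0

# stand-in trap coordinates; the notebook reads these from ovenbirdtrap.txt
X = rng.uniform(0, 500, size=(44, 2))
x_min, y_min = X.min(axis=0) - buffer        # state space = traps plus a buffer
x_max, y_max = X.max(axis=0) + buffer

# activity centers uniform over the state space
S = np.column_stack([rng.uniform(x_min, x_max, N), rng.uniform(y_min, y_max, N)])

# detection probability p_ij = g0 * exp(-d_ij^2 / (2 * sigma^2))
d = np.sqrt(((S[:, None, :] - X[None, :, :]) ** 2).sum(axis=-1))
p = g0 * np.exp(-d ** 2 / (2 * sigma ** 2))  # shape (N, trap_count)
```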
-
+
# true population size
N = 150
@@ -627,7 +639,7 @@
Simulation
)
pm.model_to_graphviz(known)
-
+
@@ -640,7 +652,7 @@
Simulation
-
+
with known: known_idata = pm.sample()
@@ -648,38 +660,23 @@
Simulation
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [sx, sy, g0, sigma]
-Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 12 seconds.
+Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 30 seconds.
Now, I estimate the density \(D\) for the ovenbird population. Like distance sampling, SCR can robustly estimate the density of the population, regardless of the size of the state space. The difference between the model above and this one is that we use PX-DA to estimate the inclusion probability \(\psi,\) and subsequently \(N.\) First, I convert the DataFrame to a (n_detected, n_traps) array of binomial counts.
-
+
def get_Y(ch):
    '''Get a (individual_count, trap_count) array of detections.'''
@@ -805,7 +802,7 @@
The estimates are quite close to the maximum likelihood estimates, which I estimated using the secr package in R.
Finally, I estimate density \(D\) using the results. As in the closed capture-recapture and distance sampling notebooks, I use the posterior samples of \(\psi\) and \(M\) to sample the posterior of \(N.\) This \(N,\) however, has an awkward interpretation because I pooled across the years by combining all the detection histories. To get around this, I compute the average annual abundance by dividing by the total number of years in the sample. Then, I divide by the area of the state space.
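The arithmetic in that last step is simple; a sketch with placeholder values (the real draws of \(N\), the year count, and the state-space area come from the model and data above):

```python
import numpy as np

rng = np.random.default_rng(42)

N_samples = rng.integers(60, 120, size=4000)   # stand-in posterior draws of N (all years pooled)
n_years = 5                                    # 2005-2009 trapping seasons
area_ha = 50.0                                 # placeholder state-space area in hectares

D_samples = (N_samples / n_years) / area_ha    # ovenbirds per hectare, per year
```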
ax.axvline(D_mle, linestyle='--', color='C1')
ax.set_xlabel('Ovenbirds per hectare')
ax.set_ylabel('Number of samples')
-ax.text(1.4, 800, f'$\hat{{D}}$={D_samples.mean():.2f}', va='bottom', ha='left')
+ax.text(1.4, 800, rf'$\hat{{D}}$={D_samples.mean():.2f}', va='bottom', ha='left')
plt.show()
-
+
Figure 6: Posterior distribution of the density \(D\) of ovenbirds. The maximum likelihood estimate is shown by the dotted red line.
@@ -1014,97 +996,114 @@
Ovenbird density
-
I also plot the estimated activity centers for every detected individual, as well as the posterior distribution for two of the detected individuals.
+
+Sometimes, the location of the activity centers is of interest. Below, I plot the posterior median of the activity centers for the detected individuals.
-Figure 7: Estimated activity centers for the detected individuals, and posterior distributions for two of them.
+Figure 7: Estimated activity centers for the detected individuals.
-
Finally, I plot the posterior distribution of the detection function.
-Figure 8: Posterior distribution for the detection function. The line represents the posterior mean while the shaded area is the 96% interval.
+Figure 9: Posterior distribution for the detection function. The line represents the posterior mean while the shaded area is the 96% interval.
@@ -1134,6 +1133,9 @@
Ovenbird density
Royle, J Andrew, Richard B Chandler, Rahel Sollmann, and Beth Gardner. 2013. Spatial Capture-Recapture. Academic press.