% Options for packages loaded elsewhere
\PassOptionsToPackage{unicode}{hyperref}
\PassOptionsToPackage{hyphens}{url}
\PassOptionsToPackage{dvipsnames,svgnames,x11names}{xcolor}
%
\documentclass[
  12pt]{article}
\usepackage{amsmath,amssymb}
\usepackage{setspace}
\usepackage{iftex}
\ifPDFTeX
\usepackage[T1]{fontenc}
\usepackage[utf8]{inputenc}
\usepackage{textcomp} % provide euro and other symbols
\else % if luatex or xetex
\usepackage{unicode-math}
\defaultfontfeatures{Scale=MatchLowercase}
\defaultfontfeatures[\rmfamily]{Ligatures=TeX,Scale=1}
\fi
\usepackage{lmodern}
\ifPDFTeX\else
% xetex/luatex font selection
\fi
% Use upquote if available, for straight quotes in verbatim environments
\IfFileExists{upquote.sty}{\usepackage{upquote}}{}
\IfFileExists{microtype.sty}{% use microtype if available
\usepackage[]{microtype}
\UseMicrotypeSet[protrusion]{basicmath} % disable protrusion for tt fonts
}{}
\makeatletter
\@ifundefined{KOMAClassName}{% if non-KOMA class
\IfFileExists{parskip.sty}{%
\usepackage{parskip}
}{% else
\setlength{\parindent}{0pt}
\setlength{\parskip}{6pt plus 2pt minus 1pt}}
}{% if KOMA class
\KOMAoptions{parskip=half}}
\makeatother
\usepackage{xcolor}
\setlength{\emergencystretch}{3em} % prevent overfull lines
\setcounter{secnumdepth}{5}
% Make \paragraph and \subparagraph free-standing
\makeatletter
\ifx\paragraph\undefined\else
\let\oldparagraph\paragraph
\renewcommand{\paragraph}{
\@ifstar
\xxxParagraphStar
\xxxParagraphNoStar
}
\newcommand{\xxxParagraphStar}[1]{\oldparagraph*{#1}\mbox{}}
\newcommand{\xxxParagraphNoStar}[1]{\oldparagraph{#1}\mbox{}}
\fi
\ifx\subparagraph\undefined\else
\let\oldsubparagraph\subparagraph
\renewcommand{\subparagraph}{
\@ifstar
\xxxSubParagraphStar
\xxxSubParagraphNoStar
}
\newcommand{\xxxSubParagraphStar}[1]{\oldsubparagraph*{#1}\mbox{}}
\newcommand{\xxxSubParagraphNoStar}[1]{\oldsubparagraph{#1}\mbox{}}
\fi
\makeatother
\providecommand{\tightlist}{%
\setlength{\itemsep}{0pt}\setlength{\parskip}{0pt}}\usepackage{longtable,booktabs,array}
\usepackage{calc} % for calculating minipage widths
% Correct order of tables after \paragraph or \subparagraph
\usepackage{etoolbox}
\makeatletter
\patchcmd\longtable{\par}{\if@noskipsec\mbox{}\fi\par}{}{}
\makeatother
% Allow footnotes in longtable head/foot
\IfFileExists{footnotehyper.sty}{\usepackage{footnotehyper}}{\usepackage{footnote}}
\makesavenoteenv{longtable}
\usepackage{graphicx}
\makeatletter
\def\maxwidth{\ifdim\Gin@nat@width>\linewidth\linewidth\else\Gin@nat@width\fi}
\def\maxheight{\ifdim\Gin@nat@height>\textheight\textheight\else\Gin@nat@height\fi}
\makeatother
% Scale images if necessary, so that they will not overflow the page
% margins by default, and it is still possible to overwrite the defaults
% using explicit options in \includegraphics[width, height, ...]{}
\setkeys{Gin}{width=\maxwidth,height=\maxheight,keepaspectratio}
% Set default figure placement to htbp
\makeatletter
\def\fps@figure{htbp}
\makeatother
\addtolength{\oddsidemargin}{-.5in}%
\addtolength{\evensidemargin}{-1in}%
\addtolength{\textwidth}{1in}%
\addtolength{\textheight}{1.7in}%
\addtolength{\topmargin}{-1in}%
\usepackage{tikz}
\usetikzlibrary{decorations.pathreplacing}
\usetikzlibrary{positioning}
\usetikzlibrary{shapes.arrows,shapes.symbols,arrows.meta}
\usepackage[no-weekday]{eukdate}
\usepackage{bm}
\usepackage{mathptmx}
\usepackage{amsmath} % Ensure amsmath is included for \eqref
\usepackage[]{natbib}
\usepackage[top=2cm, bottom=2cm, left=2.5cm, right=2.5cm]{geometry}
\allowdisplaybreaks
\sloppy
\setlength{\parskip}{0pt}
\setlength{\parindent}{15pt}
\expandafter\def\expandafter\normalsize\expandafter{%
\normalsize
\setlength\abovedisplayskip{1ex}
\setlength\belowdisplayskip{1ex}
\setlength\abovedisplayshortskip{1ex}
\setlength\belowdisplayshortskip{1ex}
}
\setlength{\bibsep}{0pt plus 0.3ex}
\usepackage{microtype}
\usepackage[onlyrefs]{mathtools}
\setlength{\tabcolsep}{0.11cm}
\usepackage{threeparttable}
\makeatletter
\g@addto@macro\TPT@defaults{\linespread{1}\selectfont}
\def\fps@table{!tbh}
\def\fps@figure{!tbh}
\makeatother
\usepackage{caption}
\DeclareCaptionStyle{italic}[justification=centering]
{labelfont={bf},textfont={it},labelsep=colon}
\captionsetup[figure]{style=italic,format=hang,singlelinecheck=true}
\captionsetup[table]{style=italic,format=hang,singlelinecheck=true}
\widowpenalty=10000
\clubpenalty=10000
\brokenpenalty=10000
\makeatletter
\@ifpackageloaded{caption}{}{\usepackage{caption}}
\AtBeginDocument{%
\ifdefined\contentsname
\renewcommand*\contentsname{Table of contents}
\else
\newcommand\contentsname{Table of contents}
\fi
\ifdefined\listfigurename
\renewcommand*\listfigurename{List of Figures}
\else
\newcommand\listfigurename{List of Figures}
\fi
\ifdefined\listtablename
\renewcommand*\listtablename{List of Tables}
\else
\newcommand\listtablename{List of Tables}
\fi
\ifdefined\figurename
\renewcommand*\figurename{Figure}
\else
\newcommand\figurename{Figure}
\fi
\ifdefined\tablename
\renewcommand*\tablename{Table}
\else
\newcommand\tablename{Table}
\fi
}
\@ifpackageloaded{float}{}{\usepackage{float}}
\floatstyle{ruled}
\@ifundefined{c@chapter}{\newfloat{codelisting}{h}{lop}}{\newfloat{codelisting}{h}{lop}[chapter]}
\floatname{codelisting}{Listing}
\newcommand*\listoflistings{\listof{codelisting}{List of Listings}}
\usepackage{amsthm}
\theoremstyle{plain}
\newtheorem{proposition}{Proposition}[section]
\theoremstyle{remark}
\AtBeginDocument{\renewcommand*{\proofname}{Proof}}
\newtheorem*{remark}{Remark}
\newtheorem*{solution}{Solution}
\newtheorem{refremark}{Remark}[section]
\newtheorem{refsolution}{Solution}[section]
\makeatother
\makeatletter
\makeatother
\makeatletter
\@ifpackageloaded{caption}{}{\usepackage{caption}}
\@ifpackageloaded{subcaption}{}{\usepackage{subcaption}}
\makeatother
\ifLuaTeX
\usepackage{selnolig} % disable illegal ligatures
\fi
\usepackage[]{natbib}
\bibliographystyle{agsm}
\usepackage{bookmark}
\IfFileExists{xurl.sty}{\usepackage{xurl}}{} % add URL line breaks if available
\urlstyle{same} % disable monospaced font for URLs
\hypersetup{
pdftitle={Online conformal inference for multi-step time series forecasting},
pdfauthor={Xiaoqian Wang; Rob J Hyndman},
pdfkeywords={Conformal prediction, Coverage
guarantee, Distribution-free inference, Exchangeability, Weighted
quantile estimate},
colorlinks=true,
linkcolor={blue},
filecolor={Maroon},
citecolor={Blue},
urlcolor={Blue},
pdfcreator={LaTeX via pandoc}}
\begin{document}
\def\spacingset#1{\renewcommand{\baselinestretch}%
{#1}\small\normalsize} \spacingset{1}
\renewcommand*{\arraystretch}{0.5} % Specify row height in a table globally
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\title{\bf Online conformal inference for multi-step time series
forecasting}
\author{
Xiaoqian Wang\thanks{Corresponding author. Xiaoqian Wang, Department of
Econometrics \& Business Statistics, Monash University, Clayton VIC,
3800, Australia. E-mail address:
\href{mailto:[email protected]}{[email protected]} (X.
Wang).} \vspace{0.2em}\\
Department of Econometrics \& Business Statistics, Monash
University \vspace{0.2em}\\
and \vspace{0.2em}\\Rob J Hyndman \vspace{0.2em}\\
Department of Econometrics \& Business Statistics, Monash
University \vspace{0.2em}\\
}
\maketitle
\bigskip
\bigskip
\begin{abstract}
We consider the problem of constructing distribution-free prediction
intervals for multi-step time series forecasting, with a focus on the
temporal dependencies inherent in multi-step forecast errors. We
establish that the optimal \(h\)-step-ahead forecast errors exhibit
serial correlation up to lag \((h-1)\) under a general non-stationary
autoregressive data generating process. To leverage these properties, we
propose the Autocorrelated Multi-step Conformal Prediction (AcMCP)
method, which effectively incorporates autocorrelations in multi-step
forecast errors, resulting in more statistically efficient prediction
intervals. This method ensures theoretical long-run coverage guarantees
for multi-step prediction intervals, though we note that increased
forecasting horizons may exacerbate deviations from the target coverage,
particularly in the context of limited sample sizes. Additionally, we
extend several easy-to-implement conformal prediction methods,
originally designed for single-step forecasting, to accommodate
multi-step scenarios. Through empirical evaluations, including
simulations and applications to data, we demonstrate that AcMCP achieves
coverage that closely aligns with the target within local windows, while
providing adaptive prediction intervals that effectively respond to
varying conditions.
\end{abstract}
\noindent%
{\it Keywords:} Conformal prediction, Coverage
guarantee, Distribution-free inference, Exchangeability, Weighted
quantile estimate
\vfill
\newpage
%\spacingset{1.8} % DON'T change the spacing!
\setstretch{1.667}
\renewcommand{\theproposition}{\arabic{proposition}} % numbering propositions sequentially and individually by type and not by section
\setcounter{proposition}{0}
\section{Introduction}\label{sec-intro}
Conformal prediction \citep{vovk2005} is a powerful and flexible tool
for uncertainty quantification, distinguished by its simplicity,
generality, and ease of implementation. It constructs valid prediction
intervals that achieve nominal coverage without imposing stringent
assumptions on the data generating distribution, other than requiring
the data to be i.i.d. or, more generally, exchangeable. Its credibility
and potential make it widely applicable for quantifying the uncertainty
of predictions produced by any black-box machine learning model
\citep{shafer2008, papadopoulos2008, barber2021} or non-parametric model
\citep{lei2014}.
Three key classes of conformal prediction methods are widely used for
constructing distribution-free prediction intervals: split conformal
prediction \citep{vovk2005}, full conformal prediction \citep{vovk2005},
and jackknife+ \citep{barber2021}. The split conformal method, which
relies on a holdout set, offers computational efficiency but sacrifices
some statistical efficiency due to data splitting. In contrast, full
conformal prediction avoids data splitting, providing higher accuracy at
the cost of increased computational complexity. Jackknife+ strikes a
balance between these methods, offering a compromise between statistical
precision and computational demands. All three methods guarantee
coverage at the target level under the assumption of data
exchangeability.
Nevertheless, the data exchangeability assumption is often violated in
many applied domains, where challenges such as non-stationarity,
distributional drift, temporal and spatial dependencies are prevalent.
In response, several extensions to conformal prediction have been
proposed to handle non-exchangeable data. Notable examples include
methods for handling covariate shift
\citep{tibshirani2019, lei2021, yang2024}, online distribution shift
\citep{gibbs2021, zaffran2022, bastani2022}, label shift
\citep{podkopaev2021}, time series data
\citep{chernozhukov2018, gibbs2021, xu2021, xu2023, zaffran2022}, and
spatial prediction \citep{mao2024}, and methods based on certain
distributional assumptions of the data rather than exchangeability
\citep{oliveira2024, xu2021, xu2023}. Additionally, some methods propose
weighting nonconformity scores (e.g., prediction errors) differently,
either using non-data-dependent weights \citep{barber2023} or weights
based on observed feature values
\citep{tibshirani2019, guan2023, hore2023}.
Recently, several attempts have been made to enable conformal prediction
on time series data, where exchangeability obviously fails due to
inherent temporal dependencies. One line of research has focused on
developing conformal-type methods that offer coverage guarantees under
certain relaxations of exchangeability. For example, within the full
conformal prediction framework, \citet{chernozhukov2018} and
\citet{yu2022} construct prediction sets for time series by using a
group of permutations that are specifically designed to preserve the
dependence structure in the data, ensuring validity under weak
assumptions on the nonconformity score. In the split conformal
prediction framework, \citet{xu2021} and \citet{xu2023} extend conformal
prediction methods under classical nonparametric assumptions to achieve
asymptotic valid conditional coverage for time series.
\citet{barber2023} use weighted residual distributions to provide
robustness against distribution drift. Additionally,
\citet{oliveira2024} introduce a general framework based on
concentration inequalities and data decoupling properties of the data to
retain asymptotic coverage guarantees across several dependent data
settings.
In a separate strand of research, \citet{gibbs2021} develop adaptive
conformal inference in an online manner to manage temporal distribution
shifts and ensure long-run coverage guarantees. The basic idea is to
adapt the miscoverage rate, \(\alpha\), based on historical miscoverage
frequencies. Follow-up work has refined this idea by introducing
time-dependent step sizes to respond to arbitrary distribution shifts,
as seen in studies by \citet{bastani2022}, \citet{zaffran2022}, and
\citet{gibbs2024}. However, these methods may produce prediction
intervals that are either infinite or null. To address this issue,
recent research has proposed a generalized updating process that tracks
the quantile of the nonconformity score sequence, as discussed by
\citet{bhatnagar2023}, \citet{angelopoulos2024}, and
\citet{angelopoulos2024online}.
Existing conformal prediction methods for time series primarily focus on
single-step forecasting. However, many applications require forecasts
for multiple future time steps, not just one. Related research into
multi-step time series forecasting is limited and does not account for
the temporal dependencies inherent in multi-step forecasts. For example,
\citet{stankeviciute2021} integrate conformal prediction with recurrent
neural networks for multi-step forecasting and then apply Bonferroni
correction to achieve the desired coverage rate. This approach, however,
assumes data independence, which is often unrealistic for time
series data. \citet{yang2024ts} propose Bellman conformal inference, an
extension of adaptive conformal prediction, to control multi-step
miscoverage rates simultaneously at each time point \(t\) by minimizing
a loss function that balances the average interval length across
forecast horizons with miscoverage. While this method considers
multi-step intervals, it does not account for their temporal
dependencies and may be computationally intensive when solving the
associated optimisation problems at each time step. Additionally,
several extensions to multivariate targets have been explored; see,
e.g., \citet{schlembach2022} and \citet{sun2022}.
We employ a unified notation to formalize the mathematical
representation of conformal prediction for time series data. We consider
a general sequential setting in which we observe a time series
\(\{y_t\}_{t \geq 1}\) generated by an unknown data generating process
(DGP), which may depend on its own past, along with other exogenous
predictors, \(\bm{x}_t=(x_{1,t},\ldots,x_{p,t})^{\prime}\), and their
histories. The distribution of
\(\{(\bm{x}_t, y_t)\}_{t \geq 1} \subseteq \mathbb{R}^p \times \mathbb{R}\)
is allowed to vary over time, to accommodate non-stationary processes.
At each time point \(t\), we aim to forecast \(H\) steps into the
future, providing a \emph{prediction set} (which is a prediction
interval in this setting), \(\hat{\mathcal{C}}_{t+h|t}\), for the
realisation \(y_{t+h}\) for each \(h\in[H]\). The \(h\)-step-ahead
forecast uses the previously observed data
\(\{(\bm{x}_i, y_i)\}_{1 \leq i \leq t}\) along with the new information
of the exogenous predictors \(\{\bm{x}_{t+j}\}_{1\leq j\leq h}\). Note
that we can generate ex-ante forecasts by using forecasts of the
predictors based on information available up to and including time
\(t\). Alternatively, ex-post forecasts are generated assuming that
actual values of the predictors from the forecast period are available.
Given a nominal \emph{miscoverage rate} \(\alpha \in (0,1)\) specified
by the user, we expect the output \(\hat{\mathcal{C}}_{t+h|t}\) to be a
\emph{valid} prediction interval so that \(y_{t+h}\) falls within the
prediction interval \(\hat{\mathcal{C}}_{t+h|t}\) at least
\(100(1-\alpha)\%\) of the time.
Our goal is to achieve long-run coverage for multi-step univariate time
series forecasting. All the proposed methods are grounded in the split
conformal prediction framework and an online learning scheme, which are
well-suited to the sequential nature of time series data. First, we
extend several widely used conformal prediction methods that are
originally designed for single-step forecasting to accommodate
multi-step forecasting scenarios. Second, we provide theoretical proofs
demonstrating that the forecast errors of optimal \(h\)-step-ahead
forecasts are serially correlated up to at most lag \((h-1)\), and can be
expressed approximately as a linear combination of the forecast errors at
shorter horizons, under a general non-stationary
autoregressive data generating process. Third, we introduce the
autocorrelated multi-step conformal prediction (AcMCP) method, which
accounts for the autocorrelations of multi-step forecast errors. Our
method is proven to achieve long-run coverage guarantees without making
any assumptions on data distribution shifts. \citet{lopes2024} introduce
ConForME, a method that focuses on generating uniform prediction
intervals across the entire forecast horizon, aiming to ensure the
overall validity of these intervals. In contrast, our method targets
pointwise prediction intervals for each specific forecast horizon,
\(h\in[H]\). We also highlight that for \(t \ll \infty\), increasing the
forecast horizon \(h\) generally leads to greater deviations from the
target coverage, which aligns with our expectations. Finally, we
illustrate the practical utility of these proposed methods through two
simulations and two applications to electricity demand and eating-out
expenditure forecasting.
We developed the \texttt{conformalForecast} package for R, available at
\url{https://github.com/xqnwang/conformalForecast}, to implement the
proposed multi-step conformal prediction methods. All the data and code
to reproduce the experiments are made available at
\url{https://github.com/xqnwang/cpts}.
\section{Online learning with sequential splits}\label{sec-setup}
Let \(z_t = (\bm{x}_t, y_t)\) denote the data point (including the
response \(y_t\) and possibly predictors \(\bm{x}_t\)) at time \(t\).
Suppose that, at each time \(t\), we have a forecasting model
\(\hat{f}_t\) trained using the historical data \(z_{1:t}\). Throughout
the paper, we assume that the predictors are known into the future. In
this way, we perform ex-post forecasting and there is no additional
uncertainty introduced from forecasting the exogenous predictors. Using
the forecasting model \(\hat{f}_t\), we are able to produce \(H\)-step
point forecasts, \(\{\hat{y}_{t+h|t}\}_{h\in[H]}\), using the future
values for the predictors. We define the \emph{nonconformity score} as
the (signed) forecast error \[
s_{t+h|t}=\mathcal{S}(z_{1:t}, y_{t+h}):=y_{t+h}-\hat{f}_t(z_{1:t},\bm{x}_{(t+1):(t+h)})=y_{t+h}-\hat{y}_{t+h|t}.
\] The task is to employ conformal inference to build \(H\)-step
prediction intervals,
\(\{\hat{\mathcal{C}}_{t+h|t}^{\alpha}(z_{1:t},\bm{x}_{(t+1):(t+h)})\}_{h\in[H]}\),
at the target coverage level \(1-\alpha\). For brevity, we will use
\(\hat{\mathcal{C}}_{t+h|t}^{\alpha}\) to denote the \(h\)-step-ahead
\(100(1-\alpha)\%\) prediction interval.
\textbf{Sequential split.} In a time series context, it is inappropriate
to perform \emph{random splitting}, a standard strategy in much of the
conformal prediction literature, due to the temporal dependency present
in the data. Instead, throughout the conformal prediction methods
proposed in this paper, we use a \emph{sequential split} to preserve the
temporal structure. For example, the \(t\) available data points,
\(z_{1:t}\), are sequentially split into two consecutive sets, a
\emph{proper training set}
\(\mathcal{D}_{\text{tr}} = \{1,\ldots,t_r\}\) and a
\emph{calibration set}
\(\mathcal{D}_{\text{cal}} = \{t_r+1,\ldots,t\}\), where
\(t_c=t-t_r \gg H\),
\(\mathcal{D}_{\text{tr}} \cup \mathcal{D}_{\text{cal}} = \{1, \ldots, t\}\),
and
\(\mathcal{D}_{\text{tr}} \cap \mathcal{D}_{\text{cal}} = \varnothing\).
Here, ``proper'' means that the training set is used exclusively for
fitting the model, with no overlap into the calibration set, which is
essential for ensuring the validity of coverage in conformal prediction
\citep{papadopoulos2007}. With sequential splitting, multiple \(H\)-step
forecasts and their respective nonconformity scores can be computed on
the calibration set.
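For concreteness, a minimal R sketch of this split is given below; it
assumes \texttt{y} holds the \(t\) available observations, and the split
point \(t_r\) is an arbitrary illustrative choice rather than a
recommendation.
\begin{verbatim}
# Sequential split of z_{1:t} into a proper training set and a calibration set.
t_full <- length(y)
t_r    <- 500                  # proper training length (illustrative choice)
t_c    <- t_full - t_r         # calibration length, assumed much larger than H
D_tr   <- 1:t_r                # proper training indices {1, ..., t_r}
D_cal  <- (t_r + 1):t_full     # calibration indices {t_r + 1, ..., t}
\end{verbatim}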
\textbf{Online learning.} We will adapt the following generic online
learning framework for all conformal prediction methods to be discussed
in later sections. The entire procedure for the online learning
framework with sequential splits is also illustrated in
\autoref{fig-flowchart}. This framework updates prediction intervals as
new data points arrive, allowing us to assess their long-run coverage
behaviour. It adopts a standard rolling window evaluation strategy,
although expanding windows could easily be used instead.
\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\item
\emph{Initialization}. Train a forecasting model on the initial proper
training set \(z_{(1+t-t_r):t}\), setting \(t=t_r\). Then generate
\(H\)-step-ahead point forecasts \(\{\hat{y}_{t+h|t}\}_{h\in[H]}\) and
compute the corresponding nonconformity scores
\(\{s_{t+h|t}=\mathcal{S}(z_{(1+t-t_r):t}, y_{t+h})\}_{h\in[H]}\)
based on the true values \(H\) steps ahead,
i.e.~\(\{y_{t+h}\}_{h\in[H]}\).
\item
\emph{Recurring procedure}. Roll the training set forward by one data
point by setting \(t \rightarrow t+1\). Then repeat step 1 until the
nonconformity scores on the entire initial calibration set,
\(\{s_{t+h|t}\}_{t_r \leq t \leq t_r+t_c-h}\) for \(h\in[H]\), are
computed.
\item
\emph{Quantile estimation and prediction interval calculation}. Use
nonconformity scores obtained from the calibration set to perform
quantile estimation and compute \(H\)-step-ahead prediction intervals
on the test set.
\item
\emph{Online updating}. Continuously roll the training set and
calibration set forward, one data point at a time, to update the
nonconformity scores for calibration, and then repeat step 3 until
prediction intervals for the entire test set are obtained. That is,
\(\{\hat{\mathcal{C}}_{t+h|t}^{\alpha}\}_{t_r+t_c \leq t \leq T-H}\)
for \(h \in [H]\), where \(T-t_r-t_c\) is the length of the test set
used for testing coverage. Our goal is to achieve long-run coverage in
time.
\end{enumerate}
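To make the loop concrete, the following R sketch implements the rolling
refit and nonconformity score computation underlying steps 1, 2 and 4;
the toy series, window lengths, and the AR(2) base forecaster from the
\texttt{forecast} package (the model class also used in
Section~\ref{experiments}) are illustrative assumptions rather than part
of the framework.
\begin{verbatim}
# Rolling online learning loop: refit, forecast H steps ahead, and record
# nonconformity scores indexed by forecast origin. Illustrative sketch only.
library(forecast)
set.seed(1)
y      <- as.numeric(arima.sim(list(ar = c(0.8, -0.5)), n = 2000))  # toy series
H      <- 3; t_r <- 500; t_c <- 500; alpha <- 0.1
T_len  <- length(y)
fc     <- matrix(NA_real_, T_len, H)  # fc[t, h]     = yhat_{t+h|t}
scores <- matrix(NA_real_, T_len, H)  # scores[t, h] = s_{t+h|t}

for (t in t_r:(T_len - H)) {
  fit         <- Arima(y[(t - t_r + 1):t], order = c(2, 0, 0))  # rolling refit
  fc[t, ]     <- as.numeric(forecast(fit, h = H)$mean)
  scores[t, ] <- y[(t + 1):(t + H)] - fc[t, ]  # used only after y_{t+h} is observed
}
\end{verbatim}
Steps 3 and 4 then reduce to a quantile computation on a trailing block of
these scores; the method-specific versions are sketched in the following
sections.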
\begin{figure}[!hbtp]
\vspace*{-1.5cm}
\hspace*{-2.5cm}
\input figs/annotation.tex
\vspace*{-1.0cm}
\caption{Diagram of the online learning framework with sequential splits. White: unused data; Gray: training data; Black: forecasts in calibration set; Blue: forecasts in test set.}\label{fig-flowchart}
\end{figure}
\section{Extending conformal prediction for multi-step
forecasting}\label{sec-ext}
In this section, we apply the online learning framework outlined in
Section~\ref{sec-setup} to adapt several popular conformal prediction
methods in order to allow for multi-step forecasting problems. In time
series forecasting, when a model is properly specified and well-trained,
the forecasts that minimize the mean squared error (MSE) can be
considered optimal in the sense that they achieve the lowest possible
expected squared deviation between the forecasts and actual values. One
of the key properties of optimal forecast errors, which holds generally
in linear models, is that the variance of the forecast error
\(e_{t+h|t}\) is non-decreasing in \(h\)
\citep{tong1990, diebold1996, patton2007}. Therefore, we need to apply a
separate conformal prediction procedure for each \(h \in [H]\).
\subsection{Online multi-step split conformal
prediction}\label{online-multi-step-split-conformal-prediction}
Split conformal prediction \citep[SCP, also called inductive conformal
prediction,][]{papadopoulos2002, vovk2005, lei2018}, is a holdout method
for building prediction intervals using a pre-trained model in
regression settings. A key advantage of SCP is its ability to guarantee
coverage by assuming data exchangeability. Time series data are
inherently nonexchangeable due to their temporal dependence and
autocorrelation. Therefore, directly applying SCP to time series data
would violate the method's exchangeability assumption and compromise its
coverage guarantee.
Here we introduce online \textbf{multi-step split conformal prediction}
(MSCP) as a generalization of SCP to recursively update all
\(h\)-step-ahead prediction intervals over time. Instead of assuming
exchangeability, MSCP applies conformal inference in an online fashion,
updating prediction intervals as new data points are received.
Specifically, for each \(h \in [H]\), we consider the following simple
online update to construct prediction intervals on the test set:
\begin{equation}\phantomsection\label{eq-mscp}{
\hat{\mathcal{C}}_{t+h|t}^{\alpha} = \Bigg\{y\in\mathbb{R}: s_{t+h|t}^{y} \leq Q_{1-\alpha}\Bigg(\sum_{i=t-t_c+1}^{t}\frac{1}{t_c+1}\cdot\delta_{s_{i|i-h}}+\frac{1}{t_c+1}\cdot\delta_{+\infty}\Bigg)\Bigg\},
}\end{equation} where \(s_{t+h|t}^{y}:=\mathcal{S}(z_{1:t}, y)\) denotes
the \(h\)-step-ahead nonconformity score calculated at time \(t\) using
a hypothesized test observation \(y\), \(Q_\tau(\cdot)\) denotes the
\(\tau\)-quantile of its argument, and \(\delta_a\) denotes the point
mass at \(a\).
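To make Equation \eqref{eq-mscp} concrete, the R sketch below evaluates
the augmented quantile using the score matrix \texttt{scores} and point
forecasts \texttt{fc} built in the illustration of
Section~\ref{sec-setup}; pairing this upper bound with an analogous lower
quantile to obtain a two-sided interval is an implementation choice we
leave open here.
\begin{verbatim}
# (1 - alpha)-quantile of the augmented distribution in Equation (eq-mscp):
# the ceiling((1 - alpha)(t_c + 1))-th smallest calibration score, or +Inf
# when that rank exceeds t_c (the +Inf point mass is reached).
mscp_quantile <- function(cal_scores, level) {
  t_c <- length(cal_scores)
  k   <- ceiling(level * (t_c + 1))
  if (k > t_c) return(Inf)
  sort(cal_scores)[k]
}

# Horizon h at forecast origin t: the calibration scores are s_{i|i-h} for
# i = t - t_c + 1, ..., t, i.e. rows (t - t_c + 1 - h):(t - h) of scores.
h <- 2; t <- 1500
cal   <- scores[(t - t_c + 1 - h):(t - h), h]
q_hat <- mscp_quantile(cal, 1 - alpha)
upper <- fc[t, h] + q_hat   # {y : s <= q_hat} = (-Inf, yhat_{t+h|t} + q_hat]
\end{verbatim}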
\subsection{Online multi-step weighted conformal
prediction}\label{online-multi-step-weighted-conformal-prediction}
\citet{barber2023} propose a nonexchangeable conformal prediction
procedure (NexCP) that generalizes the SCP method to allow for some
sources of nonexchangeability. The core idea is that a higher weight
should be assigned to a data point that is believed to originate from
the same distribution as the test data. Note that NexCP assumes the
weights are fixed and data-independent. When the data are exchangeable,
NexCP offers the same coverage guarantees as SCP, while the coverage gap
is bounded in terms of the total variation distance between the original
and swapped nonconformity score vectors when exchangeability is violated. Thus the
coverage gap may be quite large in time series contexts.
The online \textbf{multi-step weighted conformal prediction} (MWCP)
method we propose here adapts the NexCP method to the online setting for
time series forecasting. MWCP uses a weighted quantile estimate for
constructing prediction intervals, in contrast to the MSCP definition in Equation \eqref{eq-mscp},
where all nonconformity scores for calibration are implicitly assigned
equal weight.
We choose fixed weights \(w_i = b^{t+1-i}\), \(b \in (0, 1)\) and
\(i=t-t_c+1,\ldots,t\), for nonconformity scores on the corresponding
calibration set. In this setting, weights decay exponentially as the
nonconformity scores get older, akin to the rationale behind the simple
exponential smoothing method in time series forecasting
\citep{hyndman2021}. Then for each \(h \in [H]\), MWCP updates the
\(h\)-step-ahead prediction interval: \[
\hat{\mathcal{C}}_{t+h|t}^{\alpha} = \Bigg\{y\in\mathbb{R}: s_{t+h|t}^{y} \leq Q_{1-\alpha}\Bigg(\sum_{i=t-t_c+1}^{t}\tilde{w}_i\cdot\delta_{s_{i|i-h}}+\tilde{w}_{t+1}\cdot\delta_{+\infty}\Bigg)\Bigg\},
\] where \(\tilde{w}_i\) and \(\tilde{w}_{t+1}\) are normalized weights
given by \[
\tilde{w}_i = \frac{w_i}{\sum_{i=t-t_c+1}^{t}w_i+1}, \text{ for } i \in \{t-t_c+1,\ldots,t\} \quad \text{and} \quad \tilde{w}_{t+1} = \frac{1}{\sum_{i=t-t_c+1}^{t}w_i+1}.
\]
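The weighted quantile step can be sketched as follows, again reusing
objects from the earlier illustrations; the value \(b = 0.99\) matches the
experimental setting of Section~\ref{experiments}, and the function simply
evaluates the weighted augmented distribution above.
\begin{verbatim}
# Weighted (1 - alpha)-quantile with decaying weights w_i = b^(t + 1 - i).
# cal_scores must be ordered from oldest to most recent calibration score.
mwcp_quantile <- function(cal_scores, level, b = 0.99) {
  t_c <- length(cal_scores)
  w   <- b^(t_c:1)              # oldest score gets b^t_c, most recent gets b^1
  wt  <- c(w, 1) / (sum(w) + 1) # normalised weights; last entry is for +Inf
  s   <- c(cal_scores, Inf)
  ord <- order(s)
  cdf <- cumsum(wt[ord])        # CDF of the weighted augmented distribution
  s[ord][which(cdf >= level)[1]]
}

q_hat <- mwcp_quantile(scores[(t - t_c + 1 - h):(t - h), h], 1 - alpha)
\end{verbatim}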
\subsection{Multi-step adaptive conformal
prediction}\label{multi-step-adaptive-conformal-prediction}
Next we extend the adaptive conformal prediction (ACP) method proposed
by \citet{gibbs2021} to address multi-step time series forecasting,
introducing the \textbf{multi-step adaptive conformal prediction} (MACP)
method. Assuming that
\(\beta \mapsto \mathbb{P}(y_{t+h} \in \hat{\mathcal{C}}_{t+h|t}^{\beta})\)
is continuous and non-increasing, with
\(\mathbb{P}(y_{t+h} \in \hat{\mathcal{C}}_{t+h|t}^{0}) = 1\) and
\(\mathbb{P}(y_{t+h} \in \hat{\mathcal{C}}_{t+h|t}^{1}) = 0\), an
optimal value \(\alpha_{t+h|t}^{*} \in [0,1]\) exists such that the
realised miscoverage rate of the corresponding prediction interval
closely approximates the nominal miscoverage rate \(\alpha\).
Specifically, for each \(h \in [H]\), we iteratively estimate
\(\alpha_{t+h|t}^{*}\) by updating a parameter \(\alpha_{t+h|t}\)
through a sequential adjustment process
\begin{equation}\phantomsection\label{eq-macp}{
\alpha_{t+h|t} := \alpha_{t+h-1|t-1} + \gamma(\alpha - \mathrm{err}_{t|t-h}).
}\end{equation} Then the \(h\)-step-ahead prediction interval is
computed using Equation \eqref{eq-mscp} by setting
\(\alpha = \alpha_{t+h|t}\). Here, \(\gamma > 0\) denotes a fixed step
size parameter, \(\alpha_{2h|h}\) denotes the initial estimate typically
set to \(\alpha\), and \(\mathrm{err}_{t|t-h}\) denotes the miscoverage
event
\(\mathrm{err}_{t|t-h} = \mathbb{1}\left\{y_t \notin \hat{\mathcal{C}}_{t|t-h}^{\alpha_{t|t-h}}\right\}\).
Equation \eqref{eq-macp} indicates that the correction to the estimation
of \(\alpha_{t+h|t}^{*}\) at time \(t+h\) is determined by the
historical miscoverage frequency up to time \(t\). At each iteration, we
raise the estimate of \(\alpha_{t+h|t}^{*}\) used for quantile
estimation at time \(t+h\) if
\(\hat{\mathcal{C}}_{t|t-h}^{\alpha_{t|t-h}}\) covered \(y_t\), whereas
we lower the estimate if \(\hat{\mathcal{C}}_{t|t-h}^{\alpha_{t|t-h}}\)
did not cover \(y_t\). Thus the miscoverage event has a delayed impact
on the estimation of \(\alpha_{t+h|t}^{*}\) over \(h\) periods,
indicating that the correction of the \(\alpha_{t+h|t}^{*}\) estimate
becomes less prompt with increasing values of \(h\). In particular,
Equation \eqref{eq-macp} reduces to the update for ACP for \(h=1\).
We do not consider the update equation
\(\alpha_{t+1|t-h+1} := \alpha_{t|t-h} + \gamma(\alpha - \mathrm{err}_{t|t-h})\)
in this context, as the available information at time \(t\) is
insufficient to estimate \(\alpha_{t+h|t}^{*}\) required for \(h\)-step
forecasts.
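The recursion can be sketched in R as follows for a single horizon \(h\),
again building on the earlier illustrations; the one-sided coverage check
mirrors the simplified MSCP sketch above, and \(\gamma = 0.005\) is the
experimental setting of Section~\ref{experiments}.
\begin{verbatim}
# MACP for one horizon h: adapt alpha_{t+h|t} via Equation (eq-macp), then plug
# it into the MSCP quantile. q_used[i] stores the quantile behind C_{i|i-h}.
gamma    <- 0.005
alpha_th <- alpha                       # initial estimate alpha_{2h|h}
q_used   <- rep(NA_real_, T_len)
for (t in (t_r + t_c + H):(T_len - H)) {
  if (!is.na(q_used[t])) {              # err_{t|t-h}: did C_{t|t-h} miss y_t?
    err      <- as.numeric(scores[t - h, h] > q_used[t])
    alpha_th <- alpha_th + gamma * (alpha - err)
  }
  lev <- 1 - alpha_th                   # may drift outside (0, 1)
  cal <- scores[(t - t_c + 1 - h):(t - h), h]
  q_used[t + h] <- if (lev <= 0) -Inf else mscp_quantile(cal, lev)  # -Inf: null set
}
\end{verbatim}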
Selecting the parameter \(\gamma\) is pivotal yet challenging.
\citet{gibbs2021} suggest setting \(\gamma\) in proportion to the degree
of variation of the unknown \(\alpha_{t}^{*}\) over time. Several
strategies have been proposed to avoid the necessity of selecting
\(\gamma\). For example, \citet{zaffran2022} use an adaptive aggregation
of multiple ACPs with a set of candidate values for \(\gamma\),
determining weights based on their historical performance.
\citet{bastani2022} propose a multivalid prediction algorithm in which
the prediction set is established by selecting a threshold from a
sequence of candidate thresholds. However, both previous methods fail to
promptly adapt to local changes. To address this limitation,
\citet{gibbs2024} suggest adaptively tuning the step size parameter
\(\gamma\) in an online setting, choosing an ``optimal'' value for
\(\gamma\) from a candidate set of values by assessing their historical
performance.
The theoretical coverage properties of ACP suggest that a larger value
for \(\gamma\) generally results in less deviation from the target
coverage. As there is no restriction on \(\alpha_{t+h|t}\) and it can
drift below \(0\) or above \(1\), a larger \(\gamma\) may lead to
frequent output of null or infinite prediction sets in order to quickly
adapt to the current miscoverage status.
\subsection{Multi-step conformal PID
control}\label{multi-step-conformal-pid-control}
We introduce \textbf{multi-step conformal PID control} method (hereafter
MPID), which extends the PID method \citep{angelopoulos2024}, originally
developed for one-step-ahead forecasting. For each individual forecast
horizon \(h\in[H]\), the iteration of the \(h\)-step-ahead quantile
estimate is given by \begin{equation}\phantomsection\label{eq-mpid}{
q_{t+h|t}=\underbrace{q_{t+h-1|t-1}+\eta (\mathrm{err}_{t|t-h}-\alpha)}_{\mathrm{P}}+\underbrace{r_t\Bigg(\sum_{i=h+1}^t (\mathrm{err}_{i|i-h}-\alpha)\Bigg)}_{\mathrm{I}}+\underbrace{\hat{s}_{t+h|t}}_{\mathrm{D}},
}\end{equation} where \(\eta > 0\) is a constant learning rate, and
\(r_t\) is a saturation function that adheres to the following
conditions\pagebreak[1]\begin{equation}\phantomsection\label{eq-saturation_h}{
x \geq c \cdot g(t-h) \Longrightarrow r_t(x) \geq b, \quad \text {and} \quad x \leq-c \cdot g(t-h) \Longrightarrow r_t(x) \leq -b,
}\end{equation} for constant \(b, c > 0\), and an admissible function
\(g\) that is sublinear, nonnegative, and nondecreasing. With this
updating equation, we can obtain all required \(h\)-step-ahead
prediction intervals using information available at time \(t\). When
\(h=1\), Equation \eqref{eq-mpid} simplifies to the PID update, which
guarantees long-run coverage. More importantly, Equation \eqref{eq-mpid}
represents a specific instance of Equation \eqref{eq-acmcp_1} that we
will introduce later, thereby ensuring long-run coverage for each
individual forecast horizon \(h\) according to
Proposition~\ref{prp-cov_acmcp}.
The P control in MPID shows a delayed correction of the quantile
estimate for a length of \(h\) periods. The underlying intuition is
similar to that of MACP: it increases (or decreases) the
\(h\)-step-ahead quantile estimate if the prediction set at time \(t\)
miscovered (or covered) the corresponding realisation. MACP can be
considered as a special case of the P control, while the P control has
the ability to prevent the generation of null or infinite prediction
sets after a sequence of miscoverage events.
The I control accounts for the cumulative historical coverage errors
associated with \(h\)-step-ahead prediction intervals during the update
process, thereby enhancing the stability of the interval coverage.
The D control involves \(\hat{s}_{t+h|t}\) as the \(h\)-step-ahead
forecast of the nonconformity score (i.e., the forecast error), produced
by any suitable forecasting model (or ``scorecaster'') trained using the
\(h\)-step-ahead nonconformity scores available up to and including time
\(t\). This module provides additional benefits only when the
scorecaster is well-designed and there are predictable patterns in the
nonconformity scores; otherwise, it may increase variability in the
coverage and prediction intervals.
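For a single horizon \(h\), the update in Equation \eqref{eq-mpid} can be
sketched as a small R function; the saturation function \(r_t\) and the
scorecaster forecast \(\hat{s}_{t+h|t}\) are supplied by the user, and
setting \texttt{shat = 0} simply drops the D module.
\begin{verbatim}
# One step of the multi-step conformal PID update (Equation (eq-mpid)).
#   q_prev:     q_{t+h-1|t-1}
#   err_t:      err_{t|t-h}, the miscoverage indicator observed at time t
#   err_cumsum: sum_{i = h+1}^{t} (err_{i|i-h} - alpha)
#   r_t:        saturation function;  shat: scorecaster forecast of s_{t+h|t}
mpid_update <- function(q_prev, err_t, err_cumsum, eta, alpha, r_t, shat = 0) {
  q_prev + eta * (err_t - alpha) +   # P: proportional correction, delayed by h
    r_t(err_cumsum) +                # I: integral of past coverage errors
    shat                             # D: scorecaster term
}

# e.g. mpid_update(1.2, 1, 3.5, eta = 0.05, alpha = 0.1, r_t = function(x) 0.1 * x)
# (a toy linear r_t for illustration; the experiments use a tan saturation)
\end{verbatim}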
\section{Autocorrelated multi-step conformal
prediction}\label{sec-acmcp}
In the PID method proposed by \citet{angelopoulos2024}, a notable
feature is the inclusion of a scorecaster, a model trained on the score
sequence to forecast the future score. The rationale behind it is to
identify any leftover signal in the score distribution not captured by
the base forecasting model. While this is appropriate in the context of
huge data sets and potentially weak learners, it is unlikely to be a
useful strategy for time series models. We expect to use a forecasting
model that leaves only white noise in the residuals (equivalent to the
one-step nonconformity scores). Moreover, the inclusion of a scorecaster
often only introduces variance to the quantile estimate, resulting in
inefficient (wider) prediction intervals.
On the other hand, in any non-trivial context, multi-step forecasts are
correlated with each other. Specifically, the \(h\)-step-ahead forecast
errors \(e_{t+h|t}\) will be correlated with the forecast errors from
the past \(h-1\) steps, as forecast errors accumulate over the forecast
horizon. However, no conformal prediction methods have taken this
potential dependence into account in their methodological construction.
In this section, we will explore the properties of multi-step forecast
errors and propose a novel conformal prediction method that accounts for
their autocorrelations.
\subsection{Properties of multi-step forecast errors}\label{sec-ppt}
We assume that a time series \(\{y_t\}_{t \geq 1}\) is generated by a
general non-stationary autoregressive process given by:
\begin{equation}\phantomsection\label{eq-dgp}{
y_t = f_{t}(y_{(t-d):(t-1)},\bm{x}_{(t-k):t}) + \varepsilon_t,
}\end{equation} where \(f_{t}\) is a nonlinear function in \(d\) lagged
values of \(y_t\), the current value of the exogenous predictors, along
with the preceding \(k\) values, and \(\varepsilon_t\) is a white noise
innovation with mean zero and constant variance. The sequence of model
coefficients that parameterizes the function \(f_t\) is restricted to
ensure that the stochastic process is locally stationary.
\begin{proposition}[Autocorrelations of multi-step optimal forecast
errors]\protect\hypertarget{prp-ar}{}\label{prp-ar}
Let \(\{y_t\}_{t \geq 1}\) be a time series generated by a general
non-stationary autoregressive process as given in Equation
\eqref{eq-dgp}, and assume that any exogenous predictors are known into
the future. The forecast errors for optimal \(h\)-step-ahead forecasts
can be approximately expressed as
\begin{equation}\phantomsection\label{eq-ar}{
e_{t+h|t} = \omega_{t+h} + \phi_1e_{t+h-1|t} + \cdots + \phi_{p}e_{t+h-p|t},
}\end{equation} where \(p=\min\{d, h-1\}\), and \(\omega_{t}\) is white
noise. Therefore, the optimal \(h\)-step-ahead forecast errors are at
most serially correlated to lag \((h-1)\).
\end{proposition}
Proposition~\ref{prp-ar} suggests that the optimal \(h\)-step-ahead
forecast error, \(e_{t+h|t}\), is serially correlated with the forecast
errors from at most the past \(h-1\) steps, i.e.,
\(e_{t+1|t}, \ldots, e_{t+h-1|t}\). However, the autocorrelation among
errors associated with optimal forecasts cannot be used to improve
forecasting performance, as it does not incorporate any new information
available when the forecast was made. If we could forecast the forecast
error, then we could improve the forecast, indicating that the initial
forecast could not have been optimal.
The proof of Proposition~\ref{prp-ar}, provided in
Section~\ref{sec-proof_ar}, suggests that, if \(f_t\) is a linear
autoregressive model, then the coefficients in Equation \eqref{eq-ar}
are the linear coefficients of the optimal forecasting model. However,
when \(f_t\) takes on a more complex nonlinear structure, the
coefficients become complicated functions of observed data and
unobserved model coefficients.
It is well-established in the forecasting literature that, for a
zero-mean covariance-stationary time series, the optimal linear
least-squares forecasts have \(h\)-step-ahead errors that are at most
MA\((h-1)\) process \citep{harvey1997, diebold2024}, a property that can
be derived using Wold's representation theorem. Here, we extend the
investigation to time series generated by a general non-stationary
autoregressive process, and provide the corresponding proposition. The
proof of this proposition is provided in Section~\ref{sec-proof_ma}.
\begin{proposition}[MA\((h-1)\) process for \(h\)-step-ahead optimal
forecast errors]\protect\hypertarget{prp-ma}{}\label{prp-ma}
Let \(\{y_t\}_{t \geq 1}\) be a time series generated by a general
non-stationary autoregressive process as given in Equation
\eqref{eq-dgp}, and assume that any exogenous predictors are known into
the future. Then the forecast errors of optimal \(h\)-step-ahead
forecasts follow an approximate MA(\(h-1\)) process
\begin{equation}\phantomsection\label{eq-ma}{
e_{t+h|t} = \omega_{t+h} + \theta_1\omega_{t+h-1} + \cdots + \theta_{h-1}\omega_{t+1},
}\end{equation} where \(\omega_{t}\) is white noise.
\end{proposition}
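As a simple illustration of Propositions~\ref{prp-ar} and \ref{prp-ma},
consider the stationary AR(1) special case of Equation \eqref{eq-dgp} with
no exogenous predictors, \(y_t = \phi y_{t-1} + \varepsilon_t\) with
\(|\phi| < 1\). The optimal forecasts are \(\hat{y}_{t+1|t} = \phi y_t\)
and \(\hat{y}_{t+2|t} = \phi^2 y_t\), so that
\[
e_{t+1|t} = \varepsilon_{t+1}
\quad\text{and}\quad
e_{t+2|t} = y_{t+2} - \phi^2 y_t = \varepsilon_{t+2} + \phi\,\varepsilon_{t+1} = \omega_{t+2} + \phi\, e_{t+1|t},
\]
with \(\omega_{t+2} = \varepsilon_{t+2}\). The two-step-ahead error is an
MA(1) process in the innovations, as in Proposition~\ref{prp-ma}, and is
serially correlated only with the one-step-ahead error from the same
origin, as in Proposition~\ref{prp-ar} with \(p = \min\{d, h-1\} = 1\).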
\subsection{The AcMCP method}\label{sec-novel}
We can exploit these properties of multi-step forecast errors, leading
to a new method that we call the \textbf{autocorrelated multi-step
conformal prediction} (AcMCP) method. Unlike extensions of existing
conformal prediction methods, which treat multi-step forecasting as
independent events (see Section~\ref{sec-ext}), the AcMCP method
integrates the autocorrelations inherent in multi-step forecast errors,
thereby making the output multi-step prediction intervals more
statistically efficient.
The AcMCP method updates the quantile estimate \(q_t\) in an online
setting to achieve the goal of long-run coverage. Specifically, the
iteration of the \(h\)-step-ahead quantile estimate is given by
\begin{equation}\phantomsection\label{eq-acmcp}{
q_{t+h|t}=q_{t+h-1|t-1}+\eta (\mathrm{err}_{t|t-h}-\alpha)+r_t\Bigg(\sum_{i=h+1}^t (\mathrm{err}_{i|i-h}-\alpha)\Bigg)+\tilde{e}_{t+h|t},
}\end{equation} for \(h\in[H]\). Obviously, the AcMCP method can be
viewed as a further extension of the PID method. Nevertheless, AcMCP
diverges from PID with several innovations and differences.
First, we are no longer confined to predicting just one step ahead.
Instead, we can make multi-step forecasts with accompanying theoretical
coverage guarantees, constructing distribution-free prediction intervals
for steps \(t+1,\ldots,t+H\) based on available information up to time
\(t\).
Additionally, in AcMCP, \(\tilde{e}_{t+h|t}\) is a forecast combination
of two simple models: one being an MA\((h-1)\) model trained on the
\(h\)-step-ahead forecast errors available up to and including time
\(t\) (i.e.~\(e_{1+h|1}, \ldots, e_{t|t-h}\)), and the other a linear
regression model trained by regressing \(e_{t+h|t}\) on forecast errors
from past steps (i.e.~\(e_{t+h-1|t}, \ldots, e_{t+1|t}\)). Thus, we
perform multi-step conformal prediction recursively, contrasting with
the independent approach employed in MPID. Moreover, the inclusion of
\(\tilde{e}_{t+h|t}\) is not intended to forecast the nonconformity
scores (i.e., forecast errors), but rather to incorporate
autocorrelations present in multi-step forecast errors within the
resulting multi-step prediction intervals.
\subsection{Coverage guarantees}\label{coverage-guarantees}
\begin{proposition}[]\protect\hypertarget{prp-cov_rt}{}\label{prp-cov_rt}
Let \(\{s_{t+h|t}\}_{t\in\mathbb{N}}\) be any sequence of numbers in
\([-b, b]\) for any \(h\in[H]\), where \(b>0\) may be infinite.
Assume that \(r_t\) is a saturation function obeying Equation
\eqref{eq-saturation_h}, for an admissible function \(g\). Then the
iteration
\(q_{t+h|t}=r_t\left(\sum_{i=h+1}^t(\mathrm{err}_{i|i-h}-\alpha)\right)\)
satisfies \begin{equation}\phantomsection\label{eq-cov_rt}{
\left|\frac{1}{T-h}\sum_{t=h+1}^{T}(\mathrm{err}_{t|t-h}-\alpha)\right| \leq \frac{c \cdot g(T-h) + h}{T-h},
}\end{equation} for any \(T \geq h+1\), where \(c>0\) is the constant in
Equation \eqref{eq-saturation_h}.
Therefore the prediction intervals obtained by the iteration yield the
correct long-run coverage; i.e.,
\(\lim _{T \rightarrow \infty} \frac{1}{T-h} \sum_{t=h+1}^T \mathrm{err}_{t|t-h} = \alpha\).
\end{proposition}
Proposition~\ref{prp-cov_rt} indicates that, for \(t \ll \infty\),
increasing the forecast horizon \(h\) tends to amplify deviations from
the target coverage, because the bound
\(\left(c \cdot g(T-h) + h\right)/(T-h)\) in Equation \eqref{eq-cov_rt}
generally increases with \(h\): the denominator \(T-h\) shrinks while the
\(h\) term in the numerator grows, given that the admissible function
\(g\) is sublinear, nonnegative, and
nondecreasing. This is consistent with expectations, as extending the
forecast horizon generally increases forecast uncertainty. As
predictions extend further into the future, more factors contribute to
variability and uncertainty. In this case, conformal prediction
intervals may not scale perfectly with the increasing uncertainty,
leading to a larger discrepancy between the desired and actual coverage.
The quantile iteration
\(q_{t+h|t}=q_{t+h-1|t-1}+\eta (\mathrm{err}_{t|t-h}-\alpha)\) can be
seen as a particular instance of the iteration outlined in
Proposition~\ref{prp-cov_rt} if we set \(q_{2h|h}=0\) without losing
generality. Thus, its coverage bounds can be easily derived as a result
of Proposition~\ref{prp-cov_rt}.
\begin{proposition}[]\protect\hypertarget{prp-cov_qt}{}\label{prp-cov_qt}
Let \(\{s_{t+h|t}\}_{t\in\mathbb{N}}\) be any sequence of numbers in
\([-b, b]\) for any \(h\in[H]\), where \(b>0\) may be infinite.
Then the iteration
\(q_{t+h|t}=q_{t+h-1|t-1}+\eta (\mathrm{err}_{t|t-h}-\alpha)\) satisfies
\[
\left|\frac{1}{T-h}\sum_{t=h+1}^{T}(\mathrm{err}_{t|t-h}-\alpha)\right| \leq \frac{b + \eta h}{\eta(T-h)},
\] for any learning rate \(\eta > 0\) and \(T \geq h+1\).
Therefore the prediction intervals obtained by the iteration yield the
correct long-run coverage; i.e.,
\(\lim _{T \rightarrow \infty} \frac{1}{T-h} \sum_{t=h+1}^T \mathrm{err}_{t|t-h} = \alpha\).
\end{proposition}
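For instance, taking illustrative values \(b = 10\), \(\eta = 0.1\),
\(T = 1000\) and \(h = 3\), the bound equals
\((10 + 0.1 \times 3)/(0.1 \times 997) \approx 0.103\), and it shrinks at
rate \(1/(T-h)\) as \(T\) grows.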
More importantly, Proposition~\ref{prp-cov_rt} is also adequate for
establishing the coverage guarantee of the proposed AcMCP method given
by Equation \eqref{eq-acmcp}. Following \citet{angelopoulos2024}, we
first reformulate Equation \eqref{eq-acmcp} as
\begin{equation}\phantomsection\label{eq-acmcp_1}{
q_{t+h|t}=\hat{q}_{t+h|t}+r_t\left(\sum_{i=h+1}^t (\mathrm{err}_{i|i-h}-\alpha)\right),
}\end{equation} where \(\hat{q}_{t+h|t}\) can be any function of the
past observations \(\{(\bm{x}_i, y_i)\}_{1 \leq i \leq t}\) and quantile
estimates \(q_{i+h|i}\) for \(i \leq t-1\). Taking
\(\hat{q}_{t+h|t}=q_{t+h-1|t-1}+\eta (\mathrm{err}_{t|t-h}-\alpha)+\tilde{e}_{t+h|t}\)
will recover Equation \eqref{eq-acmcp}. We can consider
\(\hat{q}_{t+h|t}\) as the forecast of the quantile \(q_{t+h|t}\) based
on available historical data. We then present the coverage guarantee for
AcMCP given by Equation \eqref{eq-acmcp_1}.
\begin{proposition}[]\protect\hypertarget{prp-cov_acmcp}{}\label{prp-cov_acmcp}
Let \(\{\hat{q}_{t+h|t}\}_{t\in\mathbb{N}}\) be any sequence of numbers
in \([-\frac{b}{2}, \frac{b}{2}]\), and
\(\{s_{t+h|t}\}_{t\in\mathbb{N}}\) be any sequence of numbers in
\([-\frac{b}{2},\frac{b}{2}]\), for any \(h\in[H]\), where \(b>0\) may be
infinite. Assume that \(r_t\) is a saturation function obeying Equation
\eqref{eq-saturation_h}, for an admissible function \(g\). Then the
prediction intervals obtained by the AcMCP iteration given by Equation
\eqref{eq-acmcp_1} yield the correct long-run coverage; i.e.,
\(\lim _{T \rightarrow \infty} \frac{1}{T-h} \sum_{t=h+1}^T \mathrm{err}_{t|t-h} = \alpha\).
\end{proposition}
\section{Experiments}\label{experiments}
We examine the empirical performance of the proposed multi-step
conformal prediction methods using two simulated data settings and two
real data examples. Throughout the experiments, we adhere to the
following parameter settings:
\begin{enumerate}
\def\labelenumi{(\alph{enumi})}
\tightlist
\item
we focus on the target coverage level \(1-\alpha=0.9\);
\item
for the MWCP method, we use \(b=0.99\) as per \citet{barber2023};
\item
following \citet{angelopoulos2024}, we use a step size parameter of
\(\gamma=0.005\) for the MACP method, a Theta model as the scorecaster
in the MPID method, and a learning rate of \(\eta=0.01\hat{B}_t\) for
quantile tracking in the MPID and AcMCP methods, where
\(\hat{B}_t=\max\{s_{t-\Delta+1|t-\Delta-h+1},\dots,s_{t|t-h}\}\) is
the highest score over a trailing window, and the window length
\(\Delta\) is set to be the same as the length of the calibration set;
\item
we adopt a nonlinear saturation function defined as
\(r_t(x)=K_{\mathrm{I}} \tan \left(x \log (t) / (t C_{\text {sat }})\right)\),
where \(\tan (x)=\operatorname{sign}(x) \cdot \infty\) for
\(x \notin[-\pi / 2, \pi / 2]\), and the constants
\(C_{\text {sat }}, K_{\mathrm{I}}>0\) are chosen heuristically, as
suggested by \citet{angelopoulos2024} (see the sketch following this list);
\item
we consider a clipped version of MACP, in which infinite intervals are
replaced by intervals based on the largest score seen so far.
\end{enumerate}
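Items (c) and (d) can be sketched in R as follows; the particular values
of \(K_{\mathrm{I}}\) and \(C_{\text{sat}}\) below are placeholders for
the heuristically chosen constants.
\begin{verbatim}
# Saturation function of item (d): r_t(x) = K_I * tan(x log(t) / (t C_sat)),
# with tan(x) := sign(x) * Inf outside (-pi/2, pi/2). The defaults for K_I and
# C_sat are placeholders only.
r_t_tan <- function(x, t, K_I = 1, C_sat = 50) {
  z <- x * log(t) / (t * C_sat)
  if (abs(z) >= pi / 2) return(sign(z) * Inf)
  K_I * tan(z)
}

# Learning rate of item (c): eta = 0.01 * B_hat_t, where B_hat_t is the highest
# h-step score over a trailing window of length Delta (the calibration length).
eta_t <- function(trailing_scores) 0.01 * max(trailing_scores)
\end{verbatim}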
\subsection{Simulated linear autoregressive
process}\label{simulated-linear-autoregressive-process}
We first consider a simulated stationary time series which is generated
from a simple AR\((2)\) process \[
y_t = 0.8y_{t-1} - 0.5y_{t-2} + \varepsilon_t,
\] where \(\varepsilon_t\) is white noise with error variance
\(\sigma^2 = 1\). After an appropriate burn-in period, we generate
\(N=5000\) data points. Under the sequential split and online learning
settings, we create training sets \(\mathcal{D}_{\text{tr}}\) and
calibration sets \(\mathcal{D}_{\text{cal}}\), each with a length of
\(500\). We use AR\((2)\) models to generate \(1\)- to \(3\)-step-ahead
point forecasts (i.e.~\(H=3\)), using the \texttt{Arima()} function from
the \texttt{forecast} R package \citep{hyndman2024}. The goal is to
generate prediction intervals using various proposed conformal
prediction methods and evaluate whether they can achieve the nominal
long-run coverage for each separate forecast horizon.
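The data generation and a single set of base forecasts for this setting
can be reproduced with the short R sketch below, where the seed and
burn-in length are arbitrary choices; feeding the resulting series into
the rolling loop sketched in Section~\ref{sec-setup} yields the score
matrix used by all of the conformal methods.
\begin{verbatim}
# AR(2) simulation setting: y_t = 0.8 y_{t-1} - 0.5 y_{t-2} + eps_t, sigma^2 = 1.
library(forecast)
set.seed(2024)                                        # arbitrary seed
y <- as.numeric(arima.sim(model = list(ar = c(0.8, -0.5)),
                          n = 5000, n.start = 100))   # N = 5000 after burn-in
fit <- Arima(y[1:500], order = c(2, 0, 0))            # AR(2) on one training window
fc  <- forecast(fit, h = 3)                           # 1- to 3-step point forecasts
\end{verbatim}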
\begin{figure}
\centering{
\includegraphics{cpts_files/figure-pdf/fig-AR2_cov-1.pdf}
}
\caption{\label{fig-AR2_cov}AR(2) simulation results showing rolling
coverage, mean and median interval width for each forecast horizon. The