% Options for packages loaded elsewhere
\PassOptionsToPackage{unicode}{hyperref}
\PassOptionsToPackage{hyphens}{url}
\PassOptionsToPackage{dvipsnames,svgnames,x11names}{xcolor}
%
\documentclass[
  12pt]{article}
\usepackage{amsmath,amssymb}
\usepackage{setspace}
\usepackage{iftex}
\ifPDFTeX
\usepackage[T1]{fontenc}
\usepackage[utf8]{inputenc}
\usepackage{textcomp} % provide euro and other symbols
\else % if luatex or xetex
\usepackage{unicode-math}
\defaultfontfeatures{Scale=MatchLowercase}
\defaultfontfeatures[\rmfamily]{Ligatures=TeX,Scale=1}
\fi
\usepackage{lmodern}
\ifPDFTeX\else
% xetex/luatex font selection
\fi
% Use upquote if available, for straight quotes in verbatim environments
\IfFileExists{upquote.sty}{\usepackage{upquote}}{}
\IfFileExists{microtype.sty}{% use microtype if available
\usepackage[]{microtype}
\UseMicrotypeSet[protrusion]{basicmath} % disable protrusion for tt fonts
}{}
\makeatletter
\@ifundefined{KOMAClassName}{% if non-KOMA class
\IfFileExists{parskip.sty}{%
\usepackage{parskip}
}{% else
\setlength{\parindent}{0pt}
\setlength{\parskip}{6pt plus 2pt minus 1pt}}
}{% if KOMA class
\KOMAoptions{parskip=half}}
\makeatother
\usepackage{xcolor}
\setlength{\emergencystretch}{3em} % prevent overfull lines
\setcounter{secnumdepth}{5}
% Make \paragraph and \subparagraph free-standing
\makeatletter
\ifx\paragraph\undefined\else
\let\oldparagraph\paragraph
\renewcommand{\paragraph}{
\@ifstar
\xxxParagraphStar
\xxxParagraphNoStar
}
\newcommand{\xxxParagraphStar}[1]{\oldparagraph*{#1}\mbox{}}
\newcommand{\xxxParagraphNoStar}[1]{\oldparagraph{#1}\mbox{}}
\fi
\ifx\subparagraph\undefined\else
\let\oldsubparagraph\subparagraph
\renewcommand{\subparagraph}{
\@ifstar
\xxxSubParagraphStar
\xxxSubParagraphNoStar
}
\newcommand{\xxxSubParagraphStar}[1]{\oldsubparagraph*{#1}\mbox{}}
\newcommand{\xxxSubParagraphNoStar}[1]{\oldsubparagraph{#1}\mbox{}}
\fi
\makeatother
\providecommand{\tightlist}{%
\setlength{\itemsep}{0pt}\setlength{\parskip}{0pt}}\usepackage{longtable,booktabs,array}
\usepackage{calc} % for calculating minipage widths
% Correct order of tables after \paragraph or \subparagraph
\usepackage{etoolbox}
\makeatletter
\patchcmd\longtable{\par}{\if@noskipsec\mbox{}\fi\par}{}{}
\makeatother
% Allow footnotes in longtable head/foot
\IfFileExists{footnotehyper.sty}{\usepackage{footnotehyper}}{\usepackage{footnote}}
\makesavenoteenv{longtable}
\usepackage{graphicx}
\makeatletter
\def\maxwidth{\ifdim\Gin@nat@width>\linewidth\linewidth\else\Gin@nat@width\fi}
\def\maxheight{\ifdim\Gin@nat@height>\textheight\textheight\else\Gin@nat@height\fi}
\makeatother
% Scale images if necessary, so that they will not overflow the page
% margins by default, and it is still possible to overwrite the defaults
% using explicit options in \includegraphics[width, height, ...]{}
\setkeys{Gin}{width=\maxwidth,height=\maxheight,keepaspectratio}
% Set default figure placement to htbp
\makeatletter
\def\fps@figure{htbp}
\makeatother
\addtolength{\oddsidemargin}{-.5in}%
\addtolength{\evensidemargin}{-1in}%
\addtolength{\textwidth}{1in}%
\addtolength{\textheight}{1.7in}%
\addtolength{\topmargin}{-1in}%
\usepackage{tikz}
\usetikzlibrary{decorations.pathreplacing}
\usetikzlibrary{positioning}
\usetikzlibrary{shapes.arrows,shapes.symbols,arrows.meta}
\usepackage[no-weekday]{eukdate}
\usepackage{bm}
\usepackage{mathptmx}
\usepackage{amsmath} % Ensure amsmath is included for \eqref
\usepackage[]{natbib}
\usepackage[top=2cm, bottom=2cm, left=2.5cm, right=2.5cm]{geometry}
\allowdisplaybreaks
\sloppy
\setlength{\parskip}{0pt}
\setlength{\parindent}{15pt}
\expandafter\def\expandafter\normalsize\expandafter{%
\normalsize
\setlength\abovedisplayskip{1ex}
\setlength\belowdisplayskip{1ex}
\setlength\abovedisplayshortskip{1ex}
\setlength\belowdisplayshortskip{1ex}
}
\setlength{\bibsep}{0pt plus 0.3ex}
\usepackage{microtype}
\usepackage[onlyrefs]{mathtools}
\setlength{\tabcolsep}{0.11cm}
\usepackage{threeparttable}
\makeatletter
\g@addto@macro\TPT@defaults{\linespread{1}\selectfont}
\def\fps@table{!tbh}
\def\fps@figure{!tbh}
\makeatother
\usepackage{caption}
\DeclareCaptionStyle{italic}[justification=centering]
{labelfont={bf},textfont={it},labelsep=colon}
\captionsetup[figure]{style=italic,format=hang,singlelinecheck=true}
\captionsetup[table]{style=italic,format=hang,singlelinecheck=true}
\widowpenalty=10000
\clubpenalty=10000
\brokenpenalty=10000
\makeatletter
\@ifpackageloaded{caption}{}{\usepackage{caption}}
\AtBeginDocument{%
\ifdefined\contentsname
\renewcommand*\contentsname{Table of contents}
\else
\newcommand\contentsname{Table of contents}
\fi
\ifdefined\listfigurename
\renewcommand*\listfigurename{List of Figures}
\else
\newcommand\listfigurename{List of Figures}
\fi
\ifdefined\listtablename
\renewcommand*\listtablename{List of Tables}
\else
\newcommand\listtablename{List of Tables}
\fi
\ifdefined\figurename
\renewcommand*\figurename{Figure}
\else
\newcommand\figurename{Figure}
\fi
\ifdefined\tablename
\renewcommand*\tablename{Table}
\else
\newcommand\tablename{Table}
\fi
}
\@ifpackageloaded{float}{}{\usepackage{float}}
\floatstyle{ruled}
\@ifundefined{c@chapter}{\newfloat{codelisting}{h}{lop}}{\newfloat{codelisting}{h}{lop}[chapter]}
\floatname{codelisting}{Listing}
\newcommand*\listoflistings{\listof{codelisting}{List of Listings}}
\usepackage{amsthm}
\theoremstyle{plain}
\newtheorem{proposition}{Proposition}[section]
\theoremstyle{remark}
\AtBeginDocument{\renewcommand*{\proofname}{Proof}}
\newtheorem*{remark}{Remark}
\newtheorem*{solution}{Solution}
\newtheorem{refremark}{Remark}[section]
\newtheorem{refsolution}{Solution}[section]
\makeatother
\makeatletter
\makeatother
\makeatletter
\@ifpackageloaded{caption}{}{\usepackage{caption}}
\@ifpackageloaded{subcaption}{}{\usepackage{subcaption}}
\makeatother
\ifLuaTeX
\usepackage{selnolig} % disable illegal ligatures
\fi
\usepackage[]{natbib}
\bibliographystyle{agsm}
\usepackage{bookmark}
\IfFileExists{xurl.sty}{\usepackage{xurl}}{} % add URL line breaks if available
\urlstyle{same} % disable monospaced font for URLs
\hypersetup{
pdftitle={Online conformal inference for multi-step time series forecasting},
pdfauthor={Xiaoqian Wang; Rob J Hyndman},
pdfkeywords={Conformal prediction, Coverage
guarantee, Distribution-free inference, Exchangeability, Weighted
quantile estimate},
colorlinks=true,
linkcolor={blue},
filecolor={Maroon},
citecolor={Blue},
urlcolor={Blue},
pdfcreator={LaTeX via pandoc}}
\begin{document}
\def\spacingset#1{\renewcommand{\baselinestretch}%
{#1}\small\normalsize} \spacingset{1}
\renewcommand*{\arraystretch}{0.5} % Specify row height in a table globally
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\title{\bf Online conformal inference for multi-step time series
forecasting}
\author{
Xiaoqian Wang\thanks{Corresponding author. Xiaoqian Wang, Department of
Econometrics \& Business Statistics, Monash University, Clayton VIC,
3800, Australia. E-mail address:
\href{mailto:[email protected]}{[email protected]} (X.
Wang).} \vspace{0.2em}\\
Department of Econometrics \& Business Statistics, Monash
University \vspace{0.2em}\\
and \vspace{0.2em}\\Rob J Hyndman \vspace{0.2em}\\
Department of Econometrics \& Business Statistics, Monash
University \vspace{0.2em}\\
}
\maketitle
\bigskip
\bigskip
\begin{abstract}
We consider the problem of constructing distribution-free prediction
intervals for multi-step time series forecasting, with a focus on the
temporal dependencies inherent in multi-step forecast errors. We
establish that the optimal \(h\)-step-ahead forecast errors exhibit
serial correlation up to lag \((h-1)\) under a general non-stationary
autoregressive data generating process. To leverage these properties, we
propose the Autocorrelated Multi-step Conformal Prediction (AcMCP)
method, which effectively incorporates autocorrelations in multi-step
forecast errors, resulting in more statistically efficient prediction
intervals. This method ensures theoretical long-run coverage guarantees
for multi-step prediction intervals, though we note that increased
forecasting horizons may exacerbate deviations from the target coverage,
particularly in the context of limited sample sizes. Additionally, we
extend several easy-to-implement conformal prediction methods,
originally designed for single-step forecasting, to accommodate
multi-step scenarios. Through empirical evaluations, including
simulations and applications to data, we demonstrate that AcMCP achieves
coverage that closely aligns with the target within local windows, while
providing adaptive prediction intervals that effectively respond to
varying conditions.
\end{abstract}
\noindent%
{\it Keywords:} Conformal prediction, Coverage
guarantee, Distribution-free inference, Exchangeability, Weighted
quantile estimate
\vfill
\newpage
%\spacingset{1.8} % DON'T change the spacing!
\setstretch{1.667}
\renewcommand{\theproposition}{\arabic{proposition}} % numbering propositions sequentially and individually by type and not by section
\setcounter{proposition}{0}
\section{Introduction}\label{sec-intro}
Conformal prediction \citep{vovk2005} is a powerful and flexible tool
for uncertainty quantification, distinguished by its simplicity,
generality, and ease of implementation. It constructs valid prediction
intervals that achieve nominal coverage without imposing stringent
assumptions on the data generating distribution, other than requiring
the data to be i.i.d. or, more generally, exchangeable. Its credibility
and potential make it widely applicable for quantifying the uncertainty
of predictions produced by any black-box machine learning model
\citep{shafer2008, papadopoulos2008, barber2021} or non-parametric model
\citep{lei2014}.
Three key classes of conformal prediction methods are widely used for
constructing distribution-free prediction intervals: split conformal
prediction \citep{vovk2005}, full conformal prediction \citep{vovk2005},
and jackknife+ \citep{barber2021}. The split conformal method, which
relies on a holdout set, offers computational efficiency but sacrifices
some statistical efficiency due to data splitting. In contrast, full
conformal prediction avoids data splitting, providing higher accuracy at
the cost of increased computational complexity. Jackknife+ strikes a
balance between these methods, offering a compromise between statistical
precision and computational demands. All three methods guarantee
coverage at the target level under the assumption of data
exchangeability.
Nevertheless, the data exchangeability assumption is often violated in
many applied domains, where challenges such as non-stationarity,
distributional drift, temporal and spatial dependencies are prevalent.
In response, several extensions to conformal prediction have been
proposed to handle non-exchangeable data. Notable examples include
methods for handling covariate shift
\citep{tibshirani2019, lei2021, yang2024}, online distribution shift
\citep{gibbs2021, zaffran2022, bastani2022}, label shift
\citep{podkopaev2021}, time series data
\citep{chernozhukov2018, gibbs2021, xu2021, xu2023, zaffran2022}, and
spatial prediction \citep{mao2024}, and methods based on certain
distributional assumptions of the data rather than exchangeability
\citep{oliveira2024, xu2021, xu2023}. Additionally, some methods propose
weighting nonconformity scores (e.g., prediction errors) differently,
either using non-data-dependent weights \citep{barber2023} or weights
based on observed feature values
\citep{tibshirani2019, guan2023, hore2023}.
Recently, several attempts have been made to enable conformal prediction
on time series data, where exchangeability obviously fails due to
inherent temporal dependencies. One line of research has focused on
developing conformal-type methods that offer coverage guarantees under
certain relaxations of exchangeability. For example, within the full
conformal prediction framework, \citet{chernozhukov2018} and
\citet{yu2022} construct prediction sets for time series by using a
group of permutations that are specifically designed to preserve the
dependence structure in the data, ensuring validity under weak
assumptions on the nonconformity score. In the split conformal
prediction framework, \citet{xu2021} and \citet{xu2023} extend conformal
prediction methods under classical nonparametric assumptions to achieve
asymptotic valid conditional coverage for time series.
\citet{barber2023} use weighted residual distributions to provide
robustness against distribution drift. Additionally,
\citet{oliveira2024} introduce a general framework based on
concentration inequalities and data decoupling properties of the data to
retain asymptotic coverage guarantees across several dependent data
settings.
In a separate strand of research, \citet{gibbs2021} develop adaptive
conformal inference in an online manner to manage temporal distribution
shifts and ensure long-run coverage guarantees. The basic idea is to
adapt the miscoverage rate, \(\alpha\), based on historical miscoverage
frequencies. Follow-up work has refined this idea by introducing
time-dependent step sizes to respond to arbitrary distribution shifts,
as seen in studies by \citet{bastani2022}, \citet{zaffran2022}, and
\citet{gibbs2024}. However, these methods may produce prediction
intervals that are either infinite or null. To address this issue,
recent research has proposed a generalized updating process that tracks
the quantile of the nonconformity score sequence, as discussed by
\citet{bhatnagar2023}, \citet{angelopoulos2024}, and
\citet{angelopoulos2024online}.
Existing conformal prediction methods for time series primarily focus on
single-step forecasting. However, many applications require forecasts
for multiple future time steps, not just one. Related research into
multi-step time series forecasting is limited and does not account for
the temporal dependencies inherent in multi-step forecasts. For example,
\citet{stankeviciute2021} integrate conformal prediction with recurrent
neural networks for multi-step forecasting and then apply Bonferroni
correction to achieve the desired coverage rate. This approach, however,
assumes data independence, which is often unrealistic for time
series data. \citet{yang2024ts} propose Bellman conformal inference, an
extension of adaptive conformal prediction, to control multi-step
miscoverage rates simultaneously at each time point \(t\) by minimizing
a loss function that balances the average interval length across
forecast horizons with miscoverage. While this method considers
multi-step intervals, it does not account for their temporal
dependencies and may be computationally intensive when solving the
associated optimisation problems at each time step. Additionally,
several extensions to multivariate targets have been explored; see,
e.g., \citet{schlembach2022} and \citet{sun2022}.
We employ a unified notation to formalize the mathematical
representation of conformal prediction for time series data. We consider
a general sequential setting in which we observe a time series
\(\{y_t\}_{t \geq 1}\) generated by an unknown data generating process
(DGP), which may depend on its own past, along with other exogenous
predictors, \(\bm{x}_t=(x_{1,t},\ldots,x_{p,t})^{\prime}\), and their
histories. The distribution of
\(\{(\bm{x}_t, y_t)\}_{t \geq 1} \subseteq \mathbb{R}^p \times \mathbb{R}\)
is allowed to vary over time, to accommodate non-stationary processes.
At each time point \(t\), we aim to forecast \(H\) steps into the
future, providing a \emph{prediction set} (which is a prediction
interval in this setting), \(\hat{\mathcal{C}}_{t+h|t}\), for the
realisation \(y_{t+h}\) for each \(h\in[H]\). The \(h\)-step-ahead
forecast uses the previously observed data
\(\{(\bm{x}_i, y_i)\}_{1 \leq i \leq t}\) along with the new information
of the exogenous predictors \(\{\bm{x}_{t+j}\}_{1\leq j\leq h}\). Note
that we can generate ex-ante forecasts by using forecasts of the
predictors based on information available up to and including time
\(t\). Alternatively, ex-post forecasts are generated assuming that
actual values of the predictors from the forecast period are available.
Given a nominal \emph{miscoverage rate} \(\alpha \in (0,1)\) specified
by the user, we expect the output \(\hat{\mathcal{C}}_{t+h|t}\) to be a
\emph{valid} prediction interval so that \(y_{t+h}\) falls within the
prediction interval \(\hat{\mathcal{C}}_{t+h|t}\) at least
\(100(1-\alpha)\%\) of the time.
Our goal is to achieve long-run coverage for multi-step univariate time
series forecasting. All the proposed methods are grounded in the split
conformal prediction framework and an online learning scheme, which are
well-suited to the sequential nature of time series data. First, we
extend several widely used conformal prediction methods that are
originally designed for single-step forecasting to accommodate
multi-step forecasting scenarios. Second, we provide theoretical proofs
demonstrating that the forecast errors of optimal \(h\)-step-ahead
forecasts are serially correlated up to at most lag \((h-1)\), and can be
expressed approximately as a linear combination of the forecast errors at
shorter horizons, under a general non-stationary
autoregressive data generating process. Third, we introduce the
autocorrelated multi-step conformal prediction (AcMCP) method, which
accounts for the autocorrelations of multi-step forecast errors. Our
method is proven to achieve long-run coverage guarantees without making
any assumptions on data distribution shifts. \citet{lopes2024} introduce
ConForME, a method that focuses on generating uniform prediction
intervals across the entire forecast horizon, aiming to ensure the
overall validity of these intervals. In contrast, our method targets
pointwise prediction intervals for each specific forecast horizon,
\(h\in[H]\). We also highlight that for \(t \ll \infty\), increasing the
forecast horizon \(h\) generally leads to greater deviations from the
target coverage, which aligns with our expectations. Finally, we
illustrate the practical utility of these proposed methods through two
simulations and two applications to electricity demand and eating-out
expenditure forecasting.
We developed the \texttt{conformalForecast} package for R, available at
\url{https://github.com/xqnwang/conformalForecast}, to implement the
proposed multi-step conformal prediction methods. All the data and code
to reproduce the experiments are made available at
\url{https://github.com/xqnwang/cpts}.
\section{Online learning with sequential splits}\label{sec-setup}
Let \(z_t = (\bm{x}_t, y_t)\) denote the data point (including the
response \(y_t\) and possibly predictors \(\bm{x}_t\)) at time \(t\).
Suppose that, at each time \(t\), we have a forecasting model
\(\hat{f}_t\) trained using the historical data \(z_{1:t}\). Throughout
the paper, we assume that the predictors are known into the future. In
this way, we perform ex-post forecasting and there is no additional
uncertainty introduced from forecasting the exogenous predictors. Using
the forecasting model \(\hat{f}_t\), we are able to produce \(H\)-step
point forecasts, \(\{\hat{y}_{t+h|t}\}_{h\in[H]}\), using the future
values for the predictors. We define the \emph{nonconformity score} as
the (signed) forecast error \[
s_{t+h|t}=\mathcal{S}(z_{1:t}, y_{t+h}):=y_{t+h}-\hat{f}_t(z_{1:t},\bm{x}_{(t+1):(t+h)})=y_{t+h}-\hat{y}_{t+h|t}.
\] The task is to employ conformal inference to build \(H\)-step
prediction intervals,
\(\{\hat{\mathcal{C}}_{t+h|t}^{\alpha}(z_{1:t},\bm{x}_{(t+1):(t+h)})\}_{h\in[H]}\),
at the target coverage level \(1-\alpha\). For brevity, we will use
\(\hat{\mathcal{C}}_{t+h|t}^{\alpha}\) to denote the \(h\)-step-ahead
\(100(1-\alpha)\%\) prediction interval.
\textbf{Sequential split.} In a time series context, it is inappropriate
to perform \emph{random splitting}, a standard strategy in much of the
conformal prediction literature, due to the temporal dependency present
in the data. Instead, throughout the conformal prediction methods
proposed in this paper, we use a \emph{sequential split} to preserve the
temporal structure. For example, the \(t\) available data points,
\(z_{1:t}\), are sequentially split into two consecutive sets, a
\emph{proper training set}
\(\mathcal{D}_{\text{tr}} = \{1,\ldots,t_r\}\) and a
\emph{calibration set}
\(\mathcal{D}_{\text{cal}} = \{t_r+1,\ldots,t\}\), where
\(t_c=t-t_r \gg H\),
\(\mathcal{D}_{\text{tr}} \cup \mathcal{D}_{\text{cal}} = \{1, \ldots, t\}\),
and
\(\mathcal{D}_{\text{tr}} \cap \mathcal{D}_{\text{cal}} = \varnothing\).
Here, ``proper'' means that the training set is used exclusively for
fitting the model, with no overlap into the calibration set, which is
essential for ensuring the validity of coverage in conformal prediction
\citep{papadopoulos2007}. With sequential splitting, multiple \(H\)-step
forecasts and their respective nonconformity scores can be computed on
the calibration set.
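For concreteness, a minimal R sketch of this split is given below; it
assumes \texttt{y} holds the \(t\) available observations, and the split
point \(t_r\) is an arbitrary illustrative choice rather than a
recommendation.
\begin{verbatim}
# Sequential split of z_{1:t} into a proper training set and a calibration set.
t_full <- length(y)
t_r    <- 500                  # proper training length (illustrative choice)
t_c    <- t_full - t_r         # calibration length, assumed much larger than H
D_tr   <- 1:t_r                # proper training indices {1, ..., t_r}
D_cal  <- (t_r + 1):t_full     # calibration indices {t_r + 1, ..., t}
\end{verbatim}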
\textbf{Online learning.} We will adapt the following generic online
learning framework for all conformal prediction methods to be discussed
in later sections. The entire procedure for the online learning
framework with sequential splits is also illustrated in
\autoref{fig-flowchart}. This framework updates prediction intervals as
new data points arrive, allowing us to assess their long-run coverage
behaviour. It adopts a standard rolling window evaluation strategy,
although expanding windows could easily be used instead.
\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\item
\emph{Initialization}. Train a forecasting model on the initial proper
training set \(z_{(1+t-t_r):t}\), setting \(t=t_r\). Then generate
\(H\)-step-ahead point forecasts \(\{\hat{y}_{t+h|t}\}_{h\in[H]}\) and
compute the corresponding nonconformity scores
\(\{s_{t+h|t}=\mathcal{S}(z_{(1+t-t_r):t}, y_{t+h})\}_{h\in[H]}\)
based on the true values \(H\) steps ahead,
i.e.~\(\{y_{t+h}\}_{h\in[H]}\).
\item
\emph{Recurring procedure}. Roll the training set forward by one data
point by setting \(t \rightarrow t+1\). Then repeat step 1 until the
nonconformity scores on the entire initial calibration set,
\(\{s_{t+h|t}\}_{t_r \leq t \leq t_r+t_c-h}\) for \(h\in[H]\), are
computed.
\item
\emph{Quantile estimation and prediction interval calculation}. Use
nonconformity scores obtained from the calibration set to perform
quantile estimation and compute \(H\)-step-ahead prediction intervals
on the test set.
\item
\emph{Online updating}. Continuously roll the training set and
calibration set forward, one data point at a time, to update the
nonconformity scores for calibration, and then repeat step 3 until
prediction intervals for the entire test set are obtained. That is,
\(\{\hat{\mathcal{C}}_{t+h|t}^{\alpha}\}_{t_r+t_c \leq t \leq T-H}\)
for \(h \in [H]\), where \(T-t_r-t_c\) is the length of the test set
used for testing coverage. Our goal is to achieve long-run coverage in
time.
\end{enumerate}
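To make the loop concrete, the following R sketch implements the rolling
refit and nonconformity score computation underlying steps 1, 2 and 4;
the toy series, window lengths, and the AR(2) base forecaster from the
\texttt{forecast} package (the model class also used in
Section~\ref{experiments}) are illustrative assumptions rather than part
of the framework.
\begin{verbatim}
# Rolling online learning loop: refit, forecast H steps ahead, and record
# nonconformity scores indexed by forecast origin. Illustrative sketch only.
library(forecast)
set.seed(1)
y      <- as.numeric(arima.sim(list(ar = c(0.8, -0.5)), n = 2000))  # toy series
H      <- 3; t_r <- 500; t_c <- 500; alpha <- 0.1
T_len  <- length(y)
fc     <- matrix(NA_real_, T_len, H)  # fc[t, h]     = yhat_{t+h|t}
scores <- matrix(NA_real_, T_len, H)  # scores[t, h] = s_{t+h|t}

for (t in t_r:(T_len - H)) {
  fit         <- Arima(y[(t - t_r + 1):t], order = c(2, 0, 0))  # rolling refit
  fc[t, ]     <- as.numeric(forecast(fit, h = H)$mean)
  scores[t, ] <- y[(t + 1):(t + H)] - fc[t, ]  # used only after y_{t+h} is observed
}
\end{verbatim}
Steps 3 and 4 then reduce to a quantile computation on a trailing block of
these scores; the method-specific versions are sketched in the following
sections.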
\begin{figure}[!hbtp]
\vspace*{-1.5cm}
\hspace*{-2.5cm}
\input figs/annotation.tex
\vspace*{-1.0cm}
\caption{Diagram of the online learning framework with sequential splits. White: unused data; Gray: training data; Black: forecasts in calibration set; Blue: forecasts in test set.}\label{fig-flowchart}
\end{figure}
\section{Extending conformal prediction for multi-step
forecasting}\label{sec-ext}
In this section, we apply the online learning framework outlined in
Section~\ref{sec-setup} to adapt several popular conformal prediction
methods in order to allow for multi-step forecasting problems. In time
series forecasting, when a model is properly specified and well-trained,
the forecasts that minimize the mean squared error (MSE) can be
considered optimal in the sense that they achieve the lowest possible
expected squared deviation between the forecasts and actual values. One
of the key properties of optimal forecast errors, which holds generally
in linear models, is that the variance of the forecast error
\(e_{t+h|t}\) is non-decreasing in \(h\)
\citep{tong1990, diebold1996, patton2007}. Therefore, we need to apply a
separate conformal prediction procedure for each \(h \in [H]\).
\subsection{Online multi-step split conformal
prediction}\label{online-multi-step-split-conformal-prediction}
Split conformal prediction \citep[SCP, also called inductive conformal
prediction,][]{papadopoulos2002, vovk2005, lei2018}, is a holdout method
for building prediction intervals using a pre-trained model in
regression settings. A key advantage of SCP is its ability to guarantee
coverage by assuming data exchangeability. Time series data are
inherently nonexchangeable due to their temporal dependence and
autocorrelation. Therefore, directly applying SCP to time series data
would violate the method's exchangeability assumption and compromise its
coverage guarantee.
Here we introduce online \textbf{multi-step split conformal prediction}
(MSCP) as a generalization of SCP to recursively update all
\(h\)-step-ahead prediction intervals over time. Instead of assuming
exchangeability, MSCP applies conformal inference in an online fashion,
updating prediction intervals as new data points are received.
Specifically, for each \(h \in [H]\), we consider the following simple
online update to construct prediction intervals on the test set:
\begin{equation}\phantomsection\label{eq-mscp}{
\hat{\mathcal{C}}_{t+h|t}^{\alpha} = \Bigg\{y\in\mathbb{R}: s_{t+h|t}^{y} \leq Q_{1-\alpha}\Bigg(\sum_{i=t-t_c+1}^{t}\frac{1}{t_c+1}\cdot\delta_{s_{i|i-h}}+\frac{1}{t_c+1}\cdot\delta_{+\infty}\Bigg)\Bigg\},
}\end{equation} where \(s_{t+h|t}^{y}:=\mathcal{S}(z_{1:t}, y)\) denotes
the \(h\)-step-ahead nonconformity score calculated at time \(t\) using
a hypothesized test observation \(y\), \(Q_\tau(\cdot)\) denotes the
\(\tau\)-quantile of its argument, and \(\delta_a\) denotes the point
mass at \(a\).
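To make Equation \eqref{eq-mscp} concrete, the R sketch below evaluates
the augmented quantile using the score matrix \texttt{scores} and point
forecasts \texttt{fc} built in the illustration of
Section~\ref{sec-setup}; pairing this upper bound with an analogous lower
quantile to obtain a two-sided interval is an implementation choice we
leave open here.
\begin{verbatim}
# (1 - alpha)-quantile of the augmented distribution in Equation (eq-mscp):
# the ceiling((1 - alpha)(t_c + 1))-th smallest calibration score, or +Inf
# when that rank exceeds t_c (the +Inf point mass is reached).
mscp_quantile <- function(cal_scores, level) {
  t_c <- length(cal_scores)
  k   <- ceiling(level * (t_c + 1))
  if (k > t_c) return(Inf)
  sort(cal_scores)[k]
}

# Horizon h at forecast origin t: the calibration scores are s_{i|i-h} for
# i = t - t_c + 1, ..., t, i.e. rows (t - t_c + 1 - h):(t - h) of scores.
h <- 2; t <- 1500
cal   <- scores[(t - t_c + 1 - h):(t - h), h]
q_hat <- mscp_quantile(cal, 1 - alpha)
upper <- fc[t, h] + q_hat   # {y : s <= q_hat} = (-Inf, yhat_{t+h|t} + q_hat]
\end{verbatim}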
\subsection{Online multi-step weighted conformal
prediction}\label{online-multi-step-weighted-conformal-prediction}
\citet{barber2023} propose a nonexchangeable conformal prediction
procedure (NexCP) that generalizes the SCP method to allow for some
sources of nonexchangeability. The core idea is that a higher weight
should be assigned to a data point that is believed to originate from
the same distribution as the test data. Note that NexCP assumes the
weights are fixed and data-independent. When the data are exchangeable,
NexCP offers the same coverage guarantees as SCP, while the coverage gap
is bounded in terms of the total variation distance between the original
and swapped nonconformity score vectors when exchangeability is violated. Thus the
coverage gap may be quite large in time series contexts.
The online \textbf{multi-step weighted conformal prediction} (MWCP)
method we propose here adapts the NexCP method to the online setting for
time series forecasting. MWCP uses a weighted quantile estimate for
constructing prediction intervals, in contrast to the MSCP definition in Equation \eqref{eq-mscp},
where all nonconformity scores for calibration are implicitly assigned
equal weight.
We choose fixed weights \(w_i = b^{t+1-i}\), \(b \in (0, 1)\) and
\(i=t-t_c+1,\ldots,t\), for nonconformity scores on the corresponding
calibration set. In this setting, weights decay exponentially as the
nonconformity scores get older, akin to the rationale behind the simple
exponential smoothing method in time series forecasting
\citep{hyndman2021}. Then for each \(h \in [H]\), MWCP updates the
\(h\)-step-ahead prediction interval: \[
\hat{\mathcal{C}}_{t+h|t}^{\alpha} = \Bigg\{y\in\mathbb{R}: s_{t+h|t}^{y} \leq Q_{1-\alpha}\Bigg(\sum_{i=t-t_c+1}^{t}\tilde{w}_i\cdot\delta_{s_{i|i-h}}+\tilde{w}_{t+1}\cdot\delta_{+\infty}\Bigg)\Bigg\},
\] where \(\tilde{w}_i\) and \(\tilde{w}_{t+1}\) are normalized weights
given by \[
\tilde{w}_i = \frac{w_i}{\sum_{i=t-t_c+1}^{t}w_i+1}, \text{ for } i \in \{t-t_c+1,\ldots,t\} \quad \text{and} \quad \tilde{w}_{t+1} = \frac{1}{\sum_{i=t-t_c+1}^{t}w_i+1}.
\]
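The weighted quantile step can be sketched as follows, again reusing
objects from the earlier illustrations; the value \(b = 0.99\) matches the
experimental setting of Section~\ref{experiments}, and the function simply
evaluates the weighted augmented distribution above.
\begin{verbatim}
# Weighted (1 - alpha)-quantile with decaying weights w_i = b^(t + 1 - i).
# cal_scores must be ordered from oldest to most recent calibration score.
mwcp_quantile <- function(cal_scores, level, b = 0.99) {
  t_c <- length(cal_scores)
  w   <- b^(t_c:1)              # oldest score gets b^t_c, most recent gets b^1
  wt  <- c(w, 1) / (sum(w) + 1) # normalised weights; last entry is for +Inf
  s   <- c(cal_scores, Inf)
  ord <- order(s)
  cdf <- cumsum(wt[ord])        # CDF of the weighted augmented distribution
  s[ord][which(cdf >= level)[1]]
}

q_hat <- mwcp_quantile(scores[(t - t_c + 1 - h):(t - h), h], 1 - alpha)
\end{verbatim}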
\subsection{Multi-step adaptive conformal
prediction}\label{multi-step-adaptive-conformal-prediction}
Next we extend the adaptive conformal prediction (ACP) method proposed
by \citet{gibbs2021} to address multi-step time series forecasting,
introducing the \textbf{multi-step adaptive conformal prediction} (MACP)
method. Assuming that
\(\beta \mapsto \mathbb{P}(y_{t+h} \in \hat{\mathcal{C}}_{t+h|t}^{\beta})\)
is continuous and non-increasing, with
\(\mathbb{P}(y_{t+h} \in \hat{\mathcal{C}}_{t+h|t}^{0}) = 1\) and
\(\mathbb{P}(y_{t+h} \in \hat{\mathcal{C}}_{t+h|t}^{1}) = 0\), an
optimal value \(\alpha_{t+h|t}^{*} \in [0,1]\) exists such that the
realised miscoverage rate of the corresponding prediction interval
closely approximates the nominal miscoverage rate \(\alpha\).
Specifically, for each \(h \in [H]\), we iteratively estimate
\(\alpha_{t+h|t}^{*}\) by updating a parameter \(\alpha_{t+h|t}\)
through a sequential adjustment process
\begin{equation}\phantomsection\label{eq-macp}{
\alpha_{t+h|t} := \alpha_{t+h-1|t-1} + \gamma(\alpha - \mathrm{err}_{t|t-h}).
}\end{equation} Then the \(h\)-step-ahead prediction interval is
computed using Equation \eqref{eq-mscp} by setting
\(\alpha = \alpha_{t+h|t}\). Here, \(\gamma > 0\) denotes a fixed step
size parameter, \(\alpha_{2h|h}\) denotes the initial estimate typically
set to \(\alpha\), and \(\mathrm{err}_{t|t-h}\) denotes the miscoverage
event
\(\mathrm{err}_{t|t-h} = \mathbb{1}\left\{y_t \notin \hat{\mathcal{C}}_{t|t-h}^{\alpha_{t|t-h}}\right\}\).
Equation \eqref{eq-macp} indicates that the correction to the estimation
of \(\alpha_{t+h|t}^{*}\) at time \(t+h\) is determined by the
historical miscoverage frequency up to time \(t\). At each iteration, we
raise the estimate of \(\alpha_{t+h|t}^{*}\) used for quantile
estimation at time \(t+h\) if
\(\hat{\mathcal{C}}_{t|t-h}^{\alpha_{t|t-h}}\) covered \(y_t\), whereas
we lower the estimate if \(\hat{\mathcal{C}}_{t|t-h}^{\alpha_{t|t-h}}\)
did not cover \(y_t\). Thus the miscoverage event has a delayed impact
on the estimation of \(\alpha_{t+h|t}^{*}\) over \(h\) periods,
indicating that the correction of the \(\alpha_{t+h|t}^{*}\) estimate
becomes less prompt with increasing values of \(h\). In particular,
Equation \eqref{eq-macp} reduces to the update for ACP for \(h=1\).
We do not consider the update equation
\(\alpha_{t+1|t-h+1} := \alpha_{t|t-h} + \gamma(\alpha - \mathrm{err}_{t|t-h})\)
in this context, as the available information at time \(t\) is
insufficient to estimate \(\alpha_{t+h|t}^{*}\) required for \(h\)-step
forecasts.
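The recursion can be sketched in R as follows for a single horizon \(h\),
again building on the earlier illustrations; the one-sided coverage check
mirrors the simplified MSCP sketch above, and \(\gamma = 0.005\) is the
experimental setting of Section~\ref{experiments}.
\begin{verbatim}
# MACP for one horizon h: adapt alpha_{t+h|t} via Equation (eq-macp), then plug
# it into the MSCP quantile. q_used[i] stores the quantile behind C_{i|i-h}.
gamma    <- 0.005
alpha_th <- alpha                       # initial estimate alpha_{2h|h}
q_used   <- rep(NA_real_, T_len)
for (t in (t_r + t_c + H):(T_len - H)) {
  if (!is.na(q_used[t])) {              # err_{t|t-h}: did C_{t|t-h} miss y_t?
    err      <- as.numeric(scores[t - h, h] > q_used[t])
    alpha_th <- alpha_th + gamma * (alpha - err)
  }
  lev <- 1 - alpha_th                   # may drift outside (0, 1)
  cal <- scores[(t - t_c + 1 - h):(t - h), h]
  q_used[t + h] <- if (lev <= 0) -Inf else mscp_quantile(cal, lev)  # -Inf: null set
}
\end{verbatim}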
Selecting the parameter \(\gamma\) is pivotal yet challenging.
\citet{gibbs2021} suggest setting \(\gamma\) in proportion to the degree
of variation of the unknown \(\alpha_{t}^{*}\) over time. Several
strategies have been proposed to avoid the necessity of selecting
\(\gamma\). For example, \citet{zaffran2022} use an adaptive aggregation
of multiple ACPs with a set of candidate values for \(\gamma\),
determining weights based on their historical performance.
\citet{bastani2022} propose a multivalid prediction algorithm in which
the prediction set is established by selecting a threshold from a
sequence of candidate thresholds. However, both previous methods fail to
promptly adapt to local changes. To address this limitation,
\citet{gibbs2024} suggest adaptively tuning the step size parameter
\(\gamma\) in an online setting, choosing an ``optimal'' value for
\(\gamma\) from a candidate set of values by assessing their historical
performance.
The theoretical coverage properties of ACP suggest that a larger value
for \(\gamma\) generally results in less deviation from the target
coverage. As there is no restriction on \(\alpha_{t+h|t}\) and it can
drift below \(0\) or above \(1\), a larger \(\gamma\) may lead to
frequent output of null or infinite prediction sets in order to quickly
adapt to the current miscoverage status.
\subsection{Multi-step conformal PID
control}\label{multi-step-conformal-pid-control}
We introduce \textbf{multi-step conformal PID control} method (hereafter
MPID), which extends the PID method \citep{angelopoulos2024}, originally
developed for one-step-ahead forecasting. For each individual forecast
horizon \(h\in[H]\), the iteration of the \(h\)-step-ahead quantile
estimate is given by \begin{equation}\phantomsection\label{eq-mpid}{
q_{t+h|t}=\underbrace{q_{t+h-1|t-1}+\eta (\mathrm{err}_{t|t-h}-\alpha)}_{\mathrm{P}}+\underbrace{r_t\Bigg(\sum_{i=h+1}^t (\mathrm{err}_{i|i-h}-\alpha)\Bigg)}_{\mathrm{I}}+\underbrace{\hat{s}_{t+h|t}}_{\mathrm{D}},
}\end{equation} where \(\eta > 0\) is a constant learning rate, and
\(r_t\) is a saturation function that adheres to the following
conditions\pagebreak[1]\begin{equation}\phantomsection\label{eq-saturation_h}{
x \geq c \cdot g(t-h) \Longrightarrow r_t(x) \geq b, \quad \text {and} \quad x \leq-c \cdot g(t-h) \Longrightarrow r_t(x) \leq -b,
}\end{equation} for constant \(b, c > 0\), and an admissible function
\(g\) that is sublinear, nonnegative, and nondecreasing. With this
updating equation, we can obtain all required \(h\)-step-ahead
prediction intervals using information available at time \(t\). When
\(h=1\), Equation \eqref{eq-mpid} simplifies to the PID update, which
guarantees long-run coverage. More importantly, Equation \eqref{eq-mpid}
represents a specific instance of Equation \eqref{eq-acmcp_1} that we
will introduce later, thereby ensuring long-run coverage for each
individual forecast horizon \(h\) according to
Proposition~\ref{prp-cov_acmcp}.
The P control in MPID shows a delayed correction of the quantile
estimate for a length of \(h\) periods. The underlying intuition is
similar to that of MACP: it increases (or decreases) the
\(h\)-step-ahead quantile estimate if the prediction set at time \(t\)
miscovered (or covered) the corresponding realisation. MACP can be
considered as a special case of the P control, while the P control has
the ability to prevent the generation of null or infinite prediction
sets after a sequence of miscoverage events.
The I control accounts for the cumulative historical coverage errors
associated with \(h\)-step-ahead prediction intervals during the update
process, thereby enhancing the stability of the interval coverage.
The D control involves \(\hat{s}_{t+h|t}\) as the \(h\)-step-ahead
forecast of the nonconformity score (i.e., the forecast error), produced
by any suitable forecasting model (or ``scorecaster'') trained using the
\(h\)-step-ahead nonconformity scores available up to and including time
\(t\). This module provides additional benefits only when the
scorecaster is well-designed and there are predictable patterns in the
nonconformity scores; otherwise, it may increase variability in the
coverage and prediction intervals.
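For a single horizon \(h\), the update in Equation \eqref{eq-mpid} can be
sketched as a small R function; the saturation function \(r_t\) and the
scorecaster forecast \(\hat{s}_{t+h|t}\) are supplied by the user, and
setting \texttt{shat = 0} simply drops the D module.
\begin{verbatim}
# One step of the multi-step conformal PID update (Equation (eq-mpid)).
#   q_prev:     q_{t+h-1|t-1}
#   err_t:      err_{t|t-h}, the miscoverage indicator observed at time t
#   err_cumsum: sum_{i = h+1}^{t} (err_{i|i-h} - alpha)
#   r_t:        saturation function;  shat: scorecaster forecast of s_{t+h|t}
mpid_update <- function(q_prev, err_t, err_cumsum, eta, alpha, r_t, shat = 0) {
  q_prev + eta * (err_t - alpha) +   # P: proportional correction, delayed by h
    r_t(err_cumsum) +                # I: integral of past coverage errors
    shat                             # D: scorecaster term
}

# e.g. mpid_update(1.2, 1, 3.5, eta = 0.05, alpha = 0.1, r_t = function(x) 0.1 * x)
# (a toy linear r_t for illustration; the experiments use a tan saturation)
\end{verbatim}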
\section{Autocorrelated multi-step conformal
prediction}\label{sec-acmcp}
In the PID method proposed by \citet{angelopoulos2024}, a notable
feature is the inclusion of a scorecaster, a model trained on the score
sequence to forecast the future score. The rationale behind it is to
identify any leftover signal in the score distribution not captured by
the base forecasting model. While this is appropriate in the context of
huge data sets and potentially weak learners, it is unlikely to be a
useful strategy for time series models. We expect to use a forecasting
model that leaves only white noise in the residuals (equivalent to the
one-step nonconformity scores). Moreover, the inclusion of a scorecaster
often only introduces variance to the quantile estimate, resulting in
inefficient (wider) prediction intervals.
On the other hand, in any non-trivial context, multi-step forecasts are
correlated with each other. Specifically, the \(h\)-step-ahead forecast
errors \(e_{t+h|t}\) will be correlated with the forecast errors from
the past \(h-1\) steps, as forecast errors accumulate over the forecast
horizon. However, no conformal prediction methods have taken this
potential dependence into account in their methodological construction.
In this section, we will explore the properties of multi-step forecast
errors and propose a novel conformal prediction method that accounts for
their autocorrelations.
\subsection{Properties of multi-step forecast errors}\label{sec-ppt}
We assume that a time series \(\{y_t\}_{t \geq 1}\) is generated by a
general non-stationary autoregressive process given by:
\begin{equation}\phantomsection\label{eq-dgp}{
y_t = f_{t}(y_{(t-d):(t-1)},\bm{x}_{(t-k):t}) + \varepsilon_t,
}\end{equation} where \(f_{t}\) is a nonlinear function in \(d\) lagged
values of \(y_t\), the current value of the exogenous predictors, along
with the preceding \(k\) values, and \(\varepsilon_t\) is a white noise
innovation with mean zero and constant variance. The sequence of model
coefficients that parameterizes the function \(f_t\) is restricted to
ensure that the stochastic process is locally stationary.
\begin{proposition}[Autocorrelations of multi-step optimal forecast
errors]\protect\hypertarget{prp-ar}{}\label{prp-ar}
Let \(\{y_t\}_{t \geq 1}\) be a time series generated by a general
non-stationary autoregressive process as given in Equation
\eqref{eq-dgp}, and assume that any exogenous predictors are known into
the future. The forecast errors for optimal \(h\)-step-ahead forecasts
can be approximately expressed as
\begin{equation}\phantomsection\label{eq-ar}{
e_{t+h|t} = \omega_{t+h} + \phi_1e_{t+h-1|t} + \cdots + \phi_{p}e_{t+h-p|t},
}\end{equation} where \(p=\min\{d, h-1\}\), and \(\omega_{t}\) is white
noise. Therefore, the optimal \(h\)-step-ahead forecast errors are at
most serially correlated to lag \((h-1)\).
\end{proposition}
Proposition~\ref{prp-ar} suggests that the optimal \(h\)-step-ahead
forecast error, \(e_{t+h|t}\), is serially correlated with the forecast
errors from at most the past \(h-1\) steps, i.e.,
\(e_{t+1|t}, \ldots, e_{t+h-1|t}\). However, the autocorrelation among
errors associated with optimal forecasts cannot be used to improve
forecasting performance, as it does not incorporate any new information
available when the forecast was made. If we could forecast the forecast
error, then we could improve the forecast, indicating that the initial
forecast could not have been optimal.
The proof of Proposition~\ref{prp-ar}, provided in
Section~\ref{sec-proof_ar}, suggests that, if \(f_t\) is a linear
autoregressive model, then the coefficients in Equation \eqref{eq-ar}
are the linear coefficients of the optimal forecasting model. However,
when \(f_t\) takes on a more complex nonlinear structure, the
coefficients become complicated functions of observed data and
unobserved model coefficients.
It is well-established in the forecasting literature that, for a
zero-mean covariance-stationary time series, the optimal linear
least-squares forecasts have \(h\)-step-ahead errors that are at most
MA\((h-1)\) process \citep{harvey1997, diebold2024}, a property that can
be derived using Wold's representation theorem. Here, we extend the
investigation to time series generated by a general non-stationary
autoregressive process, and provide the corresponding proposition. The
proof of this proposition is provided in Section~\ref{sec-proof_ma}.
\begin{proposition}[MA\((h-1)\) process for \(h\)-step-ahead optimal
forecast errors]\protect\hypertarget{prp-ma}{}\label{prp-ma}
Let \(\{y_t\}_{t \geq 1}\) be a time series generated by a general
non-stationary autoregressive process as given in Equation
\eqref{eq-dgp}, and assume that any exogenous predictors are known into
the future. Then the forecast errors of optimal \(h\)-step-ahead
forecasts follow an approximate MA(\(h-1\)) process
\begin{equation}\phantomsection\label{eq-ma}{
e_{t+h|t} = \omega_{t+h} + \theta_1\omega_{t+h-1} + \cdots + \theta_{h-1}\omega_{t+1},
}\end{equation} where \(\omega_{t}\) is white noise.
\end{proposition}
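As a simple illustration of Propositions~\ref{prp-ar} and \ref{prp-ma},
consider the stationary AR(1) special case of Equation \eqref{eq-dgp} with
no exogenous predictors, \(y_t = \phi y_{t-1} + \varepsilon_t\) with
\(|\phi| < 1\). The optimal forecasts are \(\hat{y}_{t+1|t} = \phi y_t\)
and \(\hat{y}_{t+2|t} = \phi^2 y_t\), so that
\[
e_{t+1|t} = \varepsilon_{t+1}
\quad\text{and}\quad
e_{t+2|t} = y_{t+2} - \phi^2 y_t = \varepsilon_{t+2} + \phi\,\varepsilon_{t+1} = \omega_{t+2} + \phi\, e_{t+1|t},
\]
with \(\omega_{t+2} = \varepsilon_{t+2}\). The two-step-ahead error is an
MA(1) process in the innovations, as in Proposition~\ref{prp-ma}, and is
serially correlated only with the one-step-ahead error from the same
origin, as in Proposition~\ref{prp-ar} with \(p = \min\{d, h-1\} = 1\).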
\subsection{The AcMCP method}\label{sec-novel}
We can exploit these properties of multi-step forecast errors, leading
to a new method that we call the \textbf{autocorrelated multi-step
conformal prediction} (AcMCP) method. Unlike extensions of existing
conformal prediction methods, which treat multi-step forecasting as
independent events (see Section~\ref{sec-ext}), the AcMCP method
integrates the autocorrelations inherent in multi-step forecast errors,
thereby making the output multi-step prediction intervals more
statistically efficient.
The AcMCP method updates the quantile estimate \(q_t\) in an online
setting to achieve the goal of long-run coverage. Specifically, the
iteration of the \(h\)-step-ahead quantile estimate is given by
\begin{equation}\phantomsection\label{eq-acmcp}{
q_{t+h|t}=q_{t+h-1|t-1}+\eta (\mathrm{err}_{t|t-h}-\alpha)+r_t\Bigg(\sum_{i=h+1}^t (\mathrm{err}_{i|i-h}-\alpha)\Bigg)+\tilde{e}_{t+h|t},
}\end{equation} for \(h\in[H]\). Obviously, the AcMCP method can be
viewed as a further extension of the PID method. Nevertheless, AcMCP
diverges from PID with several innovations and differences.
First, we are no longer confined to predicting just one step ahead.
Instead, we can make multi-step forecasts with accompanying theoretical
coverage guarantees, constructing distribution-free prediction intervals
for steps \(t+1,\ldots,t+H\) based on available information up to time
\(t\).
Additionally, in AcMCP, \(\tilde{e}_{t+h|t}\) is a forecast combination
of two simple models: one being an MA\((h-1)\) model trained on the
\(h\)-step-ahead forecast errors available up to and including time
\(t\) (i.e.~\(e_{1+h|1}, \ldots, e_{t|t-h}\)), and the other a linear
regression model trained by regressing \(e_{t+h|t}\) on forecast errors
from past steps (i.e.~\(e_{t+h-1|t}, \ldots, e_{t+1|t}\)). Thus, we
perform multi-step conformal prediction recursively, contrasting with
the independent approach employed in MPID. Moreover, the inclusion of
\(\tilde{e}_{t+h|t}\) is not intended to forecast the nonconformity
scores (i.e., forecast errors), but rather to incorporate
autocorrelations present in multi-step forecast errors within the
resulting multi-step prediction intervals.
\subsection{Coverage guarantees}\label{coverage-guarantees}
\begin{proposition}[]\protect\hypertarget{prp-cov_rt}{}\label{prp-cov_rt}
Let \(\{s_{t+h|t}\}_{t\in\mathbb{N}}\) be any sequence of numbers in
\([-b, b]\) for any \(h\in[H]\), where \(b>0\) may be infinite.
Assume that \(r_t\) is a saturation function obeying Equation
\eqref{eq-saturation_h}, for an admissible function \(g\). Then the
iteration
\(q_{t+h|t}=r_t\left(\sum_{i=h+1}^t(\mathrm{err}_{i|i-h}-\alpha)\right)\)
satisfies \begin{equation}\phantomsection\label{eq-cov_rt}{
\left|\frac{1}{T-h}\sum_{t=h+1}^{T}(\mathrm{err}_{t|t-h}-\alpha)\right| \leq \frac{c \cdot g(T-h) + h}{T-h},
}\end{equation} for any \(T \geq h+1\), where \(c>0\) is the constant in
Equation \eqref{eq-saturation_h}.
Therefore the prediction intervals obtained by the iteration yield the
correct long-run coverage; i.e.,
\(\lim _{T \rightarrow \infty} \frac{1}{T-h} \sum_{t=h+1}^T \mathrm{err}_{t|t-h} = \alpha\).
\end{proposition}
Proposition~\ref{prp-cov_rt} indicates that, for \(t \ll \infty\),
increasing the forecast horizon \(h\) tends to amplify deviations from
the target coverage, because the bound
\(\left(c \cdot g(T-h) + h\right)/(T-h)\) in Equation \eqref{eq-cov_rt}
generally increases with \(h\): the denominator \(T-h\) shrinks while the
\(h\) term in the numerator grows, given that the admissible function
\(g\) is sublinear, nonnegative, and
nondecreasing. This is consistent with expectations, as extending the
forecast horizon generally increases forecast uncertainty. As
predictions extend further into the future, more factors contribute to
variability and uncertainty. In this case, conformal prediction
intervals may not scale perfectly with the increasing uncertainty,
leading to a larger discrepancy between the desired and actual coverage.
The quantile iteration
\(q_{t+h|t}=q_{t+h-1|t-1}+\eta (\mathrm{err}_{t|t-h}-\alpha)\) can be
seen as a particular instance of the iteration outlined in
Proposition~\ref{prp-cov_rt} if we set \(q_{2h|h}=0\) without losing
generality. Thus, its coverage bounds can be easily derived as a result
of Proposition~\ref{prp-cov_rt}.
\begin{proposition}[]\protect\hypertarget{prp-cov_qt}{}\label{prp-cov_qt}
Let \(\{s_{t+h|t}\}_{t\in\mathbb{N}}\) be any sequence of numbers in
\([-b, b]\) for any \(h\in[H]\), where \(b>0\) may be infinite.
Then the iteration
\(q_{t+h|t}=q_{t+h-1|t-1}+\eta (\mathrm{err}_{t|t-h}-\alpha)\) satisfies
\[
\left|\frac{1}{T-h}\sum_{t=h+1}^{T}(\mathrm{err}_{t|t-h}-\alpha)\right| \leq \frac{b + \eta h}{\eta(T-h)},
\] for any learning rate \(\eta > 0\) and \(T \geq h+1\).
Therefore the prediction intervals obtained by the iteration yield the
correct long-run coverage; i.e.,
\(\lim _{T \rightarrow \infty} \frac{1}{T-h} \sum_{t=h+1}^T \mathrm{err}_{t|t-h} = \alpha\).
\end{proposition}
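For instance, taking illustrative values \(b = 10\), \(\eta = 0.1\),
\(T = 1000\) and \(h = 3\), the bound equals
\((10 + 0.1 \times 3)/(0.1 \times 997) \approx 0.103\), and it shrinks at
rate \(1/(T-h)\) as \(T\) grows.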
More importantly, Proposition~\ref{prp-cov_rt} is also adequate for
establishing the coverage guarantee of the proposed AcMCP method given
by Equation \eqref{eq-acmcp}. Following \citet{angelopoulos2024}, we
first reformulate Equation \eqref{eq-acmcp} as
\begin{equation}\phantomsection\label{eq-acmcp_1}{
q_{t+h|t}=\hat{q}_{t+h|t}+r_t\left(\sum_{i=h+1}^t (\mathrm{err}_{i|i-h}-\alpha)\right),
}\end{equation} where \(\hat{q}_{t+h|t}\) can be any function of the
past observations \(\{(\bm{x}_i, y_i)\}_{1 \leq i \leq t}\) and quantile
estimates \(q_{i+h|i}\) for \(i \leq t-1\). Taking
\(\hat{q}_{t+h|t}=q_{t+h-1|t-1}+\eta (\mathrm{err}_{t|t-h}-\alpha)+\tilde{e}_{t+h|t}\)
will recover Equation \eqref{eq-acmcp}. We can consider
\(\hat{q}_{t+h|t}\) as the forecast of the quantile \(q_{t+h|t}\) based
on available historical data. We then present the coverage guarantee for
AcMCP given by Equation \eqref{eq-acmcp_1}.
\begin{proposition}[]\protect\hypertarget{prp-cov_acmcp}{}\label{prp-cov_acmcp}
Let \(\{\hat{q}_{t+h|t}\}_{t\in\mathbb{N}}\) be any sequence of numbers
in \([-\frac{b}{2}, \frac{b}{2}]\), and
\(\{s_{t+h|t}\}_{t\in\mathbb{N}}\) be any sequence of numbers in
\([-\frac{b}{2},\frac{b}{2}]\), for any \(h\in[H]\), where \(b>0\) may be
infinite. Assume that \(r_t\) is a saturation function obeying Equation
\eqref{eq-saturation_h}, for an admissible function \(g\). Then the
prediction intervals obtained by the AcMCP iteration given by Equation
\eqref{eq-acmcp_1} yield the correct long-run coverage; i.e.,
\(\lim _{T \rightarrow \infty} \frac{1}{T-h} \sum_{t=h+1}^T \mathrm{err}_{t|t-h} = \alpha\).
\end{proposition}
\section{Experiments}\label{experiments}
We examine the empirical performance of the proposed multi-step
conformal prediction methods using two simulated data settings and two
real data examples. Throughout the experiments, we adhere to the
following parameter settings:
\begin{enumerate}
\def\labelenumi{(\alph{enumi})}
\tightlist
\item
we focus on the target coverage level \(1-\alpha=0.9\);
\item
for the MWCP method, we use \(b=0.99\) as per \citet{barber2023};
\item
following \citet{angelopoulos2024}, we use a step size parameter of
\(\gamma=0.005\) for the MACP method, a Theta model as the scorecaster
in the MPID method, and a learning rate of \(\eta=0.01\hat{B}_t\) for
quantile tracking in the MPID and AcMCP methods, where
\(\hat{B}_t=\max\{s_{t-\Delta+1|t-\Delta-h+1},\dots,s_{t|t-h}\}\) is
the highest score over a trailing window, and the window length
\(\Delta\) is set to be the same as the length of the calibration set;
\item
we adopt a nonlinear saturation function defined as
\(r_t(x)=K_{\mathrm{I}} \tan \left(x \log (t) / (t C_{\text {sat }})\right)\),
where \(\tan (x)=\operatorname{sign}(x) \cdot \infty\) for
\(x \notin[-\pi / 2, \pi / 2]\), and the constants
\(C_{\text {sat }}, K_{\mathrm{I}}>0\) are chosen heuristically, as
suggested by \citet{angelopoulos2024} (see the sketch following this list);
\item
we consider a clipped version of MACP, in which infinite intervals are
replaced by intervals based on the largest score seen so far.
\end{enumerate}
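Items (c) and (d) can be sketched in R as follows; the particular values
of \(K_{\mathrm{I}}\) and \(C_{\text{sat}}\) below are placeholders for
the heuristically chosen constants.
\begin{verbatim}
# Saturation function of item (d): r_t(x) = K_I * tan(x log(t) / (t C_sat)),
# with tan(x) := sign(x) * Inf outside (-pi/2, pi/2). The defaults for K_I and
# C_sat are placeholders only.
r_t_tan <- function(x, t, K_I = 1, C_sat = 50) {
  z <- x * log(t) / (t * C_sat)
  if (abs(z) >= pi / 2) return(sign(z) * Inf)
  K_I * tan(z)
}

# Learning rate of item (c): eta = 0.01 * B_hat_t, where B_hat_t is the highest
# h-step score over a trailing window of length Delta (the calibration length).
eta_t <- function(trailing_scores) 0.01 * max(trailing_scores)
\end{verbatim}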
\subsection{Simulated linear autoregressive
process}\label{simulated-linear-autoregressive-process}
We first consider a simulated stationary time series which is generated
from a simple AR\((2)\) process \[
y_t = 0.8y_{t-1} - 0.5y_{t-2} + \varepsilon_t,
\] where \(\varepsilon_t\) is white noise with error variance
\(\sigma^2 = 1\). After an appropriate burn-in period, we generate
\(N=5000\) data points. Under the sequential split and online learning
settings, we create training sets \(\mathcal{D}_{\text{tr}}\) and
calibration sets \(\mathcal{D}_{\text{cal}}\), each with a length of
\(500\). We use AR\((2)\) models to generate \(1\)- to \(3\)-step-ahead
point forecasts (i.e.~\(H=3\)), using the \texttt{Arima()} function from
the \texttt{forecast} R package \citep{hyndman2024}. The goal is to
generate prediction intervals using various proposed conformal
prediction methods and evaluate whether they can achieve the nominal
long-run coverage for each separate forecast horizon.
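The data generation and a single set of base forecasts for this setting
can be reproduced with the short R sketch below, where the seed and
burn-in length are arbitrary choices; feeding the resulting series into
the rolling loop sketched in Section~\ref{sec-setup} yields the score
matrix used by all of the conformal methods.
\begin{verbatim}
# AR(2) simulation setting: y_t = 0.8 y_{t-1} - 0.5 y_{t-2} + eps_t, sigma^2 = 1.
library(forecast)
set.seed(2024)                                        # arbitrary seed
y <- as.numeric(arima.sim(model = list(ar = c(0.8, -0.5)),
                          n = 5000, n.start = 100))   # N = 5000 after burn-in
fit <- Arima(y[1:500], order = c(2, 0, 0))            # AR(2) on one training window
fc  <- forecast(fit, h = 3)                           # 1- to 3-step point forecasts
\end{verbatim}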
\begin{figure}
\centering{
\includegraphics{cpts_files/figure-pdf/fig-AR2_cov-1.pdf}
}
\caption{\label{fig-AR2_cov}AR(2) simulation results showing rolling
coverage, mean and median interval width for each forecast horizon. The