From a549e294e6f880e86e5a7571477d6a62e764f299 Mon Sep 17 00:00:00 2001
From: ferreum <code@ferreum.de>
Date: Sun, 30 Jul 2023 15:28:12 +0200
Subject: [PATCH] af_scaletempo2: fix audio-video de-sync caused by speed
 changes

Fixes #12028, audio-video desynchronization caused by changing speed.

There was an additional issue that audio was always delayed by half the
configured search-interval. Include `ola_hop_size` in the delay
to compensate for that.

Notes:
- Every WSOLA iteration advances the input buffer by _some amount_, and
  produces data in the output buffer always of size `ola_hop_size`.
- `mp_scaletempo2_fill_buffer` is always called with `ola_hop_size`.
- Thus, the rendered frames are always cleared immediately after processing,
  and `num_complete_frames` is 0 in the delay calculation.
- The input buffer expression makes sense as the header comment states,
  "target_block is the 'natural' continuation of the output". The delay
  comes from the length of audio that the filter is holding back.
- The factors contributing to delay are:
  - the pending samples in the input buffer,
  - the pending rendered samples in the output buffer, and
  - an amount of `ola_hop_size`

The frame_delay code looked like that of the rubberband filter, which
might not work for scaletempo2. Sometimes a different amount of input
audio was consumed by scaletempo2 than expected. It may have been caused
by speed changes being a more dynamic process in scaletempo2. This can
be seen by where `playback_rate` is used in `run_one_wsola_iteration`:
`playback_rate` is only referenced after the iteration, when updating
the time and removing old data from buffers.

In scaletempo2, the playback speed is applied by changing the amount by
which the search window is moved. That apparently averages out correctly
at constant playback speed, but when the speed changes, the error in
this assumption probably spikes. This error accumulated across all speed
changes because of the persistent `frame_delay` value.

With the removal of the persistent `frame_delay`, there should be no way
for the audio to drift off. Because the delay is now derived from the
filter's buffer positions, and the buffers are filled only as much as
needed, the delay always stays within buffer bounds.
---
 audio/filter/af_scaletempo2.c | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/audio/filter/af_scaletempo2.c b/audio/filter/af_scaletempo2.c
index 1a822ecd50766..cce33c2898d06 100644
--- a/audio/filter/af_scaletempo2.c
+++ b/audio/filter/af_scaletempo2.c
@@ -15,7 +15,6 @@ struct priv {
     bool sent_final;
     struct mp_aframe *pending;
     bool initialized;
-    double frame_delay;
     float speed;
 };
 
@@ -67,7 +66,6 @@ static void process(struct mp_filter *f)
             uint8_t **planes = mp_aframe_get_data_ro(p->pending);
             int read = mp_scaletempo2_fill_input_buffer(&p->data,
                 planes, frame_size, final);
-            p->frame_delay += read;
             mp_aframe_skip_samples(p->pending, read);
         }
         p->sent_final |= final;
@@ -109,11 +107,11 @@ static void process(struct mp_filter *f)
             (float**)planes, out_samples, p->speed);
 
         double pts = mp_aframe_get_pts(p->pending);
-        p->frame_delay -= out_samples * p->speed;
-
         if (pts != MP_NOPTS_VALUE) {
-            double delay = p->frame_delay / mp_aframe_get_effective_rate(out);
-            mp_aframe_set_pts(out, pts - delay);
+            double frame_delay = p->data.input_buffer_frames - p->data.target_block_index
+                                 + p->data.num_complete_frames * p->speed
+                                 + p->data.ola_hop_size * p->speed;
+            mp_aframe_set_pts(out, pts - frame_delay / mp_aframe_get_effective_rate(out));
         }
 
         mp_aframe_set_size(out, out_samples);
@@ -137,7 +135,6 @@ static bool init_scaletempo2(struct mp_filter *f)
     mp_aframe_reset(p->cur_format);
     p->initialized = true;
     p->sent_final = false;
-    p->frame_delay = 0;
     mp_aframe_config_copy(p->cur_format, p->pending);
 
     mp_scaletempo2_init(&p->data, mp_aframe_get_channels(p->pending),
@@ -163,7 +160,6 @@ static void reset(struct mp_filter *f)
 {
     struct priv *p = f->priv;
     mp_scaletempo2_reset(&p->data);
-    p->frame_delay = 0;
     p->initialized = false;
     TA_FREEP(&p->pending);
 }