From 089bb01f9de4bee3421f21adf6c5a1b2f1715d48 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Fri, 26 Jul 2024 12:47:24 +0300 Subject: [PATCH 1/5] reactor: Export _polls counter (internally) Will be useful for IO stall detector Signed-off-by: Pavel Emelyanov --- include/seastar/core/reactor.hh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/seastar/core/reactor.hh b/include/seastar/core/reactor.hh index e147d89711..44a1c3fa93 100644 --- a/include/seastar/core/reactor.hh +++ b/include/seastar/core/reactor.hh @@ -358,6 +358,8 @@ public: void wakeup(); /// @private bool stopped() const noexcept { return _stopped; } + /// @private + uint64_t polls() const noexcept { return _polls; } private: class signals { From 746b08d50820d208ff694ff505e1e46e03626d25 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Fri, 26 Jul 2024 12:38:43 +0300 Subject: [PATCH 2/5] io_queue: Rename flow ratio timer to be more generic The timer is currently in charge of updating the flow-ratio moving average. It's going to do more "smoothing" updates, so name it accordingly. 
Signed-off-by: Pavel Emelyanov --- include/seastar/core/io_queue.hh | 5 +++-- src/core/io_queue.cc | 6 ++++-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/include/seastar/core/io_queue.hh b/include/seastar/core/io_queue.hh index af6cda1c73..54c2245425 100644 --- a/include/seastar/core/io_queue.hh +++ b/include/seastar/core/io_queue.hh @@ -116,7 +116,8 @@ private: uint64_t _prev_dispatched = 0; uint64_t _prev_completed = 0; double _flow_ratio = 1.0; - timer _flow_ratio_update; + + timer _averaging_decay_timer; void update_flow_ratio() noexcept; @@ -148,7 +149,7 @@ public: bool duplex = false; std::chrono::duration rate_limit_duration = std::chrono::milliseconds(1); size_t block_count_limit_min = 1; - unsigned flow_ratio_ticks = 100; + unsigned averaging_decay_ticks = 100; double flow_ratio_ema_factor = 0.95; double flow_ratio_backpressure_threshold = 1.1; }; diff --git a/src/core/io_queue.cc b/src/core/io_queue.cc index 25f41a1ef5..64d2390561 100644 --- a/src/core/io_queue.cc +++ b/src/core/io_queue.cc @@ -562,7 +562,9 @@ io_queue::io_queue(io_group_ptr group, internal::io_sink& sink) : _priority_classes() , _group(std::move(group)) , _sink(sink) - , _flow_ratio_update([this] { update_flow_ratio(); }) + , _averaging_decay_timer([this] { + update_flow_ratio(); + }) { auto& cfg = get_config(); if (cfg.duplex) { @@ -573,7 +575,7 @@ io_queue::io_queue(io_group_ptr group, internal::io_sink& sink) } else { _streams.emplace_back(_group->_fgs[0], make_fair_queue_config(cfg, "rw")); } - _flow_ratio_update.arm_periodic(std::chrono::duration_cast(_group->io_latency_goal() * cfg.flow_ratio_ticks)); + _averaging_decay_timer.arm_periodic(std::chrono::duration_cast(_group->io_latency_goal() * cfg.averaging_decay_ticks)); namespace sm = seastar::metrics; auto owner_l = sm::shard_label(this_shard_id()); From 5030604dc3f747d1f4ff0f706259eaf8930b7d19 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Fri, 26 Jul 2024 12:49:09 +0300 Subject: [PATCH 3/5] io_queue: 
Keep local variable with request execution delay Signed-off-by: Pavel Emelyanov --- src/core/io_queue.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/core/io_queue.cc b/src/core/io_queue.cc index 64d2390561..b9eddaacc3 100644 --- a/src/core/io_queue.cc +++ b/src/core/io_queue.cc @@ -243,7 +243,8 @@ class io_desc_read_write final : public io_completion { virtual void complete(size_t res) noexcept override { io_log.trace("dev {} : req {} complete", _ioq.dev_id(), fmt::ptr(this)); auto now = io_queue::clock_type::now(); - _pclass.on_complete(std::chrono::duration_cast>(now - _ts)); + auto delay = std::chrono::duration_cast>(now - _ts); + _pclass.on_complete(delay); _ioq.complete_request(*this); _pr.set_value(res); delete this; From a5d6dede0d0d74dd8f93725217f28ddda6408eb1 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Fri, 26 Jul 2024 12:49:30 +0300 Subject: [PATCH 4/5] io_queue: Stall detector Detect requests with execution delay larger than the configured threshold, print a warning into logs that includes the number of queued and executing requests for the queue. Other than that, include the number of polls that happened while the request was in disk. For that, the executing request notices the reactor::_polls count when it gets dispatched. This metric should help tell disk stalls from reactor misbehaviors. If the request wasn't complete for too long due to disk/kernel problems, the polls count difference should correspond to the observed delay: polls times latency goal should be comparable to or greater than the request delay. A too-low polls count would mean that the reactor failed to fetch the completion from the kernel. The threshold is doubled every time a stall is detected to avoid a storm of warnings. The averaging timer (introduced in the previous patch) slowly decreases the threshold back to keep catching "short" stalls. Initial threshold is set to be 10x the io latency goal. 
Signed-off-by: Pavel Emelyanov --- include/seastar/core/io_queue.hh | 7 ++++++- src/core/io_queue.cc | 24 +++++++++++++++++++++--- 2 files changed, 27 insertions(+), 4 deletions(-) diff --git a/include/seastar/core/io_queue.hh b/include/seastar/core/io_queue.hh index 54c2245425..1d35676e0d 100644 --- a/include/seastar/core/io_queue.hh +++ b/include/seastar/core/io_queue.hh @@ -119,7 +119,11 @@ private: timer _averaging_decay_timer; + const std::chrono::milliseconds _stall_threshold_min; + std::chrono::milliseconds _stall_threshold; + void update_flow_ratio() noexcept; + void lower_stall_threshold() noexcept; metrics::metric_groups _metric_groups; public: @@ -152,6 +156,7 @@ public: unsigned averaging_decay_ticks = 100; double flow_ratio_ema_factor = 0.95; double flow_ratio_backpressure_threshold = 1.1; + std::chrono::milliseconds stall_threshold = std::chrono::milliseconds(100); }; io_queue(io_group_ptr group, internal::io_sink& sink); @@ -167,7 +172,7 @@ public: void submit_request(io_desc_read_write* desc, internal::io_request req) noexcept; void cancel_request(queued_io_request& req) noexcept; void complete_cancelled_request(queued_io_request& req) noexcept; - void complete_request(io_desc_read_write& desc) noexcept; + void complete_request(io_desc_read_write& desc, std::chrono::duration delay) noexcept; [[deprecated("I/O queue users should not track individual requests, but resources (weight, size) passing through the queue")]] size_t queued_requests() const { diff --git a/src/core/io_queue.cc b/src/core/io_queue.cc index b9eddaacc3..8047e107eb 100644 --- a/src/core/io_queue.cc +++ b/src/core/io_queue.cc @@ -218,6 +218,7 @@ class io_desc_read_write final : public io_completion { const fair_queue_entry::capacity_t _fq_capacity; promise _pr; iovec_keeper _iovs; + uint64_t _dispatched_polls; public: io_desc_read_write(io_queue& ioq, io_queue::priority_class_data& pc, stream_id stream, io_direction_and_length dnl, fair_queue_entry::capacity_t cap, iovec_keeper 
iovs) @@ -235,7 +236,7 @@ class io_desc_read_write final : public io_completion { virtual void set_exception(std::exception_ptr eptr) noexcept override { io_log.trace("dev {} : req {} error", _ioq.dev_id(), fmt::ptr(this)); _pclass.on_error(); - _ioq.complete_request(*this); + _ioq.complete_request(*this, std::chrono::duration(0.0)); _pr.set_exception(eptr); delete this; } @@ -245,7 +246,7 @@ class io_desc_read_write final : public io_completion { auto now = io_queue::clock_type::now(); auto delay = std::chrono::duration_cast>(now - _ts); _pclass.on_complete(delay); - _ioq.complete_request(*this); + _ioq.complete_request(*this, delay); _pr.set_value(res); delete this; } @@ -261,6 +262,7 @@ class io_desc_read_write final : public io_completion { auto now = io_queue::clock_type::now(); _pclass.on_dispatch(_dnl, std::chrono::duration_cast>(now - _ts)); _ts = now; + _dispatched_polls = engine().polls(); } future get_future() { @@ -269,6 +271,7 @@ class io_desc_read_write final : public io_completion { fair_queue_entry::capacity_t capacity() const noexcept { return _fq_capacity; } stream_id stream() const noexcept { return _stream; } + uint64_t polls() const noexcept { return _dispatched_polls; } }; class queued_io_request : private internal::io_request { @@ -546,11 +549,23 @@ void io_queue::update_flow_ratio() noexcept { } } +void io_queue::lower_stall_threshold() noexcept { + auto new_threshold = _stall_threshold - std::chrono::milliseconds(1); + _stall_threshold = std::max(_stall_threshold_min, new_threshold); +} + void -io_queue::complete_request(io_desc_read_write& desc) noexcept { +io_queue::complete_request(io_desc_read_write& desc, std::chrono::duration delay) noexcept { _requests_executing--; _requests_completed++; _streams[desc.stream()].notify_request_finished(desc.capacity()); + + if (delay > _stall_threshold) { + _stall_threshold *= 2; + io_log.warn("Request took {:.3f}ms ({} polls) to execute, queued {} executing {}", + 
std::chrono::duration_cast>(delay).count(), engine().polls() - desc.polls(), _queued_requests, _requests_executing); } } fair_queue::config io_queue::make_fair_queue_config(const config& iocfg, sstring label) { @@ -565,7 +580,10 @@ io_queue::io_queue(io_group_ptr group, internal::io_sink& sink) , _sink(sink) , _averaging_decay_timer([this] { update_flow_ratio(); + lower_stall_threshold(); }) + , _stall_threshold_min(std::max(get_config().stall_threshold, 1ms)) + , _stall_threshold(_stall_threshold_min) { auto& cfg = get_config(); if (cfg.duplex) { From 7911006e58a0f13f888b58046f2c7d67bab9ef73 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Wed, 7 Aug 2024 10:11:49 +0300 Subject: [PATCH 5/5] reactor: Add --io-completion-notify-ms option This option controls the IO stall detector's minimal threshold. By default it's set to maximum, thus turning the detector off. Signed-off-by: Pavel Emelyanov --- include/seastar/core/reactor_config.hh | 5 +++++ src/core/reactor.cc | 8 ++++++++ 2 files changed, 13 insertions(+) diff --git a/include/seastar/core/reactor_config.hh b/include/seastar/core/reactor_config.hh index 9af4c77bcb..77f174179e 100644 --- a/include/seastar/core/reactor_config.hh +++ b/include/seastar/core/reactor_config.hh @@ -85,6 +85,11 @@ struct reactor_options : public program_options::option_group { /// /// Default: 1.1 program_options::value io_flow_ratio_threshold; /// \brief If an IO request is executed longer than that, this is printed to /// logs with extra debugging /// /// Default: infinite (detection is OFF) program_options::value io_completion_notify_ms; /// \brief Maximum number of task backlog to allow. /// /// When the number of tasks grow above this, we stop polling (e.g. 
I/O) diff --git a/src/core/reactor.cc b/src/core/reactor.cc index d3230af799..d3861c4b09 100644 --- a/src/core/reactor.cc +++ b/src/core/reactor.cc @@ -3813,6 +3813,7 @@ reactor_options::reactor_options(program_options::option_group* parent_group) , task_quota_ms(*this, "task-quota-ms", 0.5, "Max time (ms) between polls") , io_latency_goal_ms(*this, "io-latency-goal-ms", {}, "Max time (ms) io operations must take (1.5 * task-quota-ms if not set)") , io_flow_ratio_threshold(*this, "io-flow-rate-threshold", 1.1, "Dispatch rate to completion rate threshold") + , io_completion_notify_ms(*this, "io-completion-notify-ms", {}, "Threshold in milliseconds over which IO request completion is reported to logs") , max_task_backlog(*this, "max-task-backlog", 1000, "Maximum number of task backlog to allow; above this we ignore I/O") , blocked_reactor_notify_ms(*this, "blocked-reactor-notify-ms", 25, "threshold in miliseconds over which the reactor is considered blocked if no progress is made") , blocked_reactor_reports_per_minute(*this, "blocked-reactor-reports-per-minute", 5, "Maximum number of backtraces reported by stall detector per minute") @@ -4046,6 +4047,7 @@ class disk_config_params { unsigned _num_io_groups = 0; std::unordered_map _mountpoints; std::chrono::duration _latency_goal; + std::chrono::milliseconds _stall_threshold; double _flow_ratio_backpressure_threshold; public: @@ -4063,6 +4065,10 @@ class disk_config_params { return _latency_goal; } + std::chrono::milliseconds stall_threshold() const { + return _stall_threshold; + } + double latency_goal_opt(const reactor_options& opts) const { return opts.io_latency_goal_ms ? 
opts.io_latency_goal_ms.get_value() : @@ -4075,6 +4081,7 @@ class disk_config_params { seastar_logger.debug("latency_goal: {}", latency_goal().count()); _flow_ratio_backpressure_threshold = reactor_opts.io_flow_ratio_threshold.get_value(); seastar_logger.debug("flow-ratio threshold: {}", _flow_ratio_backpressure_threshold); + _stall_threshold = reactor_opts.io_completion_notify_ms.defaulted() ? std::chrono::milliseconds::max() : reactor_opts.io_completion_notify_ms.get_value() * 1ms; if (smp_opts.num_io_groups) { _num_io_groups = smp_opts.num_io_groups.get_value(); @@ -4170,6 +4177,7 @@ class disk_config_params { // scheduler will self-tune to allow for the single 64k request, while it would // be better to sacrifice some IO latency, but allow for larger concurrency cfg.block_count_limit_min = (64 << 10) >> io_queue::block_size_shift; + cfg.stall_threshold = stall_threshold(); return cfg; }