From 089bb01f9de4bee3421f21adf6c5a1b2f1715d48 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Fri, 26 Jul 2024 12:47:24 +0300 Subject: [PATCH 1/5] reactor: Export _polls counter (internally) Will be useful for IO stall detector Signed-off-by: Pavel Emelyanov --- include/seastar/core/reactor.hh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/seastar/core/reactor.hh b/include/seastar/core/reactor.hh index e147d89711..44a1c3fa93 100644 --- a/include/seastar/core/reactor.hh +++ b/include/seastar/core/reactor.hh @@ -358,6 +358,8 @@ public: void wakeup(); /// @private bool stopped() const noexcept { return _stopped; } + /// @private + uint64_t polls() const noexcept { return _polls; } private: class signals { From 746b08d50820d208ff694ff505e1e46e03626d25 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Fri, 26 Jul 2024 12:38:43 +0300 Subject: [PATCH 2/5] io_queue: Rename flow ratio timer to be more generic The timer is currently in charge of updating the flow-ratio moving average. It's going to do more "smoothing" updates, so name it accordingly. 
Signed-off-by: Pavel Emelyanov --- include/seastar/core/io_queue.hh | 5 +++-- src/core/io_queue.cc | 6 ++++-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/include/seastar/core/io_queue.hh b/include/seastar/core/io_queue.hh index af6cda1c73..54c2245425 100644 --- a/include/seastar/core/io_queue.hh +++ b/include/seastar/core/io_queue.hh @@ -116,7 +116,8 @@ private: uint64_t _prev_dispatched = 0; uint64_t _prev_completed = 0; double _flow_ratio = 1.0; - timer _flow_ratio_update; + + timer _averaging_decay_timer; void update_flow_ratio() noexcept; @@ -148,7 +149,7 @@ public: bool duplex = false; std::chrono::duration rate_limit_duration = std::chrono::milliseconds(1); size_t block_count_limit_min = 1; - unsigned flow_ratio_ticks = 100; + unsigned averaging_decay_ticks = 100; double flow_ratio_ema_factor = 0.95; double flow_ratio_backpressure_threshold = 1.1; }; diff --git a/src/core/io_queue.cc b/src/core/io_queue.cc index 25f41a1ef5..64d2390561 100644 --- a/src/core/io_queue.cc +++ b/src/core/io_queue.cc @@ -562,7 +562,9 @@ io_queue::io_queue(io_group_ptr group, internal::io_sink& sink) : _priority_classes() , _group(std::move(group)) , _sink(sink) - , _flow_ratio_update([this] { update_flow_ratio(); }) + , _averaging_decay_timer([this] { + update_flow_ratio(); + }) { auto& cfg = get_config(); if (cfg.duplex) { @@ -573,7 +575,7 @@ io_queue::io_queue(io_group_ptr group, internal::io_sink& sink) } else { _streams.emplace_back(_group->_fgs[0], make_fair_queue_config(cfg, "rw")); } - _flow_ratio_update.arm_periodic(std::chrono::duration_cast(_group->io_latency_goal() * cfg.flow_ratio_ticks)); + _averaging_decay_timer.arm_periodic(std::chrono::duration_cast(_group->io_latency_goal() * cfg.averaging_decay_ticks)); namespace sm = seastar::metrics; auto owner_l = sm::shard_label(this_shard_id()); From 5030604dc3f747d1f4ff0f706259eaf8930b7d19 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Fri, 26 Jul 2024 12:49:09 +0300 Subject: [PATCH 3/5] io_queue: 
Keep local variable with request execution delay Signed-off-by: Pavel Emelyanov --- src/core/io_queue.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/core/io_queue.cc b/src/core/io_queue.cc index 64d2390561..b9eddaacc3 100644 --- a/src/core/io_queue.cc +++ b/src/core/io_queue.cc @@ -243,7 +243,8 @@ class io_desc_read_write final : public io_completion { virtual void complete(size_t res) noexcept override { io_log.trace("dev {} : req {} complete", _ioq.dev_id(), fmt::ptr(this)); auto now = io_queue::clock_type::now(); - _pclass.on_complete(std::chrono::duration_cast>(now - _ts)); + auto delay = std::chrono::duration_cast>(now - _ts); + _pclass.on_complete(delay); _ioq.complete_request(*this); _pr.set_value(res); delete this; From a5d6dede0d0d74dd8f93725217f28ddda6408eb1 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Fri, 26 Jul 2024 12:49:30 +0300 Subject: [PATCH 4/5] io_queue: Stall detector Detect requests with execution delay larger than the configured threshold, print a warning into logs that includes the number of queued and executing requests for the queue. Other than that, include the number of polls that happened while the request was in disk. For that, the executing request notices the reactor::_polls count when it gets dispatched. This metric should help tell disk stalls from reactor misbehaviors. If the request wasn't complete for too long due to disk/kernel problems, the polls count difference should correspond to the observed delay: polls times latency goal should be comparable to or greater than the request delay. A too-low polls count would mean that the reactor failed to fetch the completion from the kernel. The threshold is doubled every time a stall is detected to avoid a storm of warnings. The averaging timer (introduced in the previous patch) slowly decreases the threshold back to keep catching "short" stalls. Initial threshold is set to be 10x the io latency goal. 
Signed-off-by: Pavel Emelyanov --- include/seastar/core/io_queue.hh | 7 ++++++- src/core/io_queue.cc | 24 +++++++++++++++++++++--- 2 files changed, 27 insertions(+), 4 deletions(-) diff --git a/include/seastar/core/io_queue.hh b/include/seastar/core/io_queue.hh index 54c2245425..1d35676e0d 100644 --- a/include/seastar/core/io_queue.hh +++ b/include/seastar/core/io_queue.hh @@ -119,7 +119,11 @@ private: timer _averaging_decay_timer; + const std::chrono::milliseconds _stall_threshold_min; + std::chrono::milliseconds _stall_threshold; + void update_flow_ratio() noexcept; + void lower_stall_threshold() noexcept; metrics::metric_groups _metric_groups; public: @@ -152,6 +156,7 @@ public: unsigned averaging_decay_ticks = 100; double flow_ratio_ema_factor = 0.95; double flow_ratio_backpressure_threshold = 1.1; + std::chrono::milliseconds stall_threshold = std::chrono::milliseconds(100); }; io_queue(io_group_ptr group, internal::io_sink& sink); @@ -167,7 +172,7 @@ public: void submit_request(io_desc_read_write* desc, internal::io_request req) noexcept; void cancel_request(queued_io_request& req) noexcept; void complete_cancelled_request(queued_io_request& req) noexcept; - void complete_request(io_desc_read_write& desc) noexcept; + void complete_request(io_desc_read_write& desc, std::chrono::duration delay) noexcept; [[deprecated("I/O queue users should not track individual requests, but resources (weight, size) passing through the queue")]] size_t queued_requests() const { diff --git a/src/core/io_queue.cc b/src/core/io_queue.cc index b9eddaacc3..8047e107eb 100644 --- a/src/core/io_queue.cc +++ b/src/core/io_queue.cc @@ -218,6 +218,7 @@ class io_desc_read_write final : public io_completion { const fair_queue_entry::capacity_t _fq_capacity; promise _pr; iovec_keeper _iovs; + uint64_t _dispatched_polls; public: io_desc_read_write(io_queue& ioq, io_queue::priority_class_data& pc, stream_id stream, io_direction_and_length dnl, fair_queue_entry::capacity_t cap, iovec_keeper 
iovs) @@ -235,7 +236,7 @@ class io_desc_read_write final : public io_completion { virtual void set_exception(std::exception_ptr eptr) noexcept override { io_log.trace("dev {} : req {} error", _ioq.dev_id(), fmt::ptr(this)); _pclass.on_error(); - _ioq.complete_request(*this); + _ioq.complete_request(*this, std::chrono::duration(0.0)); _pr.set_exception(eptr); delete this; } @@ -245,7 +246,7 @@ class io_desc_read_write final : public io_completion { auto now = io_queue::clock_type::now(); auto delay = std::chrono::duration_cast>(now - _ts); _pclass.on_complete(delay); - _ioq.complete_request(*this); + _ioq.complete_request(*this, delay); _pr.set_value(res); delete this; } @@ -261,6 +262,7 @@ class io_desc_read_write final : public io_completion { auto now = io_queue::clock_type::now(); _pclass.on_dispatch(_dnl, std::chrono::duration_cast>(now - _ts)); _ts = now; + _dispatched_polls = engine().polls(); } future get_future() { @@ -269,6 +271,7 @@ class io_desc_read_write final : public io_completion { fair_queue_entry::capacity_t capacity() const noexcept { return _fq_capacity; } stream_id stream() const noexcept { return _stream; } + uint64_t polls() const noexcept { return _dispatched_polls; } }; class queued_io_request : private internal::io_request { @@ -546,11 +549,23 @@ void io_queue::update_flow_ratio() noexcept { } } +void io_queue::lower_stall_threshold() noexcept { + auto new_threshold = _stall_threshold - std::chrono::milliseconds(1); + _stall_threshold = std::max(_stall_threshold_min, new_threshold); +} + void -io_queue::complete_request(io_desc_read_write& desc) noexcept { +io_queue::complete_request(io_desc_read_write& desc, std::chrono::duration delay) noexcept { _requests_executing--; _requests_completed++; _streams[desc.stream()].notify_request_finished(desc.capacity()); + + if (delay > _stall_threshold) { + _stall_threshold *= 2; + io_log.warn("Request took {:.3f}ms ({} polls) to execute, queued {} executing {}", + 
std::chrono::duration_cast>(delay).count(), engine().polls() - desc.polls(), _queued_requests, _requests_executing); } } fair_queue::config io_queue::make_fair_queue_config(const config& iocfg, sstring label) { @@ -565,7 +580,10 @@ io_queue::io_queue(io_group_ptr group, internal::io_sink& sink) , _sink(sink) , _averaging_decay_timer([this] { update_flow_ratio(); + lower_stall_threshold(); }) + , _stall_threshold_min(std::max(get_config().stall_threshold, 1ms)) + , _stall_threshold(_stall_threshold_min) { auto& cfg = get_config(); if (cfg.duplex) { From 7911006e58a0f13f888b58046f2c7d67bab9ef73 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Wed, 7 Aug 2024 10:11:49 +0300 Subject: [PATCH 5/5] reactor: Add --io-completion-notify-ms option This option controls the IO stall detector's minimal threshold. By default it's set to maximum, thus turning the detector off. Signed-off-by: Pavel Emelyanov --- include/seastar/core/reactor_config.hh | 5 +++++ src/core/reactor.cc | 8 ++++++++ 2 files changed, 13 insertions(+) diff --git a/include/seastar/core/reactor_config.hh b/include/seastar/core/reactor_config.hh index 9af4c77bcb..77f174179e 100644 --- a/include/seastar/core/reactor_config.hh +++ b/include/seastar/core/reactor_config.hh @@ -85,6 +85,11 @@ struct reactor_options : public program_options::option_group { /// /// Default: 1.1 program_options::value io_flow_ratio_threshold; /// \brief If an IO request is executed longer than that, this is printed to /// logs with extra debugging /// /// Default: infinite (detection is OFF) program_options::value io_completion_notify_ms; /// \brief Maximum number of task backlog to allow. /// /// When the number of tasks grow above this, we stop polling (e.g. 
I/O) diff --git a/src/core/reactor.cc b/src/core/reactor.cc index d3230af799..d3861c4b09 100644 --- a/src/core/reactor.cc +++ b/src/core/reactor.cc @@ -3813,6 +3813,7 @@ reactor_options::reactor_options(program_options::option_group* parent_group) , task_quota_ms(*this, "task-quota-ms", 0.5, "Max time (ms) between polls") , io_latency_goal_ms(*this, "io-latency-goal-ms", {}, "Max time (ms) io operations must take (1.5 * task-quota-ms if not set)") , io_flow_ratio_threshold(*this, "io-flow-rate-threshold", 1.1, "Dispatch rate to completion rate threshold") + , io_completion_notify_ms(*this, "io-completion-notify-ms", {}, "Threshold in milliseconds over which IO request completion is reported to logs") , max_task_backlog(*this, "max-task-backlog", 1000, "Maximum number of task backlog to allow; above this we ignore I/O") , blocked_reactor_notify_ms(*this, "blocked-reactor-notify-ms", 25, "threshold in miliseconds over which the reactor is considered blocked if no progress is made") , blocked_reactor_reports_per_minute(*this, "blocked-reactor-reports-per-minute", 5, "Maximum number of backtraces reported by stall detector per minute") @@ -4046,6 +4047,7 @@ class disk_config_params { unsigned _num_io_groups = 0; std::unordered_map _mountpoints; std::chrono::duration _latency_goal; + std::chrono::milliseconds _stall_threshold; double _flow_ratio_backpressure_threshold; public: @@ -4063,6 +4065,10 @@ class disk_config_params { return _latency_goal; } + std::chrono::milliseconds stall_threshold() const { + return _stall_threshold; + } + double latency_goal_opt(const reactor_options& opts) const { return opts.io_latency_goal_ms ? 
opts.io_latency_goal_ms.get_value() : @@ -4075,6 +4081,7 @@ class disk_config_params { seastar_logger.debug("latency_goal: {}", latency_goal().count()); _flow_ratio_backpressure_threshold = reactor_opts.io_flow_ratio_threshold.get_value(); seastar_logger.debug("flow-ratio threshold: {}", _flow_ratio_backpressure_threshold); + _stall_threshold = reactor_opts.io_completion_notify_ms.defaulted() ? std::chrono::milliseconds::max() : reactor_opts.io_completion_notify_ms.get_value() * 1ms; if (smp_opts.num_io_groups) { _num_io_groups = smp_opts.num_io_groups.get_value(); @@ -4170,6 +4177,7 @@ class disk_config_params { // scheduler will self-tune to allow for the single 64k request, while it would // be better to sacrifice some IO latency, but allow for larger concurrency cfg.block_count_limit_min = (64 << 10) >> io_queue::block_size_shift; + cfg.stall_threshold = stall_threshold(); return cfg; }