diff --git a/api/envoy/config/cluster/v3/cluster.proto b/api/envoy/config/cluster/v3/cluster.proto index ef6bbe121b4c..0e034b3dde44 100644 --- a/api/envoy/config/cluster/v3/cluster.proto +++ b/api/envoy/config/cluster/v3/cluster.proto @@ -45,7 +45,7 @@ message ClusterCollection { } // Configuration for a single upstream cluster. -// [#next-free-field: 58] +// [#next-free-field: 59] message Cluster { option (udpa.annotations.versioning).previous_message_type = "envoy.api.v2.Cluster"; @@ -956,6 +956,17 @@ message Cluster { google.protobuf.Duration dns_refresh_rate = 16 [(validate.rules).duration = {gt {nanos: 1000000}}]; + // DNS jitter can be optionally specified if the cluster type is either + // :ref:`STRICT_DNS<envoy_v3_api_enum_value_config.cluster.v3.Cluster.DiscoveryType.STRICT_DNS>`, + // or :ref:`LOGICAL_DNS<envoy_v3_api_enum_value_config.cluster.v3.Cluster.DiscoveryType.LOGICAL_DNS>`. + // DNS jitter causes the cluster to refresh DNS entries later by a random amount of time to avoid a + // stampede of DNS requests. This value sets the upper bound (exclusive) for the random amount. + // There will be no jitter if this value is omitted. For cluster types other than + // :ref:`STRICT_DNS<envoy_v3_api_enum_value_config.cluster.v3.Cluster.DiscoveryType.STRICT_DNS>` + // and :ref:`LOGICAL_DNS<envoy_v3_api_enum_value_config.cluster.v3.Cluster.DiscoveryType.LOGICAL_DNS>` + // this setting is ignored. + google.protobuf.Duration dns_jitter = 58; + // If the DNS failure refresh rate is specified and the cluster type is either // :ref:`STRICT_DNS<envoy_v3_api_enum_value_config.cluster.v3.Cluster.DiscoveryType.STRICT_DNS>`, // or :ref:`LOGICAL_DNS<envoy_v3_api_enum_value_config.cluster.v3.Cluster.DiscoveryType.LOGICAL_DNS>`, diff --git a/changelogs/current.yaml b/changelogs/current.yaml index 571570861bdc..33d669ab5915 100644 --- a/changelogs/current.yaml +++ b/changelogs/current.yaml @@ -159,6 +159,13 @@ removed_config_or_runtime: Removed ``envoy.reloadable_features.normalize_host_for_preresolve_dfp_dns`` runtime flag and legacy code paths. new_features: +- area: dns + change: | + For the :ref:`strict DNS <arch_overview_service_discovery_types_strict_dns>` and :ref:`logical DNS + <arch_overview_service_discovery_types_logical_dns>` cluster types, + the new :ref:`dns_jitter <envoy_v3_api_field_config.cluster.v3.Cluster.dns_jitter>` field, if + provided, will cause the cluster to refresh DNS entries later by a random amount of time so as to + avoid stampedes of DNS requests. This field sets the upper bound (exclusive) for the random amount. 
- area: redis change: | Added support for publish. diff --git a/source/extensions/clusters/logical_dns/logical_dns_cluster.cc b/source/extensions/clusters/logical_dns/logical_dns_cluster.cc index 927a49804c1f..a29ac2194940 100644 --- a/source/extensions/clusters/logical_dns/logical_dns_cluster.cc +++ b/source/extensions/clusters/logical_dns/logical_dns_cluster.cc @@ -51,6 +51,7 @@ LogicalDnsCluster::LogicalDnsCluster(const envoy::config::cluster::v3::Cluster& : ClusterImplBase(cluster, context, creation_status), dns_resolver_(dns_resolver), dns_refresh_rate_ms_( std::chrono::milliseconds(PROTOBUF_GET_MS_OR_DEFAULT(cluster, dns_refresh_rate, 5000))), + dns_jitter_ms_(std::chrono::milliseconds(PROTOBUF_GET_MS_OR_DEFAULT(cluster, dns_jitter, 0))), respect_dns_ttl_(cluster.respect_dns_ttl()), resolve_timer_(context.serverFactoryContext().mainThreadDispatcher().createTimer( [this]() -> void { startResolve(); })), @@ -149,6 +150,11 @@ void LogicalDnsCluster::startResolve() { if (respect_dns_ttl_ && addrinfo.ttl_ != std::chrono::seconds(0)) { final_refresh_rate = addrinfo.ttl_; } + if (dns_jitter_ms_.count() > 0) { + // Reduce modulo the bound before building the duration; a raw 64-bit random value may + // turn negative in the signed rep and would shorten the refresh interval. + final_refresh_rate += std::chrono::milliseconds(random_.random() % dns_jitter_ms_.count()); + } ENVOY_LOG(debug, "DNS refresh rate reset for {}, refresh rate {} ms", dns_address_, final_refresh_rate.count()); } else { diff --git a/source/extensions/clusters/logical_dns/logical_dns_cluster.h b/source/extensions/clusters/logical_dns/logical_dns_cluster.h index b94d551b240b..8522af08edd8 100644 --- a/source/extensions/clusters/logical_dns/logical_dns_cluster.h +++ b/source/extensions/clusters/logical_dns/logical_dns_cluster.h @@ -69,6 +69,7 @@ class LogicalDnsCluster : public ClusterImplBase { Network::DnsResolverSharedPtr dns_resolver_; const std::chrono::milliseconds dns_refresh_rate_ms_; + const std::chrono::milliseconds dns_jitter_ms_; BackOffStrategyPtr failure_backoff_strategy_; const bool respect_dns_ttl_; Network::DnsLookupFamily dns_lookup_family_; diff --git 
a/source/extensions/clusters/strict_dns/strict_dns_cluster.cc b/source/extensions/clusters/strict_dns/strict_dns_cluster.cc index a052fd9cb3df..d2b1794ec57c 100644 --- a/source/extensions/clusters/strict_dns/strict_dns_cluster.cc +++ b/source/extensions/clusters/strict_dns/strict_dns_cluster.cc @@ -1,5 +1,7 @@ #include "source/extensions/clusters/strict_dns/strict_dns_cluster.h" +#include <chrono> + #include "envoy/common/exception.h" #include "envoy/config/cluster/v3/cluster.pb.h" #include "envoy/config/endpoint/v3/endpoint.pb.h" @@ -29,6 +31,7 @@ StrictDnsClusterImpl::StrictDnsClusterImpl(const envoy::config::cluster::v3::Clu local_info_(context.serverFactoryContext().localInfo()), dns_resolver_(dns_resolver), dns_refresh_rate_ms_( std::chrono::milliseconds(PROTOBUF_GET_MS_OR_DEFAULT(cluster, dns_refresh_rate, 5000))), + dns_jitter_ms_(PROTOBUF_GET_MS_OR_DEFAULT(cluster, dns_jitter, 0)), respect_dns_ttl_(cluster.respect_dns_ttl()) { failure_backoff_strategy_ = Config::Utility::prepareDnsRefreshStrategy( @@ -189,6 +192,13 @@ void StrictDnsClusterImpl::ResolveTarget::startResolve() { ASSERT(ttl_refresh_rate != std::chrono::seconds::max() && final_refresh_rate.count() > 0); } + if (parent_.dns_jitter_ms_.count() > 0) { + // Reduce modulo the bound before building the duration; a raw 64-bit random value may + // turn negative in the signed rep and would shorten the refresh interval. + final_refresh_rate += std::chrono::milliseconds(parent_.random_.random() % + parent_.dns_jitter_ms_.count()); + } + ENVOY_LOG(debug, "DNS refresh rate reset for {}, refresh rate {} ms", dns_address_, final_refresh_rate.count()); } else { diff --git a/source/extensions/clusters/strict_dns/strict_dns_cluster.h b/source/extensions/clusters/strict_dns/strict_dns_cluster.h index 6d8cd9f3923d..72b4070e5a85 100644 --- a/source/extensions/clusters/strict_dns/strict_dns_cluster.h +++ b/source/extensions/clusters/strict_dns/strict_dns_cluster.h @@ -69,6 +69,7 @@ class StrictDnsClusterImpl : public BaseDynamicClusterImpl { Network::DnsResolverSharedPtr dns_resolver_; std::list resolve_targets_; const std::chrono::milliseconds dns_refresh_rate_ms_; + const 
std::chrono::milliseconds dns_jitter_ms_; BackOffStrategyPtr failure_backoff_strategy_; const bool respect_dns_ttl_; Network::DnsLookupFamily dns_lookup_family_; diff --git a/test/common/upstream/upstream_impl_test.cc b/test/common/upstream/upstream_impl_test.cc index b0d5217b2bb2..b7c23e96f2ae 100644 --- a/test/common/upstream/upstream_impl_test.cc +++ b/test/common/upstream/upstream_impl_test.cc @@ -1432,7 +1432,7 @@ TEST_F(StrictDnsClusterImplTest, FailureRefreshRateBackoffResetsWhenSuccessHappe TestUtility::makeDnsResponse({})); } -TEST_F(StrictDnsClusterImplTest, TtlAsDnsRefreshRate) { +TEST_F(StrictDnsClusterImplTest, TtlAsDnsRefreshRateNoJitter) { ResolverData resolver(*dns_resolver_, server_context_.dispatcher_); const std::string yaml = R"EOF( @@ -1488,6 +1488,49 @@ TEST_F(StrictDnsClusterImplTest, TtlAsDnsRefreshRate) { TestUtility::makeDnsResponse({}, std::chrono::seconds(5))); } +TEST_F(StrictDnsClusterImplTest, TtlAsDnsRefreshRateYesJitter) { + ResolverData resolver(*dns_resolver_, server_context_.dispatcher_); + + const std::string yaml = R"EOF( + name: name + connect_timeout: 0.25s + type: STRICT_DNS + lb_policy: ROUND_ROBIN + dns_refresh_rate: 4s + dns_jitter: 1s + respect_dns_ttl: true + load_assignment: + endpoints: + - lb_endpoints: + - endpoint: + address: + socket_address: + address: localhost1 + port_value: 11001 + )EOF"; + + envoy::config::cluster::v3::Cluster cluster_config = parseClusterFromV3Yaml(yaml); + + Envoy::Upstream::ClusterFactoryContextImpl factory_context( + server_context_, server_context_.cluster_manager_, nullptr, ssl_context_manager_, nullptr, + false); + + auto cluster = *StrictDnsClusterImpl::create(cluster_config, factory_context, dns_resolver_); + + cluster->initialize([] {}); + + uint64_t random_return = 8500; + uint64_t jitter_ms = random_return % 1000; + uint64_t ttl_s = 6; + + EXPECT_CALL(*resolver.timer_, + enableTimer(std::chrono::milliseconds(ttl_s * 1000 + jitter_ms), _)); + ON_CALL(random_, 
random()).WillByDefault(Return(random_return)); + resolver.dns_callback_( + Network::DnsResolver::ResolutionStatus::Success, "", + TestUtility::makeDnsResponse({"192.168.1.1", "192.168.1.2"}, std::chrono::seconds(ttl_s))); +} + // Ensures that HTTP/2 user defined SETTINGS parameter validation is enforced on clusters. TEST_F(StrictDnsClusterImplTest, Http2UserDefinedSettingsParametersValidation) { const std::string yaml = R"EOF( diff --git a/test/extensions/clusters/logical_dns/BUILD b/test/extensions/clusters/logical_dns/BUILD index c5d3e64052c6..a2d92aa47e8f 100644 --- a/test/extensions/clusters/logical_dns/BUILD +++ b/test/extensions/clusters/logical_dns/BUILD @@ -12,6 +12,7 @@ envoy_cc_test( name = "logical_dns_cluster_test", srcs = ["logical_dns_cluster_test.cc"], deps = [ + "//source/common/common:random_generator_lib", "//source/common/event:dispatcher_lib", "//source/common/network:utility_lib", "//source/common/upstream:upstream_lib", diff --git a/test/extensions/clusters/logical_dns/logical_dns_cluster_test.cc b/test/extensions/clusters/logical_dns/logical_dns_cluster_test.cc index 65de11b49f41..b781b28e2697 100644 --- a/test/extensions/clusters/logical_dns/logical_dns_cluster_test.cc +++ b/test/extensions/clusters/logical_dns/logical_dns_cluster_test.cc @@ -607,6 +607,46 @@ TEST_F(LogicalDnsClusterTest, DontWaitForDNSOnInit) { TestUtility::makeDnsResponse({"127.0.0.1", "127.0.0.2"})); } +TEST_F(LogicalDnsClusterTest, DNSRefreshHasJitter) { + const std::string config = R"EOF( + name: name + type: LOGICAL_DNS + dns_refresh_rate: 4s + dns_jitter: + seconds: 0 + nanos: 512000000 + connect_timeout: 0.25s + lb_policy: ROUND_ROBIN + # Since the following expectResolve() requires Network::DnsLookupFamily::V4Only we need to set + # dns_lookup_family to V4_ONLY explicitly for v2 .yaml config. 
+ dns_lookup_family: V4_ONLY + wait_for_warm_on_init: false + load_assignment: + endpoints: + - lb_endpoints: + - endpoint: + address: + socket_address: + address: foo.bar.com + port_value: 443 + )EOF"; + + uint64_t random_return = 8000; + uint64_t jitter_ms = random_return % 512; // 512 ms is the dns_jitter set in the YAML above + + EXPECT_CALL(initialized_, ready()); + expectResolve(Network::DnsLookupFamily::V4Only, "foo.bar.com"); + setupFromV3Yaml(config); + + EXPECT_CALL(membership_updated_, ready()); + EXPECT_CALL(*resolve_timer_, enableTimer(std::chrono::milliseconds(4000 + jitter_ms), _)); + ON_CALL(random_, random()).WillByDefault(Return(random_return)); + + dns_callback_( + Network::DnsResolver::ResolutionStatus::Success, "", + TestUtility::makeDnsResponse({"127.0.0.1", "127.0.0.2"}, std::chrono::seconds(3000))); +} + } // namespace } // namespace Upstream } // namespace Envoy