Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add jitter to strict and logical dns clusters #35745

Merged
merged 14 commits into from
Aug 25, 2024
19 changes: 11 additions & 8 deletions api/envoy/config/cluster/v3/cluster.proto
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ message ClusterCollection {
}

// Configuration for a single upstream cluster.
// [#next-free-field: 58]
// [#next-free-field: 59]
message Cluster {
option (udpa.annotations.versioning).previous_message_type = "envoy.api.v2.Cluster";

Expand Down Expand Up @@ -944,17 +944,20 @@ message Cluster {
// [#next-major-version: make this a list of typed extensions.]
map<string, google.protobuf.Any> typed_extension_protocol_options = 36;

// If the DNS refresh rate is specified and the cluster type is either
google.protobuf.Duration dns_refresh_rate = 16
Stevenjin8 marked this conversation as resolved.
Show resolved Hide resolved
[(validate.rules).duration = {gt {nanos: 1000000}}];

// If the DNS jitter is specified the cluster type is either
Stevenjin8 marked this conversation as resolved.
Show resolved Hide resolved
// :ref:`STRICT_DNS<envoy_v3_api_enum_value_config.cluster.v3.Cluster.DiscoveryType.STRICT_DNS>`,
// or :ref:`LOGICAL_DNS<envoy_v3_api_enum_value_config.cluster.v3.Cluster.DiscoveryType.LOGICAL_DNS>`,
// this value is used as the cluster’s DNS refresh
// rate. The value configured must be at least 1ms. If this setting is not specified, the
// value defaults to 5000ms. For cluster types other than
// or :ref:`LOGICAL_DNS<envoy_v3_api_enum_value_config.cluster.v3.Cluster.DiscoveryType.LOGICAL_DNS>`.
// DNS jitter causes the cluster to refresh DNS entries early by a round amount
// to avoid a stampede of DNS requests. This value sets the upper bound for the random amount (exclusive).
// Setting this value to 1ms will disable jitter.
Stevenjin8 marked this conversation as resolved.
Show resolved Hide resolved
// rate. If this setting is not specified, the value defaults to 512ms. For cluster types other than
// :ref:`STRICT_DNS<envoy_v3_api_enum_value_config.cluster.v3.Cluster.DiscoveryType.STRICT_DNS>`
// and :ref:`LOGICAL_DNS<envoy_v3_api_enum_value_config.cluster.v3.Cluster.DiscoveryType.LOGICAL_DNS>`
// this setting is ignored.
google.protobuf.Duration dns_refresh_rate = 16
[(validate.rules).duration = {gt {nanos: 1000000}}];
google.protobuf.Duration dns_jitter_max = 58;
Stevenjin8 marked this conversation as resolved.
Show resolved Hide resolved

// If the DNS failure refresh rate is specified and the cluster type is either
// :ref:`STRICT_DNS<envoy_v3_api_enum_value_config.cluster.v3.Cluster.DiscoveryType.STRICT_DNS>`,
Expand Down
13 changes: 13 additions & 0 deletions source/extensions/clusters/logical_dns/logical_dns_cluster.cc
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,8 @@ LogicalDnsCluster::LogicalDnsCluster(const envoy::config::cluster::v3::Cluster&
: ClusterImplBase(cluster, context, creation_status), dns_resolver_(dns_resolver),
dns_refresh_rate_ms_(
std::chrono::milliseconds(PROTOBUF_GET_MS_OR_DEFAULT(cluster, dns_refresh_rate, 5000))),
dns_jitter_max_ms_(
std::chrono::milliseconds(PROTOBUF_GET_MS_OR_DEFAULT(cluster, dns_jitter_max, 512))),
respect_dns_ttl_(cluster.respect_dns_ttl()),
resolve_timer_(context.serverFactoryContext().mainThreadDispatcher().createTimer(
[this]() -> void { startResolve(); })),
Expand Down Expand Up @@ -101,7 +103,14 @@ void LogicalDnsCluster::startResolve() {
active_dns_query_ = nullptr;
ENVOY_LOG(trace, "async DNS resolution complete for {} details {}", dns_address_, details);

std::chrono::milliseconds jitter(0);
if (dns_jitter_max_ms_.count() != 0) {
jitter = std::chrono::milliseconds(random_.random()) % dns_jitter_max_ms_;
}
std::chrono::milliseconds final_refresh_rate = dns_refresh_rate_ms_;
if (jitter < final_refresh_rate) {
final_refresh_rate -= jitter;
Stevenjin8 marked this conversation as resolved.
Show resolved Hide resolved
}

// If the DNS resolver successfully resolved with an empty response list, the logical DNS
// cluster does not update. This ensures that a potentially previously resolved address does
Expand Down Expand Up @@ -148,7 +157,11 @@ void LogicalDnsCluster::startResolve() {

if (respect_dns_ttl_ && addrinfo.ttl_ != std::chrono::seconds(0)) {
final_refresh_rate = addrinfo.ttl_;
if (jitter < final_refresh_rate) {
final_refresh_rate -= jitter;
}
}

ENVOY_LOG(debug, "DNS refresh rate reset for {}, refresh rate {} ms", dns_address_,
final_refresh_rate.count());
} else {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ class LogicalDnsCluster : public ClusterImplBase {

Network::DnsResolverSharedPtr dns_resolver_;
const std::chrono::milliseconds dns_refresh_rate_ms_;
const std::chrono::milliseconds dns_jitter_max_ms_;
BackOffStrategyPtr failure_backoff_strategy_;
const bool respect_dns_ttl_;
Network::DnsLookupFamily dns_lookup_family_;
Expand Down
14 changes: 14 additions & 0 deletions source/extensions/clusters/strict_dns/strict_dns_cluster.cc
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
#include "source/extensions/clusters/strict_dns/strict_dns_cluster.h"

#include <chrono>

#include "envoy/common/exception.h"
#include "envoy/config/cluster/v3/cluster.pb.h"
#include "envoy/config/endpoint/v3/endpoint.pb.h"
Expand Down Expand Up @@ -29,6 +31,7 @@ StrictDnsClusterImpl::StrictDnsClusterImpl(const envoy::config::cluster::v3::Clu
local_info_(context.serverFactoryContext().localInfo()), dns_resolver_(dns_resolver),
dns_refresh_rate_ms_(
std::chrono::milliseconds(PROTOBUF_GET_MS_OR_DEFAULT(cluster, dns_refresh_rate, 5000))),
dns_jitter_max_ms_(PROTOBUF_GET_MS_OR_DEFAULT(cluster, dns_jitter_max, 512)),
respect_dns_ttl_(cluster.respect_dns_ttl()) {
failure_backoff_strategy_ =
Config::Utility::prepareDnsRefreshStrategy<envoy::config::cluster::v3::Cluster>(
Expand Down Expand Up @@ -125,7 +128,15 @@ void StrictDnsClusterImpl::ResolveTarget::startResolve() {
active_query_ = nullptr;
ENVOY_LOG(trace, "async DNS resolution complete for {} details {}", dns_address_, details);

std::chrono::milliseconds jitter(0);
if (parent_.dns_jitter_max_ms_.count() != 0) {
jitter = std::chrono::milliseconds(parent_.random_.random()) % parent_.dns_jitter_max_ms_;
}

std::chrono::milliseconds final_refresh_rate = parent_.dns_refresh_rate_ms_;
if (jitter < final_refresh_rate) {
final_refresh_rate -= jitter;
}

if (status == Network::DnsResolver::ResolutionStatus::Success) {
parent_.info_->configUpdateStats().update_success_.inc();
Expand Down Expand Up @@ -186,6 +197,9 @@ void StrictDnsClusterImpl::ResolveTarget::startResolve() {
if (!response.empty() && parent_.respect_dns_ttl_ &&
ttl_refresh_rate != std::chrono::seconds(0)) {
final_refresh_rate = ttl_refresh_rate;
if (jitter < final_refresh_rate) {
final_refresh_rate -= jitter;
}
ASSERT(ttl_refresh_rate != std::chrono::seconds::max() &&
final_refresh_rate.count() > 0);
}
Expand Down
1 change: 1 addition & 0 deletions source/extensions/clusters/strict_dns/strict_dns_cluster.h
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ class StrictDnsClusterImpl : public BaseDynamicClusterImpl {
Network::DnsResolverSharedPtr dns_resolver_;
std::list<ResolveTargetPtr> resolve_targets_;
const std::chrono::milliseconds dns_refresh_rate_ms_;
const std::chrono::milliseconds dns_jitter_max_ms_;
BackOffStrategyPtr failure_backoff_strategy_;
const bool respect_dns_ttl_;
Network::DnsLookupFamily dns_lookup_family_;
Expand Down
48 changes: 46 additions & 2 deletions test/common/upstream/upstream_impl_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1391,6 +1391,7 @@ TEST_F(StrictDnsClusterImplTest, FailureRefreshRateBackoffResetsWhenSuccessHappe
type: STRICT_DNS
lb_policy: ROUND_ROBIN
dns_refresh_rate: 4s
dns_jitter_max: 0s
dns_failure_refresh_rate:
base_interval: 7s
max_interval: 10s
Expand Down Expand Up @@ -1432,7 +1433,7 @@ TEST_F(StrictDnsClusterImplTest, FailureRefreshRateBackoffResetsWhenSuccessHappe
TestUtility::makeDnsResponse({}));
}

TEST_F(StrictDnsClusterImplTest, TtlAsDnsRefreshRate) {
TEST_F(StrictDnsClusterImplTest, TtlAsDnsRefreshRateNoJitter) {
ResolverData resolver(*dns_resolver_, server_context_.dispatcher_);

const std::string yaml = R"EOF(
Expand All @@ -1442,6 +1443,7 @@ TEST_F(StrictDnsClusterImplTest, TtlAsDnsRefreshRate) {
lb_policy: ROUND_ROBIN
dns_refresh_rate: 4s
respect_dns_ttl: true
dns_jitter_max: 0s
load_assignment:
endpoints:
- lb_endpoints:
Expand Down Expand Up @@ -1488,10 +1490,52 @@ TEST_F(StrictDnsClusterImplTest, TtlAsDnsRefreshRate) {
TestUtility::makeDnsResponse({}, std::chrono::seconds(5)));
}

TEST_F(StrictDnsClusterImplTest, TtlAsDnsRefreshRateYesJitter) {
ResolverData resolver(*dns_resolver_, server_context_.dispatcher_);

// jitter defaults to 512ms
const std::string yaml = R"EOF(
name: name
connect_timeout: 0.25s
type: STRICT_DNS
lb_policy: ROUND_ROBIN
dns_refresh_rate: 4s
respect_dns_ttl: true
load_assignment:
endpoints:
- lb_endpoints:
- endpoint:
address:
socket_address:
address: localhost1
port_value: 11001
)EOF";

envoy::config::cluster::v3::Cluster cluster_config = parseClusterFromV3Yaml(yaml);

Envoy::Upstream::ClusterFactoryContextImpl factory_context(
server_context_, server_context_.cluster_manager_, nullptr, ssl_context_manager_, nullptr,
false);

auto cluster = *StrictDnsClusterImpl::create(cluster_config, factory_context, dns_resolver_);

cluster->initialize([] {});

uint64_t random_return = 8000;
uint64_t jitter_ms = random_return % 512;
uint64_t ttl_s = 6;

EXPECT_CALL(*resolver.timer_,
enableTimer(std::chrono::milliseconds(ttl_s * 1000 - jitter_ms), _));
ON_CALL(random_, random()).WillByDefault(Return(random_return));
resolver.dns_callback_(
Network::DnsResolver::ResolutionStatus::Success, "",
TestUtility::makeDnsResponse({"192.168.1.1", "192.168.1.2"}, std::chrono::seconds(ttl_s)));
}

// Ensures that HTTP/2 user defined SETTINGS parameter validation is enforced on clusters.
TEST_F(StrictDnsClusterImplTest, Http2UserDefinedSettingsParametersValidation) {
const std::string yaml = R"EOF(
name: name
connect_timeout: 0.25s
type: strict_dns
dns_refresh_rate: 4s
Expand Down
1 change: 1 addition & 0 deletions test/extensions/clusters/logical_dns/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ envoy_cc_test(
name = "logical_dns_cluster_test",
srcs = ["logical_dns_cluster_test.cc"],
deps = [
"//source/common/common:random_generator_lib",
"//source/common/event:dispatcher_lib",
"//source/common/network:utility_lib",
"//source/common/upstream:upstream_lib",
Expand Down
52 changes: 52 additions & 0 deletions test/extensions/clusters/logical_dns/logical_dns_cluster_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -319,6 +319,7 @@ TEST_F(LogicalDnsParamTest, FailureRefreshRateBackoffResetsWhenSuccessHappens) {
name: name
type: LOGICAL_DNS
dns_refresh_rate: 4s
dns_jitter_max: 0s
dns_failure_refresh_rate:
base_interval: 7s
max_interval: 10s
Expand Down Expand Up @@ -525,6 +526,10 @@ TEST_F(LogicalDnsClusterTest, Basic) {
name: name
type: LOGICAL_DNS
dns_refresh_rate: 4s
# disable jitter
dns_jitter_max:
seconds: 0
nanos: 1000000
dns_failure_refresh_rate:
base_interval: 7s
max_interval: 10s
Expand All @@ -547,6 +552,10 @@ TEST_F(LogicalDnsClusterTest, Basic) {
name: name
type: LOGICAL_DNS
dns_refresh_rate: 4s
# disable jitter
dns_jitter_max:
seconds: 0
nanos: 1000000
dns_failure_refresh_rate:
base_interval: 7s
max_interval: 10s
Expand Down Expand Up @@ -581,6 +590,10 @@ TEST_F(LogicalDnsClusterTest, DontWaitForDNSOnInit) {
dns_failure_refresh_rate:
base_interval: 7s
max_interval: 10s
# disable jitter
dns_jitter_max:
seconds: 0
nanos: 1000000
connect_timeout: 0.25s
lb_policy: ROUND_ROBIN
# Since the following expectResolve() requires Network::DnsLookupFamily::V4Only we need to set
Expand All @@ -602,11 +615,50 @@ TEST_F(LogicalDnsClusterTest, DontWaitForDNSOnInit) {
setupFromV3Yaml(config);

EXPECT_CALL(membership_updated_, ready());

EXPECT_CALL(*resolve_timer_, enableTimer(std::chrono::milliseconds(4000), _));
dns_callback_(Network::DnsResolver::ResolutionStatus::Success, "",
TestUtility::makeDnsResponse({"127.0.0.1", "127.0.0.2"}));
}

TEST_F(LogicalDnsClusterTest, DNSRefreshHasJitter) {
const std::string config = R"EOF(
name: name
type: LOGICAL_DNS
dns_refresh_rate: 4s
connect_timeout: 0.25s
lb_policy: ROUND_ROBIN
# Since the following expectResolve() requires Network::DnsLookupFamily::V4Only we need to set
# dns_lookup_family to V4_ONLY explicitly for v2 .yaml config.
dns_lookup_family: V4_ONLY
wait_for_warm_on_init: false
load_assignment:
endpoints:
- lb_endpoints:
- endpoint:
address:
socket_address:
address: foo.bar.com
port_value: 443
)EOF";

uint64_t random_return = 8000;
uint64_t jitter_ms = random_return % 512; // default value

// We don't set `respect_dns_ttl`, so we use `dns_refresh_rate` instead of the ttl.
EXPECT_CALL(initialized_, ready());
expectResolve(Network::DnsLookupFamily::V4Only, "foo.bar.com");
setupFromV3Yaml(config);

EXPECT_CALL(membership_updated_, ready());
EXPECT_CALL(*resolve_timer_, enableTimer(std::chrono::milliseconds(4000 - jitter_ms), _));
ON_CALL(random_, random()).WillByDefault(Return(random_return));

dns_callback_(
Network::DnsResolver::ResolutionStatus::Success, "",
TestUtility::makeDnsResponse({"127.0.0.1", "127.0.0.2"}, std::chrono::seconds(3000)));
}

} // namespace
} // namespace Upstream
} // namespace Envoy