From 36d445a9f8d59eceed740c2b40a25663d0afff76 Mon Sep 17 00:00:00 2001
From: Snow Pettersen <snowp@lyft.com>
Date: Fri, 28 Aug 2020 20:30:04 -0400
Subject: [PATCH] fault: only check for overflow when performing faults
 (#12843)

This shuffles the check for overflow around so that we only increment
the overflow value when attempting to apply a fault instead of for
each request. This ensures that if applying a low % fault the overflow
stat will be in line with that, instead of counting each request once
the overflow threshold has been hit.

Signed-off-by: Snow Pettersen <snowp@lyft.com>
Signed-off-by: Clara Andrew-Wani <candrewwani@gmail.com>
---
 .../filters/http/fault/fault_filter.cc        | 62 +++++++++++++------
 .../filters/http/fault/fault_filter.h         |  7 ++-
 .../filters/http/fault/fault_filter_test.cc   | 47 +++++++-------
 3 files changed, 73 insertions(+), 43 deletions(-)

diff --git a/source/extensions/filters/http/fault/fault_filter.cc b/source/extensions/filters/http/fault/fault_filter.cc
index 245f44b98a56..4b98e5583a89 100644
--- a/source/extensions/filters/http/fault/fault_filter.cc
+++ b/source/extensions/filters/http/fault/fault_filter.cc
@@ -8,6 +8,7 @@
 #include "envoy/event/timer.h"
 #include "envoy/extensions/filters/http/fault/v3/fault.pb.h"
 #include "envoy/http/codes.h"
+#include "envoy/http/filter.h"
 #include "envoy/http/header_map.h"
 #include "envoy/stats/scope.h"
 
@@ -118,10 +119,6 @@ Http::FilterHeadersStatus FaultFilter::decodeHeaders(Http::RequestHeaderMap& hea
     fault_settings_ = per_route_settings ? per_route_settings : fault_settings_;
   }
 
-  if (faultOverflow()) {
-    return Http::FilterHeadersStatus::Continue;
-  }
-
   if (!matchesTargetUpstreamCluster()) {
     return Http::FilterHeadersStatus::Continue;
   }
@@ -156,27 +153,42 @@ Http::FilterHeadersStatus FaultFilter::decodeHeaders(Http::RequestHeaderMap& hea
 
   maybeSetupResponseRateLimit(headers);
 
-  absl::optional<std::chrono::milliseconds> duration = delayDuration(headers);
-  if (duration.has_value()) {
+  if (maybeSetupDelay(headers)) {
+    return Http::FilterHeadersStatus::StopIteration;
+  }
+
+  if (maybeDoAbort(headers)) {
+    return Http::FilterHeadersStatus::StopIteration;
+  }
+
+  return Http::FilterHeadersStatus::Continue;
+}
+
+bool FaultFilter::maybeSetupDelay(const Http::RequestHeaderMap& request_headers) {
+  absl::optional<std::chrono::milliseconds> duration = delayDuration(request_headers);
+  if (duration.has_value() && tryIncActiveFaults()) {
     delay_timer_ = decoder_callbacks_->dispatcher().createTimer(
-        [this, &headers]() -> void { postDelayInjection(headers); });
+        [this, &request_headers]() -> void { postDelayInjection(request_headers); });
     ENVOY_LOG(debug, "fault: delaying request {}ms", duration.value().count());
     delay_timer_->enableTimer(duration.value(), &decoder_callbacks_->scope());
     recordDelaysInjectedStats();
     decoder_callbacks_->streamInfo().setResponseFlag(StreamInfo::ResponseFlag::DelayInjected);
-    return Http::FilterHeadersStatus::StopIteration;
+    return true;
   }
+  return false;
+}
 
+bool FaultFilter::maybeDoAbort(const Http::RequestHeaderMap& request_headers) {
   absl::optional<Http::Code> http_status;
   absl::optional<Grpc::Status::GrpcStatus> grpc_status;
-  std::tie(http_status, grpc_status) = abortStatus(headers);
+  std::tie(http_status, grpc_status) = abortStatus(request_headers);
 
-  if (http_status.has_value()) {
+  if (http_status.has_value() && tryIncActiveFaults()) {
     abortWithStatus(http_status.value(), grpc_status);
-    return Http::FilterHeadersStatus::StopIteration;
+    return true;
   }
 
-  return Http::FilterHeadersStatus::Continue;
+  return false;
 }
 
 void FaultFilter::maybeSetupResponseRateLimit(const Http::RequestHeaderMap& request_headers) {
@@ -190,8 +202,10 @@ void FaultFilter::maybeSetupResponseRateLimit(const Http::RequestHeaderMap& requ
     return;
   }
 
-  // General stats. All injected faults are considered a single aggregate active fault.
-  maybeIncActiveFaults();
+  if (!tryIncActiveFaults()) {
+    return;
+  }
+
   config_->stats().response_rl_injected_.inc();
 
   response_limiter_ = std::make_unique<StreamRateLimiter>(
@@ -356,8 +370,6 @@ void FaultFilter::recordDelaysInjectedStats() {
     config_->incDelays(downstream_cluster_storage_->statName());
   }
 
-  // General stats. All injected faults are considered a single aggregate active fault.
-  maybeIncActiveFaults();
   config_->stats().delays_injected_.inc();
 }
 
@@ -367,8 +379,6 @@ void FaultFilter::recordAbortsInjectedStats() {
     config_->incAborts(downstream_cluster_storage_->statName());
   }
 
-  // General stats. All injected faults are considered a single aggregate active fault.
-  maybeIncActiveFaults();
   config_->stats().aborts_injected_.inc();
 }
 
@@ -391,15 +401,27 @@ FaultFilterStats FaultFilterConfig::generateStats(const std::string& prefix, Sta
                                  POOL_GAUGE_PREFIX(scope, final_prefix))};
 }
 
-void FaultFilter::maybeIncActiveFaults() {
+bool FaultFilter::tryIncActiveFaults() {
   // Only charge 1 active fault per filter in case we are injecting multiple faults.
+  // Since we count at most one active fault per filter, we also allow multiple faults
+  // per filter without checking for overflow.
   if (fault_active_) {
-    return;
+    return true;
+  }
+
+  // We only check for overflow when attempting to perform a fault. Note that this means that a
+  // single request might increment the counter more than once if it tries to apply multiple faults,
+  // and it is also possible for it to fail the first check then succeed on the second (should
+  // another thread decrement the active fault gauge).
+  if (faultOverflow()) {
+    return false;
   }
 
   // TODO(mattklein123): Consider per-fault type active fault gauges.
   config_->stats().active_faults_.inc();
   fault_active_ = true;
+
+  return true;
 }
 
 void FaultFilter::onDestroy() {
diff --git a/source/extensions/filters/http/fault/fault_filter.h b/source/extensions/filters/http/fault/fault_filter.h
index 206a8134c72c..e1ff4d275937 100644
--- a/source/extensions/filters/http/fault/fault_filter.h
+++ b/source/extensions/filters/http/fault/fault_filter.h
@@ -7,6 +7,7 @@
 
 #include "envoy/extensions/filters/http/fault/v3/fault.pb.h"
 #include "envoy/http/filter.h"
+#include "envoy/http/header_map.h"
 #include "envoy/runtime/runtime.h"
 #include "envoy/stats/scope.h"
 #include "envoy/stats/stats_macros.h"
@@ -262,7 +263,11 @@ class FaultFilter : public Http::StreamFilter, Logger::Loggable<Logger::Id::filt
   absl::optional<Http::Code> abortHttpStatus(const Http::RequestHeaderMap& request_headers);
   absl::optional<Grpc::Status::GrpcStatus>
   abortGrpcStatus(const Http::RequestHeaderMap& request_headers);
-  void maybeIncActiveFaults();
+  // Attempts to increase the number of active faults. Returns false if we've reached the maximum
+  // number of allowed faults, in which case no fault should be performed.
+  bool tryIncActiveFaults();
+  bool maybeDoAbort(const Http::RequestHeaderMap& request_headers);
+  bool maybeSetupDelay(const Http::RequestHeaderMap& request_headers);
   void maybeSetupResponseRateLimit(const Http::RequestHeaderMap& request_headers);
 
   FaultFilterConfigSharedPtr config_;
diff --git a/test/extensions/filters/http/fault/fault_filter_test.cc b/test/extensions/filters/http/fault/fault_filter_test.cc
index a29c5cc85816..0e25b9acb7de 100644
--- a/test/extensions/filters/http/fault/fault_filter_test.cc
+++ b/test/extensions/filters/http/fault/fault_filter_test.cc
@@ -1,5 +1,6 @@
 #include <chrono>
 #include <cstdint>
+#include <limits>
 #include <memory>
 #include <string>
 
@@ -482,10 +483,6 @@ TEST_F(FaultFilterTest, HeaderAbortWithHttpAndGrpcStatus) {
 TEST_F(FaultFilterTest, FixedDelayZeroDuration) {
   SetUpTest(fixed_delay_only_yaml);
 
-  EXPECT_CALL(runtime_.snapshot_,
-              getInteger("fault.http.max_active_faults", std::numeric_limits<uint64_t>::max()))
-      .WillOnce(Return(std::numeric_limits<uint64_t>::max()));
-
   // Delay related calls
   EXPECT_CALL(runtime_.snapshot_,
               featureEnabled("fault.http.delay.fixed_delay_percent",
@@ -511,18 +508,38 @@ TEST_F(FaultFilterTest, FixedDelayZeroDuration) {
 }
 
 TEST_F(FaultFilterTest, Overflow) {
-  envoy::extensions::filters::http::fault::v3::HTTPFault fault;
-  fault.mutable_max_active_faults()->set_value(0);
-  SetUpTest(fault);
+  SetUpTest(fixed_delay_only_yaml);
+
+  // Delay related calls
+  EXPECT_CALL(runtime_.snapshot_,
+              featureEnabled("fault.http.delay.fixed_delay_percent",
+                             Matcher<const envoy::type::v3::FractionalPercent&>(Percent(100))))
+      .WillOnce(Return(true));
+
+  // Return 1ms delay
+  EXPECT_CALL(runtime_.snapshot_, getInteger("fault.http.delay.fixed_duration_ms", 5000))
+      .WillOnce(Return(1));
 
-  EXPECT_CALL(runtime_.snapshot_, getInteger("fault.http.max_active_faults", 0))
+  EXPECT_CALL(runtime_.snapshot_,
+              getInteger("fault.http.max_active_faults", std::numeric_limits<uint64_t>::max()))
       .WillOnce(Return(0));
 
   EXPECT_EQ(Http::FilterHeadersStatus::Continue, filter_->decodeHeaders(request_headers_, true));
 
+  EXPECT_EQ(0UL, config_->stats().active_faults_.value());
   EXPECT_EQ(1UL, config_->stats().faults_overflow_.value());
 }
 
+// Verifies that we don't increment the active_faults gauge when not applying a fault.
+TEST_F(FaultFilterTest, Passthrough) {
+  envoy::extensions::filters::http::fault::v3::HTTPFault fault;
+  SetUpTest(fault);
+
+  EXPECT_EQ(Http::FilterHeadersStatus::Continue, filter_->decodeHeaders(request_headers_, true));
+
+  EXPECT_EQ(0UL, config_->stats().active_faults_.value());
+}
+
 TEST_F(FaultFilterTest, FixedDelayDeprecatedPercentAndNonZeroDuration) {
   envoy::extensions::filters::http::fault::v3::HTTPFault fault;
   fault.mutable_delay()->mutable_percentage()->set_numerator(50);
@@ -790,10 +807,6 @@ TEST_F(FaultFilterTest, FixedDelayAndAbortDownstreamNodes) {
 TEST_F(FaultFilterTest, NoDownstreamMatch) {
   SetUpTest(fixed_delay_and_abort_nodes_yaml);
 
-  EXPECT_CALL(runtime_.snapshot_,
-              getInteger("fault.http.max_active_faults", std::numeric_limits<uint64_t>::max()))
-      .WillOnce(Return(std::numeric_limits<uint64_t>::max()));
-
   EXPECT_EQ(Http::FilterHeadersStatus::Continue, filter_->decodeHeaders(request_headers_, true));
 }
 
@@ -857,10 +870,6 @@ TEST_F(FaultFilterTest, FixedDelayAndAbortHeaderMatchFail) {
   request_headers_.addCopy("x-foo1", "Bar");
   request_headers_.addCopy("x-foo3", "Baz");
 
-  EXPECT_CALL(runtime_.snapshot_,
-              getInteger("fault.http.max_active_faults", std::numeric_limits<uint64_t>::max()))
-      .WillOnce(Return(std::numeric_limits<uint64_t>::max()));
-
   EXPECT_CALL(runtime_.snapshot_,
               featureEnabled("fault.http.delay.fixed_delay_percent",
                              Matcher<const envoy::type::v3::FractionalPercent&>(_)))
@@ -984,9 +993,6 @@ TEST_F(FaultFilterTest, FaultWithTargetClusterMatchFail) {
 
   EXPECT_CALL(decoder_filter_callbacks_.route_->route_entry_, clusterName())
       .WillOnce(ReturnRef(upstream_cluster));
-  EXPECT_CALL(runtime_.snapshot_,
-              getInteger("fault.http.max_active_faults", std::numeric_limits<uint64_t>::max()))
-      .WillOnce(Return(std::numeric_limits<uint64_t>::max()));
   EXPECT_CALL(runtime_.snapshot_,
               featureEnabled("fault.http.delay.fixed_delay_percent",
                              Matcher<const envoy::type::v3::FractionalPercent&>(_)))
@@ -1014,9 +1020,6 @@ TEST_F(FaultFilterTest, FaultWithTargetClusterNullRoute) {
   const std::string upstream_cluster("www1");
 
   EXPECT_CALL(*decoder_filter_callbacks_.route_, routeEntry()).WillRepeatedly(Return(nullptr));
-  EXPECT_CALL(runtime_.snapshot_,
-              getInteger("fault.http.max_active_faults", std::numeric_limits<uint64_t>::max()))
-      .WillOnce(Return(std::numeric_limits<uint64_t>::max()));
   EXPECT_CALL(runtime_.snapshot_,
               featureEnabled("fault.http.delay.fixed_delay_percent",
                              Matcher<const envoy::type::v3::FractionalPercent&>(_)))