Skip to content

Commit

Permalink
Fix potential hangs on exit & cap drain wait time (#628)
Browse files Browse the repository at this point in the history
- don't set up a drain callback when there are no active connections
- set up a timer that will cap the amount of time we wait for the pool
  to drain.
- disable latency measurement when commencing the drain procedure, as
  by that time we are no longer interested in it, and in particular
  don't want to hear about any warnings issued by the Statistic
  implementation about recorded values being too large. 

Fixes #627

Signed-off-by: Otto van der Schaaf <[email protected]>
  • Loading branch information
Otto van der Schaaf authored Feb 15, 2021
1 parent ff42695 commit 15b86be
Show file tree
Hide file tree
Showing 6 changed files with 68 additions and 5 deletions.
2 changes: 1 addition & 1 deletion ci/do_ci.sh
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ function do_clang_tidy() {

function do_unit_test_coverage() {
export TEST_TARGETS="//test/... -//test:python_test"
export COVERAGE_THRESHOLD=94.0
export COVERAGE_THRESHOLD=94.3
echo "bazel coverage build with tests ${TEST_TARGETS}"
test/run_nighthawk_bazel_coverage.sh ${TEST_TARGETS}
exit 0
Expand Down
20 changes: 17 additions & 3 deletions source/client/benchmark_client_impl.cc
Original file line number Diff line number Diff line change
Expand Up @@ -107,9 +107,23 @@ BenchmarkClientHttpImpl::BenchmarkClientHttpImpl(
}

void BenchmarkClientHttpImpl::terminate() {
if (pool() != nullptr) {
pool()->addDrainedCallback([this]() -> void { dispatcher_.exit(); });
pool()->drainConnections();
if (pool() != nullptr && pool()->hasActiveConnections()) {
// We don't report what happens after this call in the output, but latencies may still be
// reported via callbacks. This may happen after a long time (60s), which HdrHistogram can't
// track the way we configure it today, as that exceeds the max that it can record.
// No harm is done, but it does result in log lines warning about it. Avoid that, by
// disabling latency measurement here.
setShouldMeasureLatencies(false);
pool()->addDrainedCallback([this]() -> void {
drain_timer_->disableTimer();
dispatcher_.exit();
});
// Set up a timer with a callback which caps the time we wait for the pool to drain.
drain_timer_ = dispatcher_.createTimer([this]() -> void {
ENVOY_LOG(info, "Wait for the connection pool drain timed out, proceeding to hard shutdown.");
dispatcher_.exit();
});
drain_timer_->enableTimer(5s);
dispatcher_.run(Envoy::Event::Dispatcher::RunType::RunUntilExit);
}
}
Expand Down
1 change: 1 addition & 0 deletions source/client/benchmark_client_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,7 @@ class BenchmarkClientHttpImpl : public BenchmarkClient,
const RequestGenerator request_generator_;
const bool provide_resource_backpressure_;
const std::string latency_response_header_name_;
Envoy::Event::TimerPtr drain_timer_;
};

} // namespace Client
Expand Down
26 changes: 25 additions & 1 deletion test/benchmark_http_client_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -216,7 +216,7 @@ class BenchmarkClientHttpTest : public Test {
int worker_number_{0};
Client::BenchmarkClientStatistic statistic_;
std::shared_ptr<Envoy::Http::RequestHeaderMap> default_header_map_;
}; // namespace Nighthawk
};

TEST_F(BenchmarkClientHttpTest, BasicTestH1200) {
response_code_ = "200";
Expand Down Expand Up @@ -419,4 +419,28 @@ TEST_F(BenchmarkClientHttpTest, RequestGeneratorProvidingDifferentPathsSendsRequ
&expected_requests);
EXPECT_EQ(2, getCounter("http_2xx"));
}

TEST_F(BenchmarkClientHttpTest, DrainTimeoutFires) {
RequestGenerator default_request_generator = getDefaultRequestGenerator();
setupBenchmarkClient(default_request_generator);
EXPECT_CALL(pool_, newStream(_, _))
.WillOnce(
[this](Envoy::Http::ResponseDecoder& decoder, Envoy::Http::ConnectionPool::Callbacks&)
-> Envoy::Http::ConnectionPool::Cancellable* {
// The decoder self-terminates in normal operation, but in this test that won't
// happen. Se we delete it ourselves. Note that we run our integration test with
// asan, so any leaks in real usage ought to be caught there.
delete &decoder;
client_->terminate();
return nullptr;
});
EXPECT_CALL(pool_, hasActiveConnections()).WillOnce([]() -> bool { return true; });
EXPECT_CALL(pool_, addDrainedCallback(_));
// We don't expect the callback that we pass here to fire.
client_->tryStartRequest([](bool, bool) { EXPECT_TRUE(false); });
// To get past this, the drain timeout within the benchmark client must execute.
dispatcher_->run(Envoy::Event::Dispatcher::RunType::Block);
EXPECT_EQ(0, getCounter("http_2xx"));
}

} // namespace Nighthawk
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ static_resources:
domains:
- "*"
http_filters:
- name: dynamic-delay
- name: test-server
typed_config:
"@type": type.googleapis.com/nighthawk.server.ResponseOptions
Expand Down
23 changes: 23 additions & 0 deletions test/integration/test_integration_basics.py
Original file line number Diff line number Diff line change
Expand Up @@ -794,3 +794,26 @@ def test_client_cli_bad_uri(http_test_server_fixture):
expect_failure=True,
as_json=False)
assert "Invalid target URI" in err


@pytest.mark.parametrize('server_config',
["nighthawk/test/integration/configurations/nighthawk_https_origin.yaml"])
def test_drain(https_test_server_fixture):
"""Test that the pool drain timeout is effective, and we terminate in a timely fashion.
Sets up the test server to delay replies 100 seconds. Our execution will only last 3 seconds, so we
expect to observe no replies. Termination should be cut short by the drain timeout, which means
that we should have results in approximately execution duration + drain timeout = 8 seconds.
(the pool drain timeout is hard coded to 5 seconds as of writing this).
"""
t0 = time.time()
parsed_json, _ = https_test_server_fixture.runNighthawkClient([
https_test_server_fixture.getTestServerRootUri(), "--rps", "100", "--duration", "3",
"--request-header", "x-nighthawk-test-server-config: {static_delay: \"100s\"}"
])
t1 = time.time()
time_delta = t1 - t0
counters = https_test_server_fixture.getNighthawkCounterMapFromJson(parsed_json)
assert time_delta < 40 # *lots* of slack to avoid failure in slow CI executions.
asserts.assertCounterGreaterEqual(counters, "upstream_cx_http1_total", 1)
asserts.assertNotIn("benchmark.http_2xx", counters)

0 comments on commit 15b86be

Please sign in to comment.