Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

OCPBUGS-14914: Set tunnel and server timeouts at backend level #536

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,13 @@ GO_LDFLAGS ?=-ldflags "-s -w $(call version-ldflags,$(PACKAGE)/pkg/version) $(GO
GO=GO111MODULE=on GOFLAGS=-mod=vendor go
GO_BUILD_RECIPE=CGO_ENABLED=1 $(GO) build -o $(BIN) $(GO_GCFLAGS) $(GO_LDFLAGS) $(MAIN_PACKAGE)

# Get all the packages with benchmark tests defined to prevent
# "go test" from entering all existing packages when "./.." is used.
BENCH_PKGS ?= $(shell \grep -lR '^func Benchmark' | xargs -I {} dirname {} | sed 's|^|./|' | paste -sd ' ')
BENCH_TEST ?= .
Comment on lines +30 to +31
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would you mind adding a couple of comments about what these are for? I see below that BENCH_PKGS is a value for flag -benchmem, so it could use a clarification.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done.

BENCH_TIME ?= 2s
BENCH_COUNT ?= 5

all: build

build:
Expand All @@ -39,6 +46,11 @@ images/router/*/Dockerfile.rhel: images/router/base/Dockerfile.rhel
check:
CGO_ENABLED=1 $(GO) test -race ./...

.PHONY: bench
bench:
@# -run flag is added to avoid running unit tests
CGO_ENABLED=1 $(GO) test -bench=$(BENCH_TEST) -run='^#' -benchtime=$(BENCH_TIME) -count=$(BENCH_COUNT) -benchmem $(BENCH_PKGS)

.PHONY: verify
verify:
hack/verify-gofmt.sh
Expand Down
26 changes: 16 additions & 10 deletions images/router/haproxy/conf/haproxy-config.template
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
{{- $dynamicConfigManager := .DynamicConfigManager }}
{{- $router_ip_v4_v6_mode := env "ROUTER_IP_V4_V6_MODE" "v4" }}
{{- $router_disable_http2 := env "ROUTER_DISABLE_HTTP2" "false" }}
{{- $routerDefaultServerTimeout := env "ROUTER_DEFAULT_SERVER_TIMEOUT" "30s" }}
{{- $routerDefaultTunnelTimeout := env "ROUTER_DEFAULT_TUNNEL_TIMEOUT" "1h" }}
{{- $haveClientCA := .HaveClientCA }}
{{- $haveCRLs := .HaveCRLs }}

Expand Down Expand Up @@ -42,6 +44,9 @@
{{- /* pathRewriteTargetPattern: Match path rewrite-Target */}}
{{- $pathRewriteTargetPattern := `^/.*$` -}}

{{- /* Maximum timeout among all the routes, required to be set on the middle backends to avoid warning message about missing server timeout. */}}
{{- $routerMaxServerTimeout := maxTimeoutFirstMatchedAndClipped .State "haproxy.router.openshift.io/timeout" $timeSpecPattern $routerDefaultServerTimeout }}

global
# Drop resource limit checks to mitigate https://issues.redhat.com/browse/OCPBUGS-21803 in HAProxy 2.6.
no strict-limits
Expand Down Expand Up @@ -158,14 +163,10 @@ defaults
timeout connect {{ firstMatch $timeSpecPattern (env "ROUTER_DEFAULT_CONNECT_TIMEOUT") "5s" }}
timeout client {{ firstMatch $timeSpecPattern (env "ROUTER_DEFAULT_CLIENT_TIMEOUT") "30s" }}
timeout client-fin {{ firstMatch $timeSpecPattern (env "ROUTER_CLIENT_FIN_TIMEOUT") "1s" }}
timeout server {{ firstMatch $timeSpecPattern (env "ROUTER_DEFAULT_SERVER_TIMEOUT") "30s" }}
timeout server-fin {{ firstMatch $timeSpecPattern (env "ROUTER_DEFAULT_SERVER_FIN_TIMEOUT") "1s" }}
timeout http-request {{ firstMatch $timeSpecPattern (env "ROUTER_SLOWLORIS_TIMEOUT") "10s" }}
timeout http-keep-alive {{ firstMatch $timeSpecPattern (env "ROUTER_SLOWLORIS_HTTP_KEEPALIVE") "300s" }}

# Long timeout for WebSocket connections.
timeout tunnel {{ firstMatch $timeSpecPattern (env "ROUTER_DEFAULT_TUNNEL_TIMEOUT") "1h" }}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why don't you need to specify timeout tunnel on be_sni or be_no_sni? Is HAProxy more lenient about not specifying timeout tunnel than it is about not specifying timeout server?

Does omitting timeout tunnel result in an infinite timeout, similarly to omitting timeout server, or does it fall back to using the server timeout as the tunnel timeout? Crucially, if you have a route that has a tunnel timeout that is greater than any route's server timeout, is the tunnel timeout respected, or does the server timeout in be_sni/be_no_sni cause the connection to be terminated before the route's tunnel timeout?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why don't you need to specify timeout tunnel on be_sni or be_no_sni? Is HAProxy more lenient about not specifying timeout tunnel than it is about not specifying timeout server?

Yes. Only the missing server timeout gives a warning message. I'm not sure the warning reports a real problem though taking into account that the bottom backends will still have the server timeout set.

Does omitting timeout tunnel result in an infinite timeout, similarly to omitting timeout server, or does it fall back to using the server timeout as the tunnel timeout? Crucially, if you have a route that has a tunnel timeout that is greater than any route's server timeout, is the tunnel timeout respected, or does the server timeout in be_sni/be_no_sni cause the connection to be terminated before the route's tunnel timeout?

Regarding the link between server and tunnel timeouts. I discovered that the tunnel timeout supersedes server timeout in the TCP proxies. Note that the middle backends in the router configuration are TCP proxies. So setting tunnel timeout in default or the middle backends (be_sni/be_no_sni) results into interference in the server timeout behavior set on the bottom backend level. I didn't see the opposite though: the server timeout set on default or middle backends didn't influence the tunnel timeout set on the bottom backends.


{{- if isTrue (env "ROUTER_ENABLE_COMPRESSION") }}
compression algo gzip
compression type {{ env "ROUTER_COMPRESSION_MIME" "text/html text/plain text/css" }}
Expand Down Expand Up @@ -318,6 +319,7 @@ frontend public_ssl
# traffic
##########################################################################
backend be_sni
timeout server {{ $routerMaxServerTimeout }}
server fe_sni unix@/var/lib/haproxy/run/haproxy-sni.sock weight 1 send-proxy

frontend fe_sni
Expand Down Expand Up @@ -434,6 +436,7 @@ frontend fe_sni
##########################################################################
# backend for when sni does not exist, or ssl term needs to happen on the edge
backend be_no_sni
timeout server {{ $routerMaxServerTimeout }}
server fe_no_sni unix@/var/lib/haproxy/run/haproxy-no-sni.sock weight 1 send-proxy

frontend fe_no_sni
Expand Down Expand Up @@ -593,11 +596,11 @@ backend {{ genBackendNamePrefix $cfg.TLSTermination }}:{{ $cfgIdx }}
{{- end }}
tcp-request content reject if !whitelist
{{- end }}
{{- with $value := clipHAProxyTimeoutValue (firstMatch $timeSpecPattern (index $cfg.Annotations "haproxy.router.openshift.io/timeout")) }}
timeout server {{ $value }}
{{- with $value := clipHAProxyTimeoutValue (firstMatch $timeSpecPattern (index $cfg.Annotations "haproxy.router.openshift.io/timeout") $routerDefaultServerTimeout) }}
timeout server {{ $value }}
{{- end }}
{{- with $value := clipHAProxyTimeoutValue (firstMatch $timeSpecPattern (index $cfg.Annotations "haproxy.router.openshift.io/timeout-tunnel")) }}
timeout tunnel {{ $value }}
{{- with $value := clipHAProxyTimeoutValue (firstMatch $timeSpecPattern (index $cfg.Annotations "haproxy.router.openshift.io/timeout-tunnel") $routerDefaultTunnelTimeout) }}
timeout tunnel {{ $value }}
{{- end }}

{{- if isTrue (index $cfg.Annotations "haproxy.router.openshift.io/rate-limit-connections") }}
Expand Down Expand Up @@ -797,8 +800,11 @@ backend {{ genBackendNamePrefix $cfg.TLSTermination }}:{{ $cfgIdx }}
{{- end }}
tcp-request content reject if !whitelist
{{- end }}
{{- with $value := clipHAProxyTimeoutValue (firstMatch $timeSpecPattern (index $cfg.Annotations "haproxy.router.openshift.io/timeout-tunnel") (index $cfg.Annotations "haproxy.router.openshift.io/timeout")) }}
timeout tunnel {{ $value }}
{{- with $value := clipHAProxyTimeoutValue (firstMatch $timeSpecPattern (index $cfg.Annotations "haproxy.router.openshift.io/timeout") $routerDefaultServerTimeout) }}
timeout server {{ $value }}
{{- end }}
{{- with $value := clipHAProxyTimeoutValue (firstMatch $timeSpecPattern (index $cfg.Annotations "haproxy.router.openshift.io/timeout-tunnel") (index $cfg.Annotations "haproxy.router.openshift.io/timeout") $routerDefaultTunnelTimeout) }}
timeout tunnel {{ $value }}
{{- end }}

{{- if isTrue (index $cfg.Annotations "haproxy.router.openshift.io/rate-limit-connections") }}
Expand Down
35 changes: 35 additions & 0 deletions pkg/router/template/template_helper.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ import (
"strings"
"sync"
"text/template"
"time"

routev1 "github.com/openshift/api/route/v1"
"github.com/openshift/router/pkg/router/routeapihelpers"
Expand Down Expand Up @@ -389,6 +390,38 @@ func parseIPList(list string) string {
return list
}

// maxTimeoutFirstMatchedAndClipped finds the maximum timeout managed by a given annotation among all the routes, matches it against a given pattern, and clips.
// The goal is to get the maximum timeout among the ones set on the lowest layer backends, rather than the maximum of all values provided to the function.
// For instance, if a route has a timeout annotation set, it will take precedence over the default timeout, even if the default timeout is greater.
func maxTimeoutFirstMatchedAndClipped(aliases map[ServiceAliasConfigKey]ServiceAliasConfig, annotation, pattern string, values ...string) string {
var (
max string
maxDuration time.Duration
)
// find max timeout in route annotations
for _, cfg := range aliases {
timeout := clipHAProxyTimeoutValue(firstMatch(pattern, cfg.Annotations[annotation]))
if timeout != "" {
// No error handling because clipHAProxyTimeoutValue returns
// a valid timeout or an empty string. The latter is already handled.
timeoutDuration, _ := time.ParseDuration(timeout)
if timeoutDuration > maxDuration {
max = timeout
maxDuration = timeoutDuration
}
}
}
// use values if no max was found in routes
if max == "" {
max = clipHAProxyTimeoutValue(firstMatch(pattern, values...))
}
// use max haproxy timeout if no max was found
if max == "" {
max = templateutil.HaproxyMaxTimeout
}
return max
}

var helperFunctions = template.FuncMap{
"endpointsForAlias": endpointsForAlias, //returns the list of valid endpoints
"processEndpointsForAlias": processEndpointsForAlias, //returns the list of valid endpoints after processing them
Expand Down Expand Up @@ -416,4 +449,6 @@ var helperFunctions = template.FuncMap{
"parseIPList": parseIPList, //parses the list of IPs/CIDRs (IPv4/IPv6)

"processRewriteTarget": rewritetarget.SanitizeInput, //sanitizes `haproxy.router.openshift.io/rewrite-target` annotation

"maxTimeoutFirstMatchedAndClipped": maxTimeoutFirstMatchedAndClipped, //finds the maximum timeout managed by a given annotation among all the routes, matches it against a given pattern, and clips
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Keep in mind that this list of helper functions is effectively an API. We cannot rename or remove a helper function without potentially breaking third-party templates, so we tend to prefer defining general and composable helpers. That said, I think it is worth putting this functionality in a single specialized helper to avoid excessive performance impact.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it is worth putting this functionality in a single specialized helper to avoid excessive performance impact.

I'm not sure I understand how putting the function into a different helper supposed to remove the API contract. Taking into account that the function will still be used in the config template.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

My point was that there is a tradeoff here, and we should be aware of it.

  • Pro: Simpler, more efficient template.
  • Con: Risk of breaking unsupported customizations if we ever change or remove the helper.

However, it is probably worth making that tradeoff here. I'm not asking you to change anything; I'm just trying to be deliberate and explicit about the design choice.

}
210 changes: 210 additions & 0 deletions pkg/router/template/template_helper_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1067,3 +1067,213 @@ func TestParseIPList(t *testing.T) {
})
}
}

func Test_maxTimeoutFirstMatchedAndClipped(t *testing.T) {
testCases := []struct {
name string
state map[ServiceAliasConfigKey]ServiceAliasConfig
annotation string
pattern string
Comment on lines +1075 to +1076
Copy link
Contributor

@Miciah Miciah Feb 13, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you add test cases with different values for annotation and pattern?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I had a test case with 2 routes one of which didn't match the pattern (One route timeout doesn't match pattern). I added another one when no route matched the pattern (No route timeout matches pattern). Let me know if this doesn't addresses your comment.

values []string
expected string
}{
{
name: "Route timeout is maximum",
state: map[ServiceAliasConfigKey]ServiceAliasConfig{
"test:route1": {
Annotations: map[string]string{
"haproxy.router.openshift.io/timeout": "5m",
},
},
"test:route2": {
Annotations: map[string]string{
"haproxy.router.openshift.io/timeout": "35s",
},
},
"test:route3": {
Annotations: map[string]string{
"haproxy.router.openshift.io/timeout-tunnel": "10m",
},
},
"test:route4": {},
},
annotation: "haproxy.router.openshift.io/timeout",
pattern: `[1-9][0-9]*(us|ms|s|m|h|d)?`,
values: []string{"30s"},
expected: "5m",
},
{
name: "Default timeout is maximum",
state: map[ServiceAliasConfigKey]ServiceAliasConfig{
"test:route1": {},
"test:route2": {},
"test:route3": {},
},
annotation: "haproxy.router.openshift.io/timeout",
pattern: `[1-9][0-9]*(us|ms|s|m|h|d)?`,
values: []string{"30s"},
expected: "30s",
},
Comment on lines +1106 to +1116
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should at least one of the routes specify a value? It makes sense to me to have a test case for "some routes specify timeouts, but the default timeout is maximum" .

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

By the way, what happens if you have 0 routes?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should at least one of the routes specify a value? It makes sense to me to have a test case for "some routes specify timeouts, but the default timeout is maximum" .

The goal of maxTimeoutFirstMatchedAndClipped is to get the maximum timeout among the ones set on the lowest layer backends, rather than the maximum of all values provided to the function. For instance, if a route has a timeout annotation set, it will take precedence over the default timeout, even if the default timeout is greater. I updated the godoc for the function to highlight this.

By the way, what happens if you have 0 routes?

Unit test case added to cover this case (Empty state).

{
name: "First default timeout is choosen",
state: map[ServiceAliasConfigKey]ServiceAliasConfig{
"test:route1": {},
"test:route2": {},
"test:route3": {},
},
annotation: "haproxy.router.openshift.io/timeout",
pattern: `[1-9][0-9]*(us|ms|s|m|h|d)?`,
values: []string{"30s", "40s"},
expected: "30s",
},
{
name: "One route timeout doesn't match pattern",
state: map[ServiceAliasConfigKey]ServiceAliasConfig{
"test:route1": {
Annotations: map[string]string{
"haproxy.router.openshift.io/timeout": "5minutes",
},
},
"test:route2": {
Annotations: map[string]string{
"haproxy.router.openshift.io/timeout": "35s",
},
},
},
annotation: "haproxy.router.openshift.io/timeout",
pattern: `[1-9][0-9]*(us|ms|s|m|h|d)?`,
values: []string{"30s"},
expected: "35s",
},
{
name: "No route timeout matches pattern",
state: map[ServiceAliasConfigKey]ServiceAliasConfig{
"test:route1": {
Annotations: map[string]string{
"haproxy.router.openshift.io/timeout": "5minutes",
},
},
"test:route2": {
Annotations: map[string]string{
"haproxy.router.openshift.io/timeout": "35seconds",
},
},
},
annotation: "haproxy.router.openshift.io/timeout",
pattern: `[1-9][0-9]*(us|ms|s|m|h|d)?`,
values: []string{"30s"},
expected: "30s",
},
{
name: "Route timeout clipped",
state: map[ServiceAliasConfigKey]ServiceAliasConfig{
"test:route1": {
Annotations: map[string]string{
"haproxy.router.openshift.io/timeout": "999999999s",
},
},
},
annotation: "haproxy.router.openshift.io/timeout",
pattern: `[1-9][0-9]*(us|ms|s|m|h|d)?`,
values: []string{"30s"},
expected: "2147483647ms",
},
alebedev87 marked this conversation as resolved.
Show resolved Hide resolved
{
name: "Default timeout overflows but route takes precedence",
state: map[ServiceAliasConfigKey]ServiceAliasConfig{
"test:route1": {
Annotations: map[string]string{
"haproxy.router.openshift.io/timeout": "30s",
},
},
},
annotation: "haproxy.router.openshift.io/timeout",
pattern: `[1-9][0-9]*(us|ms|s|m|h|d)?`,
values: []string{"999999999s"},
expected: "30s",
},
{
name: "Empty state",
annotation: "haproxy.router.openshift.io/timeout",
pattern: `[1-9][0-9]*(us|ms|s|m|h|d)?`,
values: []string{"30s"},
expected: "30s",
},
{
name: "Empty state and values",
annotation: "haproxy.router.openshift.io/timeout",
pattern: `[1-9][0-9]*(us|ms|s|m|h|d)?`,
expected: "2147483647ms",
},
}
for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
got := maxTimeoutFirstMatchedAndClipped(tc.state, tc.annotation, tc.pattern, tc.values...)
if got != tc.expected {
t.Errorf("Failure: expected %q, got %q", tc.expected, got)
}
})
}
}

func Benchmark_maxTimeoutFirstMatchedAndClipped(b *testing.B) {
testCases := []struct {
name string
state map[ServiceAliasConfigKey]ServiceAliasConfig
annotation string
pattern string
values []string
}{
{
name: "Input1",
state: map[ServiceAliasConfigKey]ServiceAliasConfig{
"test:route1": {
Annotations: map[string]string{
"haproxy.router.openshift.io/timeout": "5m",
},
},
},
annotation: "haproxy.router.openshift.io/timeout",
pattern: `[1-9][0-9]*(us|ms|s|m|h|d)?`,
values: []string{"30s"},
},
{
name: "Input5",
state: map[ServiceAliasConfigKey]ServiceAliasConfig{
"test:route1": {
Annotations: map[string]string{
"haproxy.router.openshift.io/timeout": "5m",
},
},
"test:route2": {
Annotations: map[string]string{
"haproxy.router.openshift.io/timeout": "35s",
},
},
"test:route3": {
Annotations: map[string]string{
"haproxy.router.openshift.io/timeout-tunnel": "35s",
},
},
"test:route4": {
Annotations: map[string]string{
"haproxy.router.openshift.io/timeout-tunnel": "10m",
},
},
"test:route5": {},
},
annotation: "haproxy.router.openshift.io/timeout",
pattern: `[1-9][0-9]*(us|ms|s|m|h|d)?`,
// valid value is the last one to force all the iterations
values: []string{"", "", "", "", "30s"},
},
}
for _, tc := range testCases {
b.ResetTimer()
b.Run(tc.name, func(b *testing.B) {
for n := 0; n < b.N; n++ {
maxTimeoutFirstMatchedAndClipped(tc.state, tc.annotation, tc.pattern, tc.values...)
}
})
}
}