From 19147f518ead57a89ac4e544ccbded54521d19dc Mon Sep 17 00:00:00 2001 From: Maycon Santos Date: Wed, 17 Jul 2024 23:48:37 +0200 Subject: [PATCH] Add faster availability DNS probe and update test domain to .com (#2280) * Add faster availability DNS probe and update test domain to .com - Count successful queries and check that count before running the after-network-map probes. - Reduce the first DNS probe timeout to 500ms - Update the test domain to com instead of . due to Palo Alto DNS proxy server issues * use fqdn * Update client/internal/dns/upstream.go Co-authored-by: Viktor Liu <17948409+lixmal@users.noreply.github.com> --------- Co-authored-by: Viktor Liu <17948409+lixmal@users.noreply.github.com> --- client/internal/dns/upstream.go | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/client/internal/dns/upstream.go b/client/internal/dns/upstream.go index b502bf5eb1b..b3baf2fa8fd 100644 --- a/client/internal/dns/upstream.go +++ b/client/internal/dns/upstream.go @@ -24,7 +24,7 @@ const ( probeTimeout = 2 * time.Second ) -const testRecord = "." +const testRecord = "com." 
type upstreamClient interface { exchange(ctx context.Context, upstream string, r *dns.Msg) (*dns.Msg, time.Duration, error) @@ -42,6 +42,7 @@ type upstreamResolverBase struct { upstreamServers []string disabled bool failsCount atomic.Int32 + successCount atomic.Int32 failsTillDeact int32 mutex sync.Mutex reactivatePeriod time.Duration @@ -124,6 +125,7 @@ func (u *upstreamResolverBase) ServeDNS(w dns.ResponseWriter, r *dns.Msg) { return } + u.successCount.Add(1) log.Tracef("took %s to query the upstream %s", t, upstream) err = w.WriteMsg(rm) @@ -172,6 +174,11 @@ func (u *upstreamResolverBase) probeAvailability() { default: } + // avoid probe if upstreams could resolve at least one query and fails count is less than failsTillDeact + if u.successCount.Load() > 0 && u.failsCount.Load() < u.failsTillDeact { + return + } + var success bool var mu sync.Mutex var wg sync.WaitGroup @@ -183,7 +190,7 @@ func (u *upstreamResolverBase) probeAvailability() { wg.Add(1) go func() { defer wg.Done() - err := u.testNameserver(upstream) + err := u.testNameserver(upstream, 500*time.Millisecond) if err != nil { errors = multierror.Append(errors, err) log.Warnf("probing upstream nameserver %s: %s", upstream, err) @@ -224,7 +231,7 @@ func (u *upstreamResolverBase) waitUntilResponse() { } for _, upstream := range u.upstreamServers { - if err := u.testNameserver(upstream); err != nil { + if err := u.testNameserver(upstream, probeTimeout); err != nil { log.Tracef("upstream check for %s: %s", upstream, err) } else { // at least one upstream server is available, stop probing @@ -244,6 +251,7 @@ func (u *upstreamResolverBase) waitUntilResponse() { log.Infof("upstreams %s are responsive again. 
Adding them back to system", u.upstreamServers) u.failsCount.Store(0) + u.successCount.Add(1) u.reactivate() u.disabled = false } @@ -265,13 +273,14 @@ func (u *upstreamResolverBase) disable(err error) { } log.Warnf("Upstream resolving is Disabled for %v", reactivatePeriod) + u.successCount.Store(0) u.deactivate(err) u.disabled = true go u.waitUntilResponse() } -func (u *upstreamResolverBase) testNameserver(server string) error { - ctx, cancel := context.WithTimeout(u.ctx, probeTimeout) +func (u *upstreamResolverBase) testNameserver(server string, timeout time.Duration) error { + ctx, cancel := context.WithTimeout(u.ctx, timeout) defer cancel() r := new(dns.Msg).SetQuestion(testRecord, dns.TypeSOA)