From 61a567e73e3f515c93fa2eb33345416d6ac54a14 Mon Sep 17 00:00:00 2001 From: Brad Fitzpatrick Date: Mon, 29 Apr 2019 18:18:51 +0000 Subject: [PATCH] cmd/coordinator: only bound old revdial builds Revert CL 173517 and replace with a similar but different mechanism. Now that the new revdial is out, only penalize the old revdial users (a few stragglers who haven't updated). In practice the limit of 10 at once won't be a problem but will protect the coordinator during submit floods. Fixes golang/go#31639 Change-Id: I6b6c3567205fdd98e0b80def96d75827e986fe4f Reviewed-on: https://go-review.googlesource.com/c/build/+/174325 Reviewed-by: Dmitri Shuralyov --- buildenv/envs.go | 7 ---- cmd/coordinator/coordinator.go | 15 --------- cmd/coordinator/reverse.go | 60 ++++++++++++++++++++++++++-------- 3 files changed, 47 insertions(+), 35 deletions(-) diff --git a/buildenv/envs.go b/buildenv/envs.go index 98a2fd3afe..7199b2f309 100644 --- a/buildenv/envs.go +++ b/buildenv/envs.go @@ -127,12 +127,6 @@ type Environment struct { // in a development or staging environment. MaxBuilds int - // MaxReverseBuilds is the maximum number of concurrent builds - // that can be run for reverse buildlets. Zero means unlimited. - // This is being used to mitigate revdial memory consumption - // problems in issue 31639. - MaxReverseBuilds int - // AutoCertCacheBucket is the GCS bucket to use for the // golang.org/x/crypto/acme/autocert (LetsEncrypt) cache. // If empty, LetsEncrypt isn't used. @@ -291,7 +285,6 @@ var Production = &Environment{ LogBucket: "go-build-log", SnapBucket: "go-build-snap", AutoCertCacheBucket: "farmer-golang-org-autocert-cache", - MaxReverseBuilds: 10, } var Development = &Environment{ diff --git a/cmd/coordinator/coordinator.go b/cmd/coordinator/coordinator.go index 6c310d38b6..5a9a89559b 100644 --- a/cmd/coordinator/coordinator.go +++ b/cmd/coordinator/coordinator.go @@ -448,17 +448,6 @@ func numCurrentBuildsOfType(typ string) (n int) { return } -func numCurrentReverseBuilds() (n int) { - statusMu.Lock() - defer statusMu.Unlock() - for _, bs := range status { - if bs.conf.IsReverse() { - n++ - } - } - return -} - func isBuilding(work buildgo.BuilderRev) bool { statusMu.Lock() defer statusMu.Unlock() @@ -488,10 +477,6 @@ func mayBuildRev(rev buildgo.BuilderRev) bool { if buildEnv.MaxBuilds > 0 && numCurrentBuilds() >= buildEnv.MaxBuilds { return false } - if buildEnv.MaxReverseBuilds > 0 && buildConf.IsReverse() && numCurrentReverseBuilds() >= buildEnv.MaxReverseBuilds { - // Issue 31639. - return false - } if buildConf.MaxAtOnce > 0 && numCurrentBuildsOfType(rev.Name) >= buildConf.MaxAtOnce { return false } diff --git a/cmd/coordinator/reverse.go b/cmd/coordinator/reverse.go index f289b5575d..beba59a00e 100644 --- a/cmd/coordinator/reverse.go +++ b/cmd/coordinator/reverse.go @@ -53,16 +53,31 @@ import ( const minBuildletVersion = 1 -var reversePool = new(reverseBuildletPool) +var reversePool = &reverseBuildletPool{ + oldInUse: make(map[*buildlet.Client]bool), +} + +const maxOldRevdialUsers = 10 type token struct{} type reverseBuildletPool struct { - mu sync.Mutex // guards all fields, including fields of *reverseBuildlet + // mu guards all 4 fields below and also fields of + // *reverseBuildlet in buildlets + mu sync.Mutex + + // buildlets are the currently connected buildlets. // TODO: switch to a map[hostType][]buildlets or map of set. buildlets []*reverseBuildlet - wakeChan map[string]chan token // hostType => best-effort wake-up chan when buildlet free - waiters map[string]int // hostType => number waiters blocked in GetBuildlet + + wakeChan map[string]chan token // hostType => best-effort wake-up chan when buildlet free + + waiters map[string]int // hostType => number waiters blocked in GetBuildlet + + // oldInUse tracks which buildlets with the old revdial code are currently in use. + // These are a liability due to runaway memory issues (Issue 31639) so + // we bound how many can be running at once. Fortunately there aren't many left. + oldInUse map[*buildlet.Client]bool } func (p *reverseBuildletPool) ServeReverseStatusJSON(w http.ResponseWriter, r *http.Request) { @@ -126,9 +141,15 @@ func (p *reverseBuildletPool) tryToGrab(hostType string) (bc *buildlet.Client, b busy++ continue } + if b.isOldRevDial && len(p.oldInUse) >= maxOldRevdialUsers { + continue + } // Found an unused match. b.inUse = true b.inUseTime = time.Now() + if b.isOldRevDial { + p.oldInUse[b.client] = true + } return b.client, 0 } return nil, busy @@ -162,6 +183,7 @@ func (p *reverseBuildletPool) noteBuildletAvailable(hostType string) { func (p *reverseBuildletPool) nukeBuildlet(victim *buildlet.Client) { p.mu.Lock() defer p.mu.Unlock() + delete(p.oldInUse, victim) for i, rb := range p.buildlets { if rb.client == victim { defer rb.conn.Close() @@ -345,10 +367,12 @@ func (p *reverseBuildletPool) WriteHTMLStatus(w io.Writer) { p.mu.Lock() buildlets := append([]*reverseBuildlet(nil), p.buildlets...) sort.Sort(byTypeThenHostname(buildlets)) + numInUse := 0 for _, b := range buildlets { machStatus := "idle" if b.inUse { machStatus = "working" + numInUse++ } fmt.Fprintf(&buf, "
  • %s (%s) version %s, %s: connected %s, %s for %s
  • \n", b.hostname, @@ -363,6 +387,8 @@ func (p *reverseBuildletPool) WriteHTMLStatus(w io.Writer) { inUse[b.hostType]++ } } + numOldInUse := len(p.oldInUse) + numConnected := len(buildlets) p.mu.Unlock() var typs []string @@ -371,7 +397,13 @@ func (p *reverseBuildletPool) WriteHTMLStatus(w io.Writer) { } sort.Strings(typs) - io.WriteString(w, "Reverse pool summary (in use / total)