From cc11d9a56357f2bded20fa7dd36475e2807254d8 Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Wed, 10 May 2017 17:39:45 -0700 Subject: [PATCH 1/6] Add new gc_max_allocs tuneable More than gc_max_allocs may be running on a node, but terminal allocs will be garbage collected to try to keep the total number below the limit. --- client/client.go | 19 ++- client/config/config.go | 5 + client/gc.go | 94 ++++++++--- client/gc_test.go | 152 ++++++++++++++---- client/task_runner_test.go | 5 +- command/agent/agent.go | 1 + command/agent/config-test-fixtures/basic.hcl | 1 + command/agent/config.go | 15 +- command/agent/config_parse.go | 1 + .../docs/agent/configuration/client.html.md | 6 + 10 files changed, 234 insertions(+), 65 deletions(-) diff --git a/client/client.go b/client/client.go index 59460bf1199..c9beebae4ac 100644 --- a/client/client.go +++ b/client/client.go @@ -240,13 +240,15 @@ func NewClient(cfg *config.Config, consulCatalog consul.CatalogAPI, consulServic // Add the garbage collector gcConfig := &GCConfig{ + MaxAllocs: cfg.GCMaxAllocs, DiskUsageThreshold: cfg.GCDiskUsageThreshold, InodeUsageThreshold: cfg.GCInodeUsageThreshold, Interval: cfg.GCInterval, ParallelDestroys: cfg.GCParallelDestroys, ReservedDiskMB: cfg.Node.Reserved.DiskMB, } - c.garbageCollector = NewAllocGarbageCollector(logger, statsCollector, gcConfig) + c.garbageCollector = NewAllocGarbageCollector(logger, statsCollector, c, gcConfig) + go c.garbageCollector.Run() // Setup the node if err := c.setupNode(); err != nil { @@ -482,17 +484,13 @@ func (c *Client) RPC(method string, args interface{}, reply interface{}) error { // Stats is used to return statistics for debugging and insight // for various sub-systems func (c *Client) Stats() map[string]map[string]string { - c.allocLock.RLock() - numAllocs := len(c.allocs) - c.allocLock.RUnlock() - c.heartbeatLock.Lock() defer c.heartbeatLock.Unlock() stats := map[string]map[string]string{ "client": map[string]string{ "node_id": c.Node().ID, "known_servers": c.servers.all().String(), - "num_allocations": strconv.Itoa(numAllocs), + "num_allocations": strconv.Itoa(c.NumAllocs()), "last_heartbeat": fmt.Sprintf("%v", time.Since(c.lastHeartbeat)), "heartbeat_ttl": fmt.Sprintf("%v", c.heartbeatTTL), }, @@ -722,6 +720,15 @@ func (c *Client) getAllocRunners() map[string]*AllocRunner { return runners } +// NumAllocs returns the number of allocs this client has. Used to +// fulfill the AllocCounter interface for the GC. +func (c *Client) NumAllocs() int { + c.allocLock.RLock() + n := len(c.allocs) + c.allocLock.RUnlock() + return n +} + // nodeID restores, or generates if necessary, a unique node ID and SecretID. // The node ID is, if available, a persistent unique ID. The secret ID is a // high-entropy random UUID. diff --git a/client/config/config.go b/client/config/config.go index 24e115f35db..8d137948b90 100644 --- a/client/config/config.go +++ b/client/config/config.go @@ -171,6 +171,10 @@ type Config struct { // beyond which the Nomad client triggers GC of the terminal allocations GCInodeUsageThreshold float64 + // GCMaxAllocs is the maximum number of allocations a node can have + // before garbage collection is triggered. 
+	GCMaxAllocs int
+
 	// LogLevel is the level of the logs to put out
 	LogLevel string
 
@@ -205,6 +209,7 @@ func DefaultConfig() *Config {
 		GCParallelDestroys:    2,
 		GCDiskUsageThreshold:  80,
 		GCInodeUsageThreshold: 70,
+		GCMaxAllocs:           200,
 	}
 }
 
diff --git a/client/gc.go b/client/gc.go
index a07db1415a8..d6b58d77782 100644
--- a/client/gc.go
+++ b/client/gc.go
@@ -18,6 +18,9 @@ const (
 
 // GCConfig allows changing the behaviour of the garbage collector
 type GCConfig struct {
+	// MaxAllocs is the maximum number of allocations to track before a GC
+	// is triggered.
+	MaxAllocs           int
 	DiskUsageThreshold  float64
 	InodeUsageThreshold float64
 	Interval            time.Duration
@@ -25,10 +28,17 @@ type GCConfig struct {
 	ParallelDestroys    int
 }
 
+// AllocCounter is used by AllocGarbageCollector to discover how many
+// allocations a node has and is generally fulfilled by the Client.
+type AllocCounter interface {
+	NumAllocs() int
+}
+
 // AllocGarbageCollector garbage collects terminated allocations on a node
 type AllocGarbageCollector struct {
 	allocRunners   *IndexedGCAllocPQ
 	statsCollector stats.NodeStatsCollector
+	allocCounter   AllocCounter
 	config         *GCConfig
 	logger         *log.Logger
 	destroyCh      chan struct{}
@@ -36,8 +46,9 @@ type AllocGarbageCollector struct {
 }
 
 // NewAllocGarbageCollector returns a garbage collector for terminated
-// allocations on a node.
-func NewAllocGarbageCollector(logger *log.Logger, statsCollector stats.NodeStatsCollector, config *GCConfig) *AllocGarbageCollector {
+// allocations on a node. Must call Run() in a goroutine to enable periodic
+// garbage collection.
+func NewAllocGarbageCollector(logger *log.Logger, statsCollector stats.NodeStatsCollector, ac AllocCounter, config *GCConfig) *AllocGarbageCollector {
 	// Require at least 1 to make progress
 	if config.ParallelDestroys <= 0 {
 		logger.Printf("[WARN] client: garbage collector defaulting parallelism to 1 due to invalid input value of %d", config.ParallelDestroys)
@@ -47,17 +58,18 @@ func NewAllocGarbageCollector(logger *log.Logger, statsCollector stats.NodeStats
 	gc := &AllocGarbageCollector{
 		allocRunners:   NewIndexedGCAllocPQ(),
 		statsCollector: statsCollector,
+		allocCounter:   ac,
 		config:         config,
 		logger:         logger,
 		destroyCh:      make(chan struct{}, config.ParallelDestroys),
 		shutdownCh:     make(chan struct{}),
 	}
-	go gc.run()
 
 	return gc
 }
 
-func (a *AllocGarbageCollector) run() {
+// Run the periodic garbage collector.
+func (a *AllocGarbageCollector) Run() {
 	ticker := time.NewTicker(a.config.Interval)
 	for {
 		select {
@@ -100,23 +112,33 @@ func (a *AllocGarbageCollector) keepUsageBelowThreshold() error {
 			break
 		}
 
-		if diskStats.UsedPercent <= a.config.DiskUsageThreshold &&
-			diskStats.InodesUsedPercent <= a.config.InodeUsageThreshold {
+		reason := ""
+
+		switch {
+		case diskStats.UsedPercent > a.config.DiskUsageThreshold:
+			reason = fmt.Sprintf("disk usage of %.0f is over gc threshold of %.0f",
+				diskStats.UsedPercent, a.config.DiskUsageThreshold)
+		case diskStats.InodesUsedPercent > a.config.InodeUsageThreshold:
+			reason = fmt.Sprintf("inode usage of %.0f is over gc threshold of %.0f",
+				diskStats.InodesUsedPercent, a.config.InodeUsageThreshold)
+		case a.numAllocs() > a.config.MaxAllocs:
+			reason = fmt.Sprintf("number of allocations is over the limit (%d)", a.config.MaxAllocs)
+		}
+
+		// No reason to gc, exit
+		if reason == "" {
 			break
 		}
 
 		// Collect an allocation
 		gcAlloc := a.allocRunners.Pop()
 		if gcAlloc == nil {
+			a.logger.Printf("[WARN] client: garbage collection due to %s skipped because there are no terminal allocations", reason)
 			break
 		}
 
-		ar := gcAlloc.allocRunner
-		alloc := ar.Alloc()
-		a.logger.Printf("[INFO] client: garbage collecting allocation %v", alloc.ID)
-
 		// Destroy the alloc runner and wait until it exits
-		a.destroyAllocRunner(ar)
+		a.destroyAllocRunner(gcAlloc.allocRunner, reason)
 	}
 	return nil
 }
@@ -124,7 +146,13 @@ func (a *AllocGarbageCollector) keepUsageBelowThreshold() error {
 // destroyAllocRunner is used to destroy an allocation runner. It will acquire a
 // lock to restrict parallelism and then destroy the alloc runner, returning
 // once the allocation has been destroyed.
-func (a *AllocGarbageCollector) destroyAllocRunner(ar *AllocRunner) {
+func (a *AllocGarbageCollector) destroyAllocRunner(ar *AllocRunner, reason string) {
+	id := ""
+	if alloc := ar.Alloc(); alloc != nil {
+		id = alloc.ID
+	}
+	a.logger.Printf("[INFO] client: garbage collecting allocation %s due to %s", id, reason)
+
 	// Acquire the destroy lock
 	select {
 	case <-a.shutdownCh:
@@ -155,11 +183,7 @@ func (a *AllocGarbageCollector) Collect(allocID string) error {
 	if err != nil {
 		return fmt.Errorf("unable to collect allocation %q: %v", allocID, err)
 	}
-
-	ar := gcAlloc.allocRunner
-	a.logger.Printf("[INFO] client: garbage collecting allocation %q", ar.Alloc().ID)
-
-	a.destroyAllocRunner(ar)
+	a.destroyAllocRunner(gcAlloc.allocRunner, "forced collection")
 	return nil
 }
@@ -177,9 +201,7 @@ func (a *AllocGarbageCollector) CollectAll() error {
 			break
 		}
 
-		ar := gcAlloc.allocRunner
-		a.logger.Printf("[INFO] client: garbage collecting alloc runner for alloc %q", ar.Alloc().ID)
-		go a.destroyAllocRunner(ar)
+		go a.destroyAllocRunner(gcAlloc.allocRunner, "forced full collection")
 	}
 	return nil
 }
@@ -187,6 +209,26 @@ func (a *AllocGarbageCollector) CollectAll() error {
 // MakeRoomFor garbage collects enough allocations in the terminal
 // state to make room for new allocations
 func (a *AllocGarbageCollector) MakeRoomFor(allocations []*structs.Allocation) error {
+	// GC terminal allocs until the total, including the new allocations, is below the max limit
+	max := a.config.MaxAllocs - len(allocations)
+	for a.numAllocs() > max {
+		select {
+		case <-a.shutdownCh:
+			return nil
+		default:
+		}
+
+		gcAlloc := a.allocRunners.Pop()
+		if gcAlloc == nil {
+			// It's fine if we can't get below the limit here as
+			// we'll keep trying to drop below the limit with each
+			// periodic gc
+			break
+		}
+
+		// Destroy the alloc runner and wait until it exits
+		a.destroyAllocRunner(gcAlloc.allocRunner, "new allocations")
+	}
+
 	totalResource := &structs.Resources{}
 	for _, alloc := range allocations {
 		if err := totalResource.Add(alloc.Resources); err != nil {
@@ -244,10 +286,9 @@ func (a *AllocGarbageCollector) MakeRoomFor(allocations []*structs.Allocation) e
 
 		ar := gcAlloc.allocRunner
 		alloc := ar.Alloc()
-		a.logger.Printf("[INFO] client: garbage collecting allocation %v", alloc.ID)
 
 		// Destroy the alloc runner and wait until it exits
-		a.destroyAllocRunner(ar)
+		a.destroyAllocRunner(ar, fmt.Sprintf("freeing %d MB for new allocations", alloc.Resources.DiskMB))
 
 		// Call stats collect again
 		diskCleared += alloc.Resources.DiskMB
@@ -261,8 +302,8 @@ func (a *AllocGarbageCollector) MarkForCollection(ar *AllocRunner) error {
 		return fmt.Errorf("nil allocation runner inserted for garbage collection")
 	}
 	if ar.Alloc() == nil {
-		a.logger.Printf("[INFO] client: alloc is nil, so garbage collecting")
-		a.destroyAllocRunner(ar)
+		a.destroyAllocRunner(ar, "alloc is nil")
+		return nil
 	}
 
 	a.logger.Printf("[INFO] client: marking allocation %v for GC", ar.Alloc().ID)
@@ -281,6 +321,12 @@ func (a *AllocGarbageCollector) Remove(ar *AllocRunner) {
 	}
 }
 
+// numAllocs returns the total number of allocs tracked by the client as well
+// as those marked for GC.
+func (a *AllocGarbageCollector) numAllocs() int {
+	return a.allocRunners.Length() + a.allocCounter.NumAllocs()
+}
+
 // GCAlloc wraps an allocation runner and an index enabling it to be used within
 // a PQ
 type GCAlloc struct {
diff --git a/client/gc_test.go b/client/gc_test.go
index f4fdedfbfcc..e132a8e9f8b 100644
--- a/client/gc_test.go
+++ b/client/gc_test.go
@@ -1,8 +1,6 @@
 package client
 
 import (
-	"log"
-	"os"
 	"testing"
 	"time"
 
@@ -11,11 +9,14 @@ import (
 	"github.com/hashicorp/nomad/nomad/structs"
 )
 
-var gcConfig = GCConfig{
-	DiskUsageThreshold:  80,
-	InodeUsageThreshold: 70,
-	Interval:            1 * time.Minute,
-	ReservedDiskMB:      0,
+func gcConfig() *GCConfig {
+	return &GCConfig{
+		DiskUsageThreshold:  80,
+		InodeUsageThreshold: 70,
+		Interval:            1 * time.Minute,
+		ReservedDiskMB:      0,
+		MaxAllocs:           100,
+	}
 }
 
 func TestIndexedGCAllocPQ(t *testing.T) {
@@ -57,6 +58,15 @@ func TestIndexedGCAllocPQ(t *testing.T) {
 	}
 }
 
+// MockAllocCounter implements the AllocCounter interface.
+type MockAllocCounter struct {
+	allocs int
+}
+
+func (m *MockAllocCounter) NumAllocs() int {
+	return m.allocs
+}
+
 type MockStatsCollector struct {
 	availableValues []uint64
 	usedPercents    []float64
@@ -90,8 +100,8 @@ func (m *MockStatsCollector) Stats() *stats.HostStats {
 }
 
 func TestAllocGarbageCollector_MarkForCollection(t *testing.T) {
-	logger := log.New(os.Stdout, "", 0)
-	gc := NewAllocGarbageCollector(logger, &MockStatsCollector{}, &gcConfig)
+	logger := testLogger()
+	gc := NewAllocGarbageCollector(logger, &MockStatsCollector{}, &MockAllocCounter{}, gcConfig())
 
 	_, ar1 := testAllocRunnerFromAlloc(mock.Alloc(), false)
 	if err := gc.MarkForCollection(ar1); err != nil {
@@ -105,8 +115,8 @@ func TestAllocGarbageCollector_MarkForCollection(t *testing.T) {
 }
 
 func TestAllocGarbageCollector_Collect(t *testing.T) {
-	logger := log.New(os.Stdout, "", 0)
-	gc := NewAllocGarbageCollector(logger, &MockStatsCollector{}, &gcConfig)
+	logger := testLogger()
+	gc := NewAllocGarbageCollector(logger, &MockStatsCollector{}, &MockAllocCounter{}, gcConfig())
 
 	_, ar1 := testAllocRunnerFromAlloc(mock.Alloc(), false)
 	_, ar2 := testAllocRunnerFromAlloc(mock.Alloc(), false)
@@ -131,8 +141,8 @@ func TestAllocGarbageCollector_Collect(t *testing.T) {
 }
 
 func TestAllocGarbageCollector_CollectAll(t *testing.T) {
-	logger := log.New(os.Stdout, "", 0)
-	gc := NewAllocGarbageCollector(logger, &MockStatsCollector{}, &gcConfig)
+	logger := testLogger()
+	gc := NewAllocGarbageCollector(logger, &MockStatsCollector{}, &MockAllocCounter{}, gcConfig())
 
 	_, ar1 := testAllocRunnerFromAlloc(mock.Alloc(), false)
 	_, ar2 := testAllocRunnerFromAlloc(mock.Alloc(), false)
@@ -153,10 +163,11 @@ func TestAllocGarbageCollector_CollectAll(t *testing.T) {
 }
 
 func TestAllocGarbageCollector_MakeRoomForAllocations_EnoughSpace(t *testing.T) {
-	logger := log.New(os.Stdout, "", 0)
+	logger := testLogger()
 	statsCollector := &MockStatsCollector{}
-	gcConfig.ReservedDiskMB = 20
-	gc := NewAllocGarbageCollector(logger, statsCollector, &gcConfig)
+	conf := gcConfig()
+	conf.ReservedDiskMB = 20
+	gc := NewAllocGarbageCollector(logger, statsCollector, &MockAllocCounter{}, conf)
 
 	_, ar1 := testAllocRunnerFromAlloc(mock.Alloc(), false)
 	close(ar1.waitCh)
@@ -190,10 +201,11 @@ func TestAllocGarbageCollector_MakeRoomForAllocations_EnoughSpace(t *testing.T)
 }
 
 func TestAllocGarbageCollector_MakeRoomForAllocations_GC_Partial(t *testing.T) {
-	logger := log.New(os.Stdout, "", 0)
+	logger := testLogger()
 	statsCollector := &MockStatsCollector{}
-	gcConfig.ReservedDiskMB = 20
-	gc := NewAllocGarbageCollector(logger, statsCollector, &gcConfig)
+	conf := gcConfig()
+	conf.ReservedDiskMB = 20
+	gc := NewAllocGarbageCollector(logger, statsCollector, &MockAllocCounter{}, conf)
 
 	_, ar1 := testAllocRunnerFromAlloc(mock.Alloc(), false)
 	close(ar1.waitCh)
@@ -228,10 +240,11 @@ func TestAllocGarbageCollector_MakeRoomForAllocations_GC_Partial(t *testing.T) {
 }
 
 func TestAllocGarbageCollector_MakeRoomForAllocations_GC_All(t *testing.T) {
-	logger := log.New(os.Stdout, "", 0)
+	logger := testLogger()
 	statsCollector := &MockStatsCollector{}
-	gcConfig.ReservedDiskMB = 20
-	gc := NewAllocGarbageCollector(logger, statsCollector, &gcConfig)
+	conf := gcConfig()
+	conf.ReservedDiskMB = 20
+	gc := NewAllocGarbageCollector(logger, statsCollector, &MockAllocCounter{}, conf)
 
 	_, ar1 := testAllocRunnerFromAlloc(mock.Alloc(), false)
 	close(ar1.waitCh)
@@ -262,10 +275,11 @@ func TestAllocGarbageCollector_MakeRoomForAllocations_GC_All(t *testing.T) {
 }
 
 func TestAllocGarbageCollector_MakeRoomForAllocations_GC_Fallback(t *testing.T) {
-	logger := log.New(os.Stdout, "", 0)
+	logger := testLogger()
 	statsCollector := &MockStatsCollector{}
-	gcConfig.ReservedDiskMB = 20
-	gc := NewAllocGarbageCollector(logger, statsCollector, &gcConfig)
+	conf := gcConfig()
+	conf.ReservedDiskMB = 20
+	gc := NewAllocGarbageCollector(logger, statsCollector, &MockAllocCounter{}, conf)
 
 	_, ar1 := testAllocRunnerFromAlloc(mock.Alloc(), false)
 	close(ar1.waitCh)
@@ -294,11 +308,49 @@ func TestAllocGarbageCollector_MakeRoomForAllocations_GC_Fallback(t *testing.T)
 	}
 }
 
+func TestAllocGarbageCollector_MakeRoomForAllocations_MaxAllocs(t *testing.T) {
+	const (
+		liveAllocs   = 3
+		maxAllocs    = 6
+		gcAllocs     = 4
+		gcAllocsLeft = 1
+	)
+
+	logger := testLogger()
+	statsCollector := &MockStatsCollector{
+		availableValues: []uint64{10 * 1024 * MB},
+		usedPercents:    []float64{0},
+		inodePercents:   []float64{0},
+	}
+	allocCounter := &MockAllocCounter{allocs: liveAllocs}
+	conf := gcConfig()
+	conf.MaxAllocs = maxAllocs
+	gc := NewAllocGarbageCollector(logger, statsCollector, allocCounter, conf)
+
+	for i := 0; i < gcAllocs; i++ {
+		_, ar := testAllocRunnerFromAlloc(mock.Alloc(), false)
+		close(ar.waitCh)
+		if err := gc.MarkForCollection(ar); err != nil {
+			t.Fatalf("error marking alloc for gc: %v", err)
+		}
+	}
+
+	if err := gc.MakeRoomFor([]*structs.Allocation{mock.Alloc(), mock.Alloc()}); err != nil {
+		t.Fatalf("error making room for 2 new allocs: %v", err)
+	}
+
+	// There should be gcAllocsLeft alloc runners left to be collected
+	if n := len(gc.allocRunners.index); n != gcAllocsLeft {
+		t.Fatalf("expected %d remaining GC-able alloc runners but found %d", gcAllocsLeft, n)
+	}
+}
+
 func TestAllocGarbageCollector_UsageBelowThreshold(t *testing.T) {
-	logger := log.New(os.Stdout, "", 0)
+	logger := testLogger()
 	statsCollector := &MockStatsCollector{}
-	gcConfig.ReservedDiskMB = 20
-	gc := NewAllocGarbageCollector(logger, statsCollector, &gcConfig)
+	conf := gcConfig()
+	conf.ReservedDiskMB = 20
+	gc := NewAllocGarbageCollector(logger, statsCollector, &MockAllocCounter{}, conf)
 
 	_, ar1 := testAllocRunnerFromAlloc(mock.Alloc(), false)
 	close(ar1.waitCh)
@@ -329,10 +381,11 @@ func TestAllocGarbageCollector_UsageBelowThreshold(t *testing.T) {
 }
 
 func TestAllocGarbageCollector_UsedPercentThreshold(t *testing.T) {
-	logger := log.New(os.Stdout, "", 0)
+	logger := testLogger()
 	statsCollector := &MockStatsCollector{}
-	gcConfig.ReservedDiskMB = 20
-	gc := NewAllocGarbageCollector(logger, statsCollector, &gcConfig)
+	conf := gcConfig()
+	conf.ReservedDiskMB = 20
+	gc := NewAllocGarbageCollector(logger, statsCollector, &MockAllocCounter{}, conf)
 
 	_, ar1 := testAllocRunnerFromAlloc(mock.Alloc(), false)
 	close(ar1.waitCh)
@@ -363,3 +416,40 @@ func TestAllocGarbageCollector_UsedPercentThreshold(t *testing.T) {
 		t.Fatalf("gcAlloc: %v", gcAlloc)
 	}
 }
+
+func TestAllocGarbageCollector_MaxAllocsThreshold(t *testing.T) {
+	const (
+		liveAllocs   = 3
+		maxAllocs    = 4
+		gcAllocs     = 4
+		gcAllocsLeft = 1
+	)
+
+	logger := testLogger()
+	statsCollector := &MockStatsCollector{
+		availableValues: []uint64{1000},
+		usedPercents:    []float64{0},
+		inodePercents:   []float64{0},
+	}
+	allocCounter := &MockAllocCounter{allocs: liveAllocs}
+	conf := gcConfig()
+	conf.MaxAllocs = maxAllocs
+	gc := NewAllocGarbageCollector(logger, statsCollector, allocCounter, conf)
+
+	for i := 0; i < gcAllocs; i++ {
+		_, ar := testAllocRunnerFromAlloc(mock.Alloc(), false)
+		close(ar.waitCh)
+		if err := gc.MarkForCollection(ar); err != nil {
+			t.Fatalf("error marking alloc for gc: %v", err)
+		}
+	}
+
+	if err := gc.keepUsageBelowThreshold(); err != nil {
+		t.Fatalf("error gc'ing: %v", err)
+	}
+
+	// We should have gc'd down to MaxAllocs
+	if n := len(gc.allocRunners.index); n != gcAllocsLeft {
+		t.Fatalf("expected remaining gc allocs (%d) to equal %d", n, gcAllocsLeft)
+	}
+}
diff --git a/client/task_runner_test.go b/client/task_runner_test.go
index cfac5ea4256..51117d8d0f7 100644
--- a/client/task_runner_test.go
+++ b/client/task_runner_test.go
@@ -31,7 +31,10 @@ func testLogger() *log.Logger {
 }
 
 func prefixedTestLogger(prefix string) *log.Logger {
-	return log.New(os.Stderr, prefix, log.LstdFlags)
+	if testing.Verbose() {
+		return log.New(os.Stderr, prefix, log.LstdFlags)
+	}
+	return log.New(ioutil.Discard, "", 0)
 }
 
 type MockTaskStateUpdater struct {
diff --git a/command/agent/agent.go b/command/agent/agent.go
index d09ee4147b9..45912712fc3 100644
--- a/command/agent/agent.go
+++ b/command/agent/agent.go
@@ -321,6 +321,7 @@ func (a *Agent) clientConfig() (*clientconfig.Config, error) {
 	conf.GCParallelDestroys = a.config.Client.GCParallelDestroys
 	conf.GCDiskUsageThreshold = a.config.Client.GCDiskUsageThreshold
 	conf.GCInodeUsageThreshold = a.config.Client.GCInodeUsageThreshold
+	conf.GCMaxAllocs = a.config.Client.GCMaxAllocs
 	conf.NoHostUUID = a.config.Client.NoHostUUID
 
 	return conf, nil
diff --git a/command/agent/config-test-fixtures/basic.hcl b/command/agent/config-test-fixtures/basic.hcl
index 8d4880a7d27..d75ca579cfb 100644
--- a/command/agent/config-test-fixtures/basic.hcl
+++ b/command/agent/config-test-fixtures/basic.hcl
@@ -58,6 +58,7 @@ client {
   gc_parallel_destroys = 6
   gc_disk_usage_threshold = 82
   gc_inode_usage_threshold = 91
+  gc_max_allocs = 200
   no_host_uuid = true
 }
 server {
diff --git a/command/agent/config.go b/command/agent/config.go
index cccd095f101..e8ee9117dee 100644
--- a/command/agent/config.go
+++ b/command/agent/config.go
@@ -209,14 +209,18 @@ type ClientConfig struct {
 	// collector will allow.
 	GCParallelDestroys int `mapstructure:"gc_parallel_destroys"`
 
-	// GCInodeUsageThreshold is the inode usage threshold beyond which the Nomad
-	// client triggers GC of the terminal allocations
+	// GCDiskUsageThreshold is the disk usage threshold given as a percent
+	// beyond which the Nomad client triggers GC of terminal allocations
 	GCDiskUsageThreshold float64 `mapstructure:"gc_disk_usage_threshold"`
 
 	// GCInodeUsageThreshold is the inode usage threshold beyond which the Nomad
 	// client triggers GC of the terminal allocations
 	GCInodeUsageThreshold float64 `mapstructure:"gc_inode_usage_threshold"`
 
+	// GCMaxAllocs is the maximum number of allocations a node can have
+	// before garbage collection is triggered.
+	GCMaxAllocs int `mapstructure:"gc_max_allocs"`
+
 	// NoHostUUID disables using the host's UUID and will force generation of a
 	// random UUID.
 	NoHostUUID bool `mapstructure:"no_host_uuid"`
@@ -503,6 +507,7 @@ func DevConfig() *Config {
 	conf.Client.GCInterval = 10 * time.Minute
 	conf.Client.GCDiskUsageThreshold = 99
 	conf.Client.GCInodeUsageThreshold = 99
+	conf.Client.GCMaxAllocs = 200
 	return conf
 }
@@ -532,8 +537,9 @@ func DefaultConfig() *Config {
 			Reserved:              &Resources{},
 			GCInterval:            1 * time.Minute,
 			GCParallelDestroys:    2,
-			GCInodeUsageThreshold: 70,
 			GCDiskUsageThreshold:  80,
+			GCInodeUsageThreshold: 70,
+			GCMaxAllocs:           200,
 		},
 		Server: &ServerConfig{
 			Enabled: false,
@@ -949,6 +955,9 @@ func (a *ClientConfig) Merge(b *ClientConfig) *ClientConfig {
 	if b.GCInodeUsageThreshold != 0 {
 		result.GCInodeUsageThreshold = b.GCInodeUsageThreshold
 	}
+	if b.GCMaxAllocs != 0 {
+		result.GCMaxAllocs = b.GCMaxAllocs
+	}
 	if b.NoHostUUID {
 		result.NoHostUUID = b.NoHostUUID
 	}
diff --git a/command/agent/config_parse.go b/command/agent/config_parse.go
index 403f5b75b5c..1d0f387bf27 100644
--- a/command/agent/config_parse.go
+++ b/command/agent/config_parse.go
@@ -346,6 +346,7 @@ func parseClient(result **ClientConfig, list *ast.ObjectList) error {
 		"gc_disk_usage_threshold",
 		"gc_inode_usage_threshold",
 		"gc_parallel_destroys",
+		"gc_max_allocs",
 		"no_host_uuid",
 	}
 	if err := checkHCLKeys(listVal, valid); err != nil {
diff --git a/website/source/docs/agent/configuration/client.html.md b/website/source/docs/agent/configuration/client.html.md
index 1ce6fe77fbe..9d29da485d5 100644
--- a/website/source/docs/agent/configuration/client.html.md
+++ b/website/source/docs/agent/configuration/client.html.md
@@ -100,6 +100,12 @@ client {
 - `gc_inode_usage_threshold` `(float: 70)` - Specifies the inode usage percent
   which Nomad tries to maintain by garbage collecting terminal allocations.
 
+- `gc_max_allocs` `(int: 200)` - Specifies the maximum number of allocations
+  which a client will track before triggering a garbage collection of terminal
+  allocations. This will *not* limit the number of allocations a node can run
+  at a time; however, once `gc_max_allocs` is exceeded, every new allocation
+  will cause terminal allocations to be GC'd.
+
 - `gc_parallel_destroys` `(int: 2)` - Specifies the maximum number of parallel
   destroys allowed by the garbage collector. This value should be relatively
   low to avoid high resource usage during garbage collections.
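Taken together, patch 1 gives the client stanza the following garbage collection surface. A minimal HCL sketch using the patch-1 defaults (the `enabled` key and the "1m" duration string are illustrative assumptions, not taken from this series):

client {
  enabled                  = true
  gc_interval              = "1m"
  gc_parallel_destroys     = 2
  gc_disk_usage_threshold  = 80
  gc_inode_usage_threshold = 70
  gc_max_allocs            = 200
}

Omitting any of these keys keeps the default shown; patch 2 below lowers the `gc_max_allocs` default from 200 to 50.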
From e3c1d35111876d4d69a78c7877b54d95c6a16f7a Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Fri, 12 May 2017 15:57:27 -0700 Subject: [PATCH 2/6] Lower default gc_max_allocs to 50 --- client/config/config.go | 2 +- command/agent/config-test-fixtures/basic.hcl | 2 +- command/agent/config.go | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/client/config/config.go b/client/config/config.go index 8d137948b90..db4d8233c75 100644 --- a/client/config/config.go +++ b/client/config/config.go @@ -209,7 +209,7 @@ func DefaultConfig() *Config { GCParallelDestroys: 2, GCDiskUsageThreshold: 80, GCInodeUsageThreshold: 70, - GCMaxAllocs: 200, + GCMaxAllocs: 50, } } diff --git a/command/agent/config-test-fixtures/basic.hcl b/command/agent/config-test-fixtures/basic.hcl index d75ca579cfb..006a4340c2c 100644 --- a/command/agent/config-test-fixtures/basic.hcl +++ b/command/agent/config-test-fixtures/basic.hcl @@ -58,7 +58,7 @@ client { gc_parallel_destroys = 6 gc_disk_usage_threshold = 82 gc_inode_usage_threshold = 91 - gc_max_allocs = 200 + gc_max_allocs = 50 no_host_uuid = true } server { diff --git a/command/agent/config.go b/command/agent/config.go index e8ee9117dee..a96202d3dd4 100644 --- a/command/agent/config.go +++ b/command/agent/config.go @@ -507,7 +507,7 @@ func DevConfig() *Config { conf.Client.GCInterval = 10 * time.Minute conf.Client.GCDiskUsageThreshold = 99 conf.Client.GCInodeUsageThreshold = 99 - conf.Client.GCMaxAllocs = 200 + conf.Client.GCMaxAllocs = 50 return conf } @@ -539,7 +539,7 @@ func DefaultConfig() *Config { GCParallelDestroys: 2, GCDiskUsageThreshold: 80, GCInodeUsageThreshold: 70, - GCMaxAllocs: 200, + GCMaxAllocs: 50, }, Server: &ServerConfig{ Enabled: false, From fb72f20bb165a021242522d1d2ccdf8cb389f7a3 Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Fri, 12 May 2017 16:03:22 -0700 Subject: [PATCH 3/6] gc_max_allocs should include blocked & migrating --- client/client.go | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/client/client.go b/client/client.go index c9beebae4ac..a4b751b24e4 100644 --- a/client/client.go +++ b/client/client.go @@ -724,7 +724,13 @@ func (c *Client) getAllocRunners() map[string]*AllocRunner { // fulfill the AllocCounter interface for the GC. func (c *Client) NumAllocs() int { c.allocLock.RLock() + c.blockedAllocsLock.Lock() + c.migratingAllocsLock.Lock() n := len(c.allocs) + n += len(c.blockedAllocations) + n += len(c.migratingAllocs) + c.migratingAllocsLock.Unlock() + c.blockedAllocsLock.Unlock() c.allocLock.RUnlock() return n } From f6ea22302c628545e146230541b5754db7615c47 Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Tue, 30 May 2017 11:39:12 -0700 Subject: [PATCH 4/6] Update docs to match gc_max_allocs default --- website/source/docs/agent/configuration/client.html.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/source/docs/agent/configuration/client.html.md b/website/source/docs/agent/configuration/client.html.md index 9d29da485d5..b817c154b04 100644 --- a/website/source/docs/agent/configuration/client.html.md +++ b/website/source/docs/agent/configuration/client.html.md @@ -100,7 +100,7 @@ client { - `gc_inode_usage_threshold` `(float: 70)` - Specifies the inode usage percent which Nomad tries to maintain by garbage collecting terminal allocations. 
-- `gc_max_allocs` `(int: 200)` - Specifies the maximum number of allocations
+- `gc_max_allocs` `(int: 50)` - Specifies the maximum number of allocations
   which a client will track before triggering a garbage collection of terminal
   allocations. This will *not* limit the number of allocations a node can run
   at a time; however, once `gc_max_allocs` is exceeded, every new allocation

From f2476cfa671f63a1eeaae90d9fa8667cbae0eb63 Mon Sep 17 00:00:00 2001
From: Michael Schurter
Date: Tue, 30 May 2017 11:39:26 -0700
Subject: [PATCH 5/6] Fix config parsing test

Went overboard before I realized there's only one test case.
---
 command/agent/config_parse_test.go | 31 +++++++++++++++---------------
 1 file changed, 16 insertions(+), 15 deletions(-)

diff --git a/command/agent/config_parse_test.go b/command/agent/config_parse_test.go
index 9774b0db3fd..adff7035b72 100644
--- a/command/agent/config_parse_test.go
+++ b/command/agent/config_parse_test.go
@@ -3,10 +3,12 @@ package agent
 import (
 	"path/filepath"
 	"reflect"
+	"strings"
 	"testing"
 	"time"
 
 	"github.com/hashicorp/nomad/nomad/structs/config"
+	"github.com/kr/pretty"
 )
 
 func TestConfig_Parse(t *testing.T) {
@@ -75,6 +77,7 @@ func TestConfig_Parse(t *testing.T) {
 			GCParallelDestroys:    6,
 			GCDiskUsageThreshold:  82,
 			GCInodeUsageThreshold: 91,
+			GCMaxAllocs:           50,
 			NoHostUUID:            true,
 		},
 		Server: &ServerConfig{
@@ -165,22 +168,20 @@ func TestConfig_Parse(t *testing.T) {
 	}
 
 	for _, tc := range cases {
-		t.Logf("Testing parse: %s", tc.File)
+		t.Run(tc.File, func(t *testing.T) {
+			path, err := filepath.Abs(filepath.Join("./config-test-fixtures", tc.File))
+			if err != nil {
+				t.Fatalf("file: %s\n\n%s", tc.File, err)
+			}
 
-		path, err := filepath.Abs(filepath.Join("./config-test-fixtures", tc.File))
-		if err != nil {
-			t.Fatalf("file: %s\n\n%s", tc.File, err)
-			continue
-		}
+			actual, err := ParseConfigFile(path)
+			if (err != nil) != tc.Err {
+				t.Fatalf("file: %s\n\n%s", tc.File, err)
+			}
 
-		actual, err := ParseConfigFile(path)
-		if (err != nil) != tc.Err {
-			t.Fatalf("file: %s\n\n%s", tc.File, err)
-			continue
-		}
-
-		if !reflect.DeepEqual(actual, tc.Result) {
-			t.Fatalf("file: %s\n\n%#v\n\n%#v", tc.File, actual, tc.Result)
-		}
+			if !reflect.DeepEqual(actual, tc.Result) {
+				t.Errorf("file: %s diff: (actual vs expected)\n\n%s", tc.File, strings.Join(pretty.Diff(actual, tc.Result), "\n"))
+			}
+		})
 	}
 }

From a42b7383a2f55e21a59e990419fb208191150b15 Mon Sep 17 00:00:00 2001
From: Michael Schurter
Date: Tue, 30 May 2017 15:16:14 -0700
Subject: [PATCH 6/6] Add #2636 to changelog

---
 CHANGELOG.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 2ac26f1ab5e..98573c0ecf6 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,6 +9,7 @@ IMPROVEMENTS:
   * api/job: Ability to revert job to older versions [GH-2575]
   * client: Environment variables for client DC and Region [GH-2507]
   * client: Hash host ID so it's stable and well distributed [GH-2541]
+  * client: GC dead allocs if total allocs > `gc_max_allocs` tunable [GH-2636]
   * client: Persist state using bolt-db and more efficient write patterns
     [GH-2610]
   * client: Fingerprint all routable addresses on an interface including IPv6
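As a closing sanity check on the `gc_max_allocs` accounting, the arithmetic exercised by TestAllocGarbageCollector_MakeRoomForAllocations_MaxAllocs above can be sketched standalone. This is a simplified model of the counting only; the hypothetical `live`, `terminal`, and `incoming` variables stand in for the client's alloc count, the GC queue length, and the new allocations, while the real MakeRoomFor pops and destroys alloc runners:

package main

import "fmt"

func main() {
	// Values mirroring the test: 3 live allocs, 4 terminal allocs queued
	// for GC, gc_max_allocs = 6, and 2 new allocations being placed.
	live, terminal, maxAllocs, incoming := 3, 4, 6, 2

	// MakeRoomFor first reserves headroom for the incoming allocations.
	max := maxAllocs - incoming // 4

	// It then collects terminal allocs until the total count fits, or the
	// GC queue is empty (the periodic pass retries later in that case).
	for terminal > 0 && live+terminal > max {
		terminal--
	}

	fmt.Println(terminal) // 1, matching the test's gcAllocsLeft constant
}

Note that the cap is advisory: if every remaining allocation is live, nothing can be collected, which is why the documentation stresses that `gc_max_allocs` does not limit how many allocations a node can run.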