-
Notifications
You must be signed in to change notification settings - Fork 2k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Client APIs for reporting resource usage of host and allocations #1189
Changes from all commits
1cae57a
01e0ae7
a485a38
50250b1
eda53e3
f390261
445b181
fe8f640
7d8196d
1f12e90
3f0c42c
72c60d6
0fdff61
c85b4de
9a851a1
ea1370d
30cbfe1
f693545
cbe6f6a
7f016a7
0af3e7e
9806867
b7158be
df68129
3f336f4
16f298f
3dc28bd
458b701
3192e31
6132ccc
33f2d0c
aee9db0
1183037
4491c2b
b273eb8
b755ab9
2b1f389
95a3ca8
cf8861e
bf6c034
584c1e3
3a2cce2
31af4e0
73f0594
3a8b152
f59ad3d
2c52338
71d3361
c99733e
7870912
25be2da
1c635ea
b51b08b
03c9d94
f765e82
f16191c
455c759
2571ea6
0b868e0
68ec1f3
15e79c3
06f8d58
4611540
0782803
993675d
951b553
c760d59
e00e203
f1a5348
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -18,6 +18,7 @@ import ( | |
"github.com/hashicorp/nomad/client/consul" | ||
"github.com/hashicorp/nomad/client/driver" | ||
"github.com/hashicorp/nomad/client/fingerprint" | ||
"github.com/hashicorp/nomad/client/stats" | ||
"github.com/hashicorp/nomad/nomad" | ||
"github.com/hashicorp/nomad/nomad/structs" | ||
"github.com/mitchellh/hashstructure" | ||
|
@@ -76,11 +77,27 @@ const ( | |
// DefaultConfig returns the default configuration | ||
func DefaultConfig() *config.Config { | ||
return &config.Config{ | ||
LogOutput: os.Stderr, | ||
Region: "global", | ||
LogOutput: os.Stderr, | ||
Region: "global", | ||
StatsDataPoints: 60, | ||
StatsCollectionInterval: 1 * time.Second, | ||
} | ||
} | ||
|
||
// ClientStatsReporter exposes all the APIs related to resource usage of a Nomad | ||
// Client | ||
type ClientStatsReporter interface { | ||
// AllocStats returns a map of alloc ids and their corresponding stats | ||
// collector | ||
AllocStats() map[string]AllocStatsReporter | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Comments |
||
|
||
// HostStats returns resource usage stats for the host | ||
HostStats() []*stats.HostStats | ||
|
||
// HostStatsTS returns a time series of host resource usage stats | ||
HostStatsTS(since int64) []*stats.HostStats | ||
} | ||
|
||
// Client is used to implement the client interaction with Nomad. Clients | ||
// are expected to register as a schedulable node to the servers, and to | ||
// run allocations as determined by the servers. | ||
|
@@ -116,6 +133,11 @@ type Client struct { | |
|
||
consulService *consul.ConsulService | ||
|
||
// HostStatsCollector collects host resource usage stats | ||
hostStatsCollector *stats.HostStatsCollector | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Comments There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you get rid of this in the struct and just instantiate it in There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The advantage of this being here is that if we can't create the host stats collector for some reason the client won't start. If we move this down, the client will run but we might not be able to collect stats after the client has started. And I think we shouldn't run the client if we can't create the stats collector since down the line, resource usage stats might feed into scheduling decisions. |
||
resourceUsage *stats.RingBuff | ||
resourceUsageLock sync.RWMutex | ||
|
||
shutdown bool | ||
shutdownCh chan struct{} | ||
shutdownLock sync.Mutex | ||
|
@@ -126,15 +148,22 @@ func NewClient(cfg *config.Config) (*Client, error) { | |
// Create a logger | ||
logger := log.New(cfg.LogOutput, "", log.LstdFlags) | ||
|
||
resourceUsage, err := stats.NewRingBuff(cfg.StatsDataPoints) | ||
if err != nil { | ||
return nil, err | ||
} | ||
|
||
// Create the client | ||
c := &Client{ | ||
config: cfg, | ||
start: time.Now(), | ||
connPool: nomad.NewPool(cfg.LogOutput, clientRPCCache, clientMaxStreams, nil), | ||
logger: logger, | ||
allocs: make(map[string]*AllocRunner), | ||
allocUpdates: make(chan *structs.Allocation, 64), | ||
shutdownCh: make(chan struct{}), | ||
config: cfg, | ||
start: time.Now(), | ||
connPool: nomad.NewPool(cfg.LogOutput, clientRPCCache, clientMaxStreams, nil), | ||
logger: logger, | ||
hostStatsCollector: stats.NewHostStatsCollector(), | ||
resourceUsage: resourceUsage, | ||
allocs: make(map[string]*AllocRunner), | ||
allocUpdates: make(chan *structs.Allocation, 64), | ||
shutdownCh: make(chan struct{}), | ||
} | ||
|
||
// Initialize the client | ||
|
@@ -189,6 +218,9 @@ func NewClient(cfg *config.Config) (*Client, error) { | |
// Start the client! | ||
go c.run() | ||
|
||
// Start collecting stats | ||
go c.collectHostStats() | ||
|
||
// Start the consul sync | ||
go c.syncConsul() | ||
|
||
|
@@ -394,6 +426,67 @@ func (c *Client) Node() *structs.Node { | |
return c.config.Node | ||
} | ||
|
||
// StatsReporter exposes the various APIs related resource usage of a Nomad | ||
// client | ||
func (c *Client) StatsReporter() ClientStatsReporter { | ||
return c | ||
} | ||
|
||
// AllocStats returns all the stats reporter of the allocations running on a | ||
// Nomad client | ||
func (c *Client) AllocStats() map[string]AllocStatsReporter { | ||
res := make(map[string]AllocStatsReporter) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Use the lock when iterating over the allocs There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @dadgar I think I should use the |
||
allocRunners := c.getAllocRunners() | ||
for alloc, ar := range allocRunners { | ||
res[alloc] = ar | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
} | ||
return res | ||
} | ||
|
||
// HostStats returns all the stats related to a Nomad client | ||
func (c *Client) HostStats() []*stats.HostStats { | ||
c.resourceUsageLock.RLock() | ||
defer c.resourceUsageLock.RUnlock() | ||
val := c.resourceUsage.Peek() | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What if val is nil? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Then we return |
||
ru, _ := val.(*stats.HostStats) | ||
return []*stats.HostStats{ru} | ||
} | ||
|
||
func (c *Client) HostStatsTS(since int64) []*stats.HostStats { | ||
c.resourceUsageLock.RLock() | ||
defer c.resourceUsageLock.RUnlock() | ||
|
||
values := c.resourceUsage.Values() | ||
low := 0 | ||
high := len(values) - 1 | ||
var idx int | ||
|
||
for { | ||
mid := (low + high) >> 1 | ||
midVal, _ := values[mid].(*stats.HostStats) | ||
if midVal.Timestamp < since { | ||
low = mid + 1 | ||
} else if midVal.Timestamp > since { | ||
high = mid - 1 | ||
} else if midVal.Timestamp == since { | ||
idx = mid | ||
break | ||
} | ||
if low > high { | ||
idx = low | ||
break | ||
} | ||
} | ||
values = values[idx:] | ||
ts := make([]*stats.HostStats, len(values)) | ||
for index, val := range values { | ||
ru, _ := val.(*stats.HostStats) | ||
ts[index] = ru | ||
} | ||
return ts | ||
|
||
} | ||
|
||
// GetAllocFS returns the AllocFS interface for the alloc dir of an allocation | ||
func (c *Client) GetAllocFS(allocID string) (allocdir.AllocDirFS, error) { | ||
ar, ok := c.allocs[allocID] | ||
|
@@ -1227,3 +1320,27 @@ func (c *Client) syncConsul() { | |
|
||
} | ||
} | ||
|
||
// collectHostStats collects host resource usage stats periodically | ||
func (c *Client) collectHostStats() { | ||
// Start collecting host stats right away and then keep collecting every | ||
// collection interval | ||
next := time.NewTimer(0) | ||
defer next.Stop() | ||
for { | ||
select { | ||
case <-next.C: | ||
ru, err := c.hostStatsCollector.Collect() | ||
if err != nil { | ||
c.logger.Printf("[DEBUG] client: error fetching host resource usage stats: %v", err) | ||
continue | ||
} | ||
c.resourceUsageLock.RLock() | ||
c.resourceUsage.Enqueue(ru) | ||
c.resourceUsageLock.RUnlock() | ||
next.Reset(c.config.StatsCollectionInterval) | ||
case <-c.shutdownCh: | ||
return | ||
} | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Use the lock