Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement temp client banning on failed requests #38

Merged
merged 2 commits into from
May 7, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions internal/prometheus/node.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ type collectNodeResponse struct {

func (c *Collector) collectNode(ch chan<- prometheus.Metric, node proxmox.GetNodesData, resultChan chan<- collectNodeResponse, wg *sync.WaitGroup) {
defer wg.Done()
defer logger.Logger.Debug("finished requests for node data", "node", node.Node)
var resp collectNodeResponse

// Collect metrics that just need node data
Expand Down
59 changes: 55 additions & 4 deletions internal/proxmox/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,17 @@ var (
// ClusterName gets populated with the proxmox cluster's cluster name on clustered PVE instances
ClusterName string

clients map[string]*proxmox.Client
cash *cache.Cache
clients map[string]wrappedClient
banDuration = time.Duration(1 * time.Minute)
cash *cache.Cache
)

type wrappedClient struct {
client *proxmox.Client
banned bool
bannedUntil time.Time
}

// Init constructs a proxmox API client for this package taking in a token
func Init(endpoints []string, tokenID, token string, tlsVerify bool) error {
// Fail early if endpoints slice is 0 length
Expand All @@ -39,7 +46,7 @@ func Init(endpoints []string, tokenID, token string, tlsVerify bool) error {
}

// Make and init proxmox client map
clients = make(map[string]*proxmox.Client)
clients = make(map[string]wrappedClient)
for _, endpoint := range endpoints {
// Parse URL for hostname
parsedURL, err := url.Parse(endpoint)
Expand All @@ -59,14 +66,19 @@ func Init(endpoints []string, tokenID, token string, tlsVerify bool) error {
}

// Add client to map
clients[hostname] = c
clients[hostname] = wrappedClient{
client: c,
}
}

// init cache -- at longest, cache will live for 29 seconds
// which should ensure metrics are updated if scraping in 30 second intervals
// TODO should cache lifespan and cache expiration intervals be user configurable?
cash = cache.New(24*time.Second, 5*time.Second)

// Maintain client bans
go refreshClientBans()

retrieveClusterName()

return nil
Expand Down Expand Up @@ -95,3 +107,42 @@ func retrieveClusterName() {
log.Logger.Info("discovered PVE cluster", "cluster", ClusterName)
}
}

// refreshClientBans iterates over the configured clients and checks if their ban is still valid over time
func refreshClientBans() {
for {
// Loop through clients from client map
for name, c := range clients {
// Check if the client is banned -- if banned we need to check if the client's banUntil time has expired
if c.banned && time.Now().After(c.bannedUntil) {
// If the ban expired, make a request, see if it succeeds, and unban it if successful. Increase the ban timer if not
_, _, err := c.client.Nodes.GetNodes()
if err == nil {
// Unban client - request successful
log.Logger.Debug("unbanning client, test request successful", "name", name)
clients[name] = wrappedClient{
client: c.client,
}
continue
} else {
// Re-up ban timer - request failed
log.Logger.Debug("re-upping ban on client, test request failed", "name", name, "error", err)
banClient(name, c)
continue
}
}
}

time.Sleep(5 * time.Second)
}
}

// banClient bans a client for the defined duration
func banClient(name string, c wrappedClient) {
log.Logger.Debug("banning client", "name", name, "duration", banDuration)
clients[name] = wrappedClient{
client: c.client,
banned: true,
bannedUntil: time.Now().Add(banDuration),
}
}
17 changes: 15 additions & 2 deletions internal/proxmox/cluster.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
package proxmox

import (
"fmt"

"github.com/patrickmn/go-cache"
proxmox "github.com/starttoaster/go-proxmox"
log "github.com/starttoaster/proxmox-exporter/internal/logger"
Expand All @@ -21,16 +23,27 @@ func GetClusterStatus() (*proxmox.GetClusterStatusResponse, error) {

// Make request if not found in cache
var err error
for _, c := range clients {
cluster, _, err = c.Cluster.GetClusterStatus()
for clientName, c := range clients {
// Check if client was banned, skip if is
if c.banned {
continue
}

cluster, _, err = c.client.Cluster.GetClusterStatus()
if err == nil {
break
} else {
banClient(clientName, c)
}
}
if err != nil {
return nil, err
}

if cluster == nil {
return nil, fmt.Errorf("request to get cluster status was not successful. It's possible all clients are banned")
}

// Update cache
cash.Set("GetClusterStatus", cluster, cache.NoExpiration)

Expand Down
105 changes: 91 additions & 14 deletions internal/proxmox/nodes.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,16 +23,27 @@ func GetNodes() (*proxmox.GetNodesResponse, error) {

// Make request if not found in cache
var err error
for _, c := range clients {
nodes, _, err = c.Nodes.GetNodes()
for name, c := range clients {
// Check if client was banned, skip if is
if c.banned {
continue
}

nodes, _, err = c.client.Nodes.GetNodes()
if err == nil {
break
} else {
banClient(name, c)
}
}
if err != nil {
return nil, err
}

if nodes == nil {
return nil, fmt.Errorf("request to get node list was not successful. It's possible all clients are banned")
}

// Update cache
cash.Set("GetNodes", nodes, cache.DefaultExpiration)

Expand All @@ -54,16 +65,27 @@ func GetNodeStatus(name string) (*proxmox.GetNodeStatusResponse, error) {

// Make request if not found in cache
var err error
for _, c := range clients {
node, _, err = c.Nodes.GetNodeStatus(name)
for clientName, c := range clients {
// Check if client was banned, skip if is
if c.banned {
continue
}

node, _, err = c.client.Nodes.GetNodeStatus(name)
if err == nil {
break
} else {
banClient(clientName, c)
}
}
if err != nil {
return nil, err
}

if node == nil {
return nil, fmt.Errorf("request to get node status was not successful. It's possible all clients are banned")
}

// Update cache
cash.Set(fmt.Sprintf("GetNodeStatus_%s", name), node, cache.DefaultExpiration)

Expand All @@ -85,16 +107,27 @@ func GetNodeQemu(name string) (*proxmox.GetNodeQemuResponse, error) {

// Make request if not found in cache
var err error
for _, c := range clients {
vms, _, err = c.Nodes.GetNodeQemu(name)
for clientName, c := range clients {
// Check if client was banned, skip if is
if c.banned {
continue
}

vms, _, err = c.client.Nodes.GetNodeQemu(name)
if err == nil {
break
} else {
banClient(clientName, c)
}
}
if err != nil {
return nil, err
}

if vms == nil {
return nil, fmt.Errorf("request to get node VMs was not successful. It's possible all clients are banned")
}

// Update per-node cache since we have it
cash.Set(fmt.Sprintf("GetNodeQemu_%s", name), vms, cache.DefaultExpiration)

Expand All @@ -116,16 +149,27 @@ func GetNodeLxc(name string) (*proxmox.GetNodeLxcResponse, error) {

// Make request if not found in cache
var err error
for _, c := range clients {
lxcs, _, err = c.Nodes.GetNodeLxc(name)
for clientName, c := range clients {
// Check if client was banned, skip if is
if c.banned {
continue
}

lxcs, _, err = c.client.Nodes.GetNodeLxc(name)
if err == nil {
break
} else {
banClient(clientName, c)
}
}
if err != nil {
return nil, err
}

if lxcs == nil {
return nil, fmt.Errorf("request to get node LXCs was not successful. It's possible all clients are banned")
}

// Update per-node cache since we have it
cash.Set(fmt.Sprintf("GetNodeLxc_%s", name), lxcs, cache.DefaultExpiration)

Expand All @@ -147,16 +191,27 @@ func GetNodeDisksList(name string) (*proxmox.GetNodeDisksListResponse, error) {

// Make request if not found in cache
var err error
for _, c := range clients {
disks, _, err = c.Nodes.GetNodeDisksList(name)
for clientName, c := range clients {
// Check if client was banned, skip if is
if c.banned {
continue
}

disks, _, err = c.client.Nodes.GetNodeDisksList(name)
if err == nil {
break
} else {
banClient(clientName, c)
}
}
if err != nil {
return nil, err
}

if disks == nil {
return nil, fmt.Errorf("request to get node disks was not successful. It's possible all clients are banned")
}

// Update per-node cache since we have it
cash.Set(fmt.Sprintf("GetNodeDisksList_%s", name), disks, cache.DefaultExpiration)

Expand All @@ -178,16 +233,27 @@ func GetNodeCertificatesInfo(name string) (*proxmox.GetNodeCertificatesInfoRespo

// Make request if not found in cache
var err error
for _, c := range clients {
certs, _, err = c.Nodes.GetNodeCertificatesInfo(name)
for clientName, c := range clients {
// Check if client was banned, skip if is
if c.banned {
continue
}

certs, _, err = c.client.Nodes.GetNodeCertificatesInfo(name)
if err == nil {
break
} else {
banClient(clientName, c)
}
}
if err != nil {
return nil, err
}

if certs == nil {
return nil, fmt.Errorf("request to get node certificates was not successful. It's possible all clients are banned")
}

// Update per-node cache since we have it
cash.Set(fmt.Sprintf("GetNodeCertificatesInfo_%s", name), certs, cache.DefaultExpiration)

Expand All @@ -209,16 +275,27 @@ func GetNodeStorage(name string) (*proxmox.GetNodeStorageResponse, error) {

// Make request if not found in cache
var err error
for _, c := range clients {
store, _, err = c.Nodes.GetNodeStorage(name)
for clientName, c := range clients {
// Check if client was banned, skip if is
if c.banned {
continue
}

store, _, err = c.client.Nodes.GetNodeStorage(name)
if err == nil {
break
} else {
banClient(clientName, c)
}
}
if err != nil {
return nil, err
}

if store == nil {
return nil, fmt.Errorf("request to get node storage was not successful. It's possible all clients are banned")
}

// Update per-node cache since we have it
cash.Set(fmt.Sprintf("GetNodeStorage_%s", name), store, cache.DefaultExpiration)

Expand Down