diff --git a/client/consul_template.go b/client/consul_template.go index ea70cb40391..0103a5055ae 100644 --- a/client/consul_template.go +++ b/client/consul_template.go @@ -355,7 +355,7 @@ func parseTemplateConfigs(tmpls []*structs.Template, taskDir string, taskEnv *en dest = filepath.Join(taskDir, taskEnv.ReplaceEnv(tmpl.DestPath)) } - ct := &ctconf.ConfigTemplate{ + ct := ctconf.ConfigTemplate{ Source: src, Destination: dest, EmbeddedTemplate: tmpl.EmbeddedTmpl, @@ -363,7 +363,7 @@ func parseTemplateConfigs(tmpls []*structs.Template, taskDir string, taskEnv *en Wait: &watch.Wait{}, } - ctmpls[*ct] = tmpl + ctmpls[ct] = tmpl } return ctmpls diff --git a/nomad/structs/structs.go b/nomad/structs/structs.go index a3c98cd099d..2eb487ac5bc 100644 --- a/nomad/structs/structs.go +++ b/nomad/structs/structs.go @@ -1753,12 +1753,12 @@ func (sc *ServiceCheck) validate() error { switch sc.InitialStatus { case "": - case api.HealthUnknown: + // case api.HealthUnknown: TODO: Add when Consul releases 0.7.1 case api.HealthPassing: case api.HealthWarning: case api.HealthCritical: default: - return fmt.Errorf(`invalid initial check state (%s), must be one of %q, %q, %q, %q or empty`, sc.InitialStatus, api.HealthUnknown, api.HealthPassing, api.HealthWarning, api.HealthCritical) + return fmt.Errorf(`invalid initial check state (%s), must be one of %q, %q, %q, %q or empty`, sc.InitialStatus, api.HealthPassing, api.HealthWarning, api.HealthCritical) } diff --git a/vendor/github.com/hashicorp/consul/acl/acl.go b/vendor/github.com/hashicorp/consul/acl/acl.go new file mode 100644 index 00000000000..f13dc5b5693 --- /dev/null +++ b/vendor/github.com/hashicorp/consul/acl/acl.go @@ -0,0 +1,476 @@ +package acl + +import ( + "github.com/armon/go-radix" +) + +var ( + // allowAll is a singleton policy which allows all + // non-management actions + allowAll ACL + + // denyAll is a singleton policy which denies all actions + denyAll ACL + + // manageAll is a singleton policy which allows all + // 
actions, including management + manageAll ACL +) + +func init() { + // Setup the singletons + allowAll = &StaticACL{ + allowManage: false, + defaultAllow: true, + } + denyAll = &StaticACL{ + allowManage: false, + defaultAllow: false, + } + manageAll = &StaticACL{ + allowManage: true, + defaultAllow: true, + } +} + +// ACL is the interface for policy enforcement. +type ACL interface { + // KeyRead checks for permission to read a given key + KeyRead(string) bool + + // KeyWrite checks for permission to write a given key + KeyWrite(string) bool + + // KeyWritePrefix checks for permission to write to an + // entire key prefix. This means there must be no sub-policies + // that deny a write. + KeyWritePrefix(string) bool + + // ServiceWrite checks for permission to write a given service + ServiceWrite(string) bool + + // ServiceRead checks for permission to read a given service + ServiceRead(string) bool + + // EventRead determines if a specific event can be queried. + EventRead(string) bool + + // EventWrite determines if a specific event may be fired. + EventWrite(string) bool + + // PreparedQueryRead determines if a specific prepared query can be read + // to show its contents (this is not used for execution). + PreparedQueryRead(string) bool + + // PreparedQueryWrite determines if a specific prepared query can be + // created, modified, or deleted. + PreparedQueryWrite(string) bool + + // KeyringRead determines if the encryption keyring used in + // the gossip layer can be read. + KeyringRead() bool + + // KeyringWrite determines if the keyring can be manipulated + KeyringWrite() bool + + // OperatorRead determines if the read-only Consul operator functions + // can be used. + OperatorRead() bool + + // OperatorWrite determines if the state-changing Consul operator + // functions can be used. 
+ OperatorWrite() bool + + // ACLList checks for permission to list all the ACLs + ACLList() bool + + // ACLModify checks for permission to manipulate ACLs + ACLModify() bool +} + +// StaticACL is used to implement a base ACL policy. It either +// allows or denies all requests. This can be used as a parent +// ACL to act in a blacklist or whitelist mode. +type StaticACL struct { + allowManage bool + defaultAllow bool +} + +func (s *StaticACL) KeyRead(string) bool { + return s.defaultAllow +} + +func (s *StaticACL) KeyWrite(string) bool { + return s.defaultAllow +} + +func (s *StaticACL) KeyWritePrefix(string) bool { + return s.defaultAllow +} + +func (s *StaticACL) ServiceRead(string) bool { + return s.defaultAllow +} + +func (s *StaticACL) ServiceWrite(string) bool { + return s.defaultAllow +} + +func (s *StaticACL) EventRead(string) bool { + return s.defaultAllow +} + +func (s *StaticACL) EventWrite(string) bool { + return s.defaultAllow +} + +func (s *StaticACL) PreparedQueryRead(string) bool { + return s.defaultAllow +} + +func (s *StaticACL) PreparedQueryWrite(string) bool { + return s.defaultAllow +} + +func (s *StaticACL) KeyringRead() bool { + return s.defaultAllow +} + +func (s *StaticACL) KeyringWrite() bool { + return s.defaultAllow +} + +func (s *StaticACL) OperatorRead() bool { + return s.defaultAllow +} + +func (s *StaticACL) OperatorWrite() bool { + return s.defaultAllow +} + +func (s *StaticACL) ACLList() bool { + return s.allowManage +} + +func (s *StaticACL) ACLModify() bool { + return s.allowManage +} + +// AllowAll returns an ACL rule that allows all operations +func AllowAll() ACL { + return allowAll +} + +// DenyAll returns an ACL rule that denies all operations +func DenyAll() ACL { + return denyAll +} + +// ManageAll returns an ACL rule that can manage all resources +func ManageAll() ACL { + return manageAll +} + +// RootACL returns a possible ACL if the ID matches a root policy +func RootACL(id string) ACL { + switch id { + case "allow": + 
return allowAll + case "deny": + return denyAll + case "manage": + return manageAll + default: + return nil + } +} + +// PolicyACL is used to wrap a set of ACL policies to provide +// the ACL interface. +type PolicyACL struct { + // parent is used to resolve policy if we have + // no matching rule. + parent ACL + + // keyRules contains the key policies + keyRules *radix.Tree + + // serviceRules contains the service policies + serviceRules *radix.Tree + + // eventRules contains the user event policies + eventRules *radix.Tree + + // preparedQueryRules contains the prepared query policies + preparedQueryRules *radix.Tree + + // keyringRule contains the keyring policies. The keyring has + // a very simple yes/no without prefix matching, so here we + // don't need to use a radix tree. + keyringRule string + + // operatorRule contains the operator policies. + operatorRule string +} + +// New is used to construct a policy based ACL from a set of policies +// and a parent policy to resolve missing cases. 
+func New(parent ACL, policy *Policy) (*PolicyACL, error) { + p := &PolicyACL{ + parent: parent, + keyRules: radix.New(), + serviceRules: radix.New(), + eventRules: radix.New(), + preparedQueryRules: radix.New(), + } + + // Load the key policy + for _, kp := range policy.Keys { + p.keyRules.Insert(kp.Prefix, kp.Policy) + } + + // Load the service policy + for _, sp := range policy.Services { + p.serviceRules.Insert(sp.Name, sp.Policy) + } + + // Load the event policy + for _, ep := range policy.Events { + p.eventRules.Insert(ep.Event, ep.Policy) + } + + // Load the prepared query policy + for _, pq := range policy.PreparedQueries { + p.preparedQueryRules.Insert(pq.Prefix, pq.Policy) + } + + // Load the keyring policy + p.keyringRule = policy.Keyring + + // Load the operator policy + p.operatorRule = policy.Operator + + return p, nil +} + +// KeyRead returns if a key is allowed to be read +func (p *PolicyACL) KeyRead(key string) bool { + // Look for a matching rule + _, rule, ok := p.keyRules.LongestPrefix(key) + if ok { + switch rule.(string) { + case PolicyRead, PolicyWrite: + return true + default: + return false + } + } + + // No matching rule, use the parent. + return p.parent.KeyRead(key) +} + +// KeyWrite returns if a key is allowed to be written +func (p *PolicyACL) KeyWrite(key string) bool { + // Look for a matching rule + _, rule, ok := p.keyRules.LongestPrefix(key) + if ok { + switch rule.(string) { + case PolicyWrite: + return true + default: + return false + } + } + + // No matching rule, use the parent. 
+ return p.parent.KeyWrite(key) +} + +// KeyWritePrefix returns if a prefix is allowed to be written +func (p *PolicyACL) KeyWritePrefix(prefix string) bool { + // Look for a matching rule that denies + _, rule, ok := p.keyRules.LongestPrefix(prefix) + if ok && rule.(string) != PolicyWrite { + return false + } + + // Look if any of our children have a deny policy + deny := false + p.keyRules.WalkPrefix(prefix, func(path string, rule interface{}) bool { + // We have a rule to prevent a write in a sub-directory! + if rule.(string) != PolicyWrite { + deny = true + return true + } + return false + }) + + // Deny the write if any sub-rules may be violated + if deny { + return false + } + + // If we had a matching rule, done + if ok { + return true + } + + // No matching rule, use the parent. + return p.parent.KeyWritePrefix(prefix) +} + +// ServiceRead checks if reading (discovery) of a service is allowed +func (p *PolicyACL) ServiceRead(name string) bool { + // Check for an exact rule or catch-all + _, rule, ok := p.serviceRules.LongestPrefix(name) + + if ok { + switch rule { + case PolicyRead, PolicyWrite: + return true + default: + return false + } + } + + // No matching rule, use the parent. + return p.parent.ServiceRead(name) +} + +// ServiceWrite checks if writing (registering) a service is allowed +func (p *PolicyACL) ServiceWrite(name string) bool { + // Check for an exact rule or catch-all + _, rule, ok := p.serviceRules.LongestPrefix(name) + + if ok { + switch rule { + case PolicyWrite: + return true + default: + return false + } + } + + // No matching rule, use the parent. + return p.parent.ServiceWrite(name) +} + +// EventRead is used to determine if the policy allows for a +// specific user event to be read. 
+func (p *PolicyACL) EventRead(name string) bool { + // Longest-prefix match on event names + if _, rule, ok := p.eventRules.LongestPrefix(name); ok { + switch rule { + case PolicyRead, PolicyWrite: + return true + default: + return false + } + } + + // Nothing matched, use parent + return p.parent.EventRead(name) +} + +// EventWrite is used to determine if new events can be created +// (fired) by the policy. +func (p *PolicyACL) EventWrite(name string) bool { + // Longest-prefix match event names + if _, rule, ok := p.eventRules.LongestPrefix(name); ok { + return rule == PolicyWrite + } + + // No match, use parent + return p.parent.EventWrite(name) +} + +// PreparedQueryRead checks if reading (listing) of a prepared query is +// allowed - this isn't execution, just listing its contents. +func (p *PolicyACL) PreparedQueryRead(prefix string) bool { + // Check for an exact rule or catch-all + _, rule, ok := p.preparedQueryRules.LongestPrefix(prefix) + + if ok { + switch rule { + case PolicyRead, PolicyWrite: + return true + default: + return false + } + } + + // No matching rule, use the parent. + return p.parent.PreparedQueryRead(prefix) +} + +// PreparedQueryWrite checks if writing (creating, updating, or deleting) of a +// prepared query is allowed. +func (p *PolicyACL) PreparedQueryWrite(prefix string) bool { + // Check for an exact rule or catch-all + _, rule, ok := p.preparedQueryRules.LongestPrefix(prefix) + + if ok { + switch rule { + case PolicyWrite: + return true + default: + return false + } + } + + // No matching rule, use the parent. + return p.parent.PreparedQueryWrite(prefix) +} + +// KeyringRead is used to determine if the keyring can be +// read by the current ACL token. +func (p *PolicyACL) KeyringRead() bool { + switch p.keyringRule { + case PolicyRead, PolicyWrite: + return true + case PolicyDeny: + return false + default: + return p.parent.KeyringRead() + } +} + +// KeyringWrite determines if the keyring can be manipulated. 
+func (p *PolicyACL) KeyringWrite() bool { + if p.keyringRule == PolicyWrite { + return true + } + return p.parent.KeyringWrite() +} + +// OperatorRead determines if the read-only operator functions are allowed. +func (p *PolicyACL) OperatorRead() bool { + switch p.operatorRule { + case PolicyRead, PolicyWrite: + return true + case PolicyDeny: + return false + default: + return p.parent.OperatorRead() + } +} + +// OperatorWrite determines if the state-changing operator functions are +// allowed. +func (p *PolicyACL) OperatorWrite() bool { + if p.operatorRule == PolicyWrite { + return true + } + return p.parent.OperatorWrite() +} + +// ACLList checks if listing of ACLs is allowed +func (p *PolicyACL) ACLList() bool { + return p.parent.ACLList() +} + +// ACLModify checks if modification of ACLs is allowed +func (p *PolicyACL) ACLModify() bool { + return p.parent.ACLModify() +} diff --git a/vendor/github.com/hashicorp/consul/acl/cache.go b/vendor/github.com/hashicorp/consul/acl/cache.go new file mode 100644 index 00000000000..0387f9fbe9b --- /dev/null +++ b/vendor/github.com/hashicorp/consul/acl/cache.go @@ -0,0 +1,177 @@ +package acl + +import ( + "crypto/md5" + "fmt" + + "github.com/hashicorp/golang-lru" +) + +// FaultFunc is a function used to fault in the parent, +// rules for an ACL given its ID +type FaultFunc func(id string) (string, string, error) + +// aclEntry allows us to store the ACL with its policy ID +type aclEntry struct { + ACL ACL + Parent string + RuleID string +} + +// Cache is used to implement policy and ACL caching +type Cache struct { + faultfn FaultFunc + aclCache *lru.TwoQueueCache // Cache id -> acl + policyCache *lru.TwoQueueCache // Cache policy -> acl + ruleCache *lru.TwoQueueCache // Cache rules -> policy +} + +// NewCache constructs a new policy and ACL cache of a given size +func NewCache(size int, faultfn FaultFunc) (*Cache, error) { + if size <= 0 { + return nil, fmt.Errorf("Must provide positive cache size") + } + + rc, err := 
lru.New2Q(size) + if err != nil { + return nil, err + } + + pc, err := lru.New2Q(size) + if err != nil { + return nil, err + } + + ac, err := lru.New2Q(size) + if err != nil { + return nil, err + } + + c := &Cache{ + faultfn: faultfn, + aclCache: ac, + policyCache: pc, + ruleCache: rc, + } + return c, nil +} + +// GetPolicy is used to get a potentially cached policy set. +// If not cached, it will be parsed, and then cached. +func (c *Cache) GetPolicy(rules string) (*Policy, error) { + return c.getPolicy(RuleID(rules), rules) +} + +// getPolicy is an internal method to get a cached policy, +// but it assumes a pre-computed ID +func (c *Cache) getPolicy(id, rules string) (*Policy, error) { + raw, ok := c.ruleCache.Get(id) + if ok { + return raw.(*Policy), nil + } + policy, err := Parse(rules) + if err != nil { + return nil, err + } + policy.ID = id + c.ruleCache.Add(id, policy) + return policy, nil + +} + +// RuleID is used to generate an ID for a rule +func RuleID(rules string) string { + return fmt.Sprintf("%x", md5.Sum([]byte(rules))) +} + +// policyID returns the cache ID for a policy +func (c *Cache) policyID(parent, ruleID string) string { + return parent + ":" + ruleID +} + +// GetACLPolicy is used to get the potentially cached ACL +// policy. If not cached, it will be generated and then cached. +func (c *Cache) GetACLPolicy(id string) (string, *Policy, error) { + // Check for a cached acl + if raw, ok := c.aclCache.Get(id); ok { + cached := raw.(aclEntry) + if raw, ok := c.ruleCache.Get(cached.RuleID); ok { + return cached.Parent, raw.(*Policy), nil + } + } + + // Fault in the rules + parent, rules, err := c.faultfn(id) + if err != nil { + return "", nil, err + } + + // Get cached + policy, err := c.GetPolicy(rules) + return parent, policy, err +} + +// GetACL is used to get a potentially cached ACL policy. +// If not cached, it will be generated and then cached. 
+func (c *Cache) GetACL(id string) (ACL, error) { + // Look for the ACL directly + raw, ok := c.aclCache.Get(id) + if ok { + return raw.(aclEntry).ACL, nil + } + + // Get the rules + parentID, rules, err := c.faultfn(id) + if err != nil { + return nil, err + } + ruleID := RuleID(rules) + + // Check for a compiled ACL + policyID := c.policyID(parentID, ruleID) + var compiled ACL + if raw, ok := c.policyCache.Get(policyID); ok { + compiled = raw.(ACL) + } else { + // Get the policy + policy, err := c.getPolicy(ruleID, rules) + if err != nil { + return nil, err + } + + // Get the parent ACL + parent := RootACL(parentID) + if parent == nil { + parent, err = c.GetACL(parentID) + if err != nil { + return nil, err + } + } + + // Compile the ACL + acl, err := New(parent, policy) + if err != nil { + return nil, err + } + + // Cache the compiled ACL + c.policyCache.Add(policyID, acl) + compiled = acl + } + + // Cache and return the ACL + c.aclCache.Add(id, aclEntry{compiled, parentID, ruleID}) + return compiled, nil +} + +// ClearACL is used to clear the ACL cache if any +func (c *Cache) ClearACL(id string) { + c.aclCache.Remove(id) +} + +// Purge is used to clear all the ACL caches. The +// rule and policy caches are not purged, since they +// are content-hashed anyways. +func (c *Cache) Purge() { + c.aclCache.Purge() +} diff --git a/vendor/github.com/hashicorp/consul/acl/policy.go b/vendor/github.com/hashicorp/consul/acl/policy.go new file mode 100644 index 00000000000..ae69067fea9 --- /dev/null +++ b/vendor/github.com/hashicorp/consul/acl/policy.go @@ -0,0 +1,135 @@ +package acl + +import ( + "fmt" + + "github.com/hashicorp/hcl" +) + +const ( + PolicyDeny = "deny" + PolicyRead = "read" + PolicyWrite = "write" +) + +// Policy is used to represent the policy specified by +// an ACL configuration. 
+type Policy struct { + ID string `hcl:"-"` + Keys []*KeyPolicy `hcl:"key,expand"` + Services []*ServicePolicy `hcl:"service,expand"` + Events []*EventPolicy `hcl:"event,expand"` + PreparedQueries []*PreparedQueryPolicy `hcl:"query,expand"` + Keyring string `hcl:"keyring"` + Operator string `hcl:"operator"` +} + +// KeyPolicy represents a policy for a key +type KeyPolicy struct { + Prefix string `hcl:",key"` + Policy string +} + +func (k *KeyPolicy) GoString() string { + return fmt.Sprintf("%#v", *k) +} + +// ServicePolicy represents a policy for a service +type ServicePolicy struct { + Name string `hcl:",key"` + Policy string +} + +func (k *ServicePolicy) GoString() string { + return fmt.Sprintf("%#v", *k) +} + +// EventPolicy represents a user event policy. +type EventPolicy struct { + Event string `hcl:",key"` + Policy string +} + +func (e *EventPolicy) GoString() string { + return fmt.Sprintf("%#v", *e) +} + +// PreparedQueryPolicy represents a prepared query policy. +type PreparedQueryPolicy struct { + Prefix string `hcl:",key"` + Policy string +} + +func (e *PreparedQueryPolicy) GoString() string { + return fmt.Sprintf("%#v", *e) +} + +// isPolicyValid makes sure the given string matches one of the valid policies. 
+func isPolicyValid(policy string) bool { + switch policy { + case PolicyDeny: + return true + case PolicyRead: + return true + case PolicyWrite: + return true + default: + return false + } +} + +// Parse is used to parse the specified ACL rules into an +// intermediary set of policies, before being compiled into +// the ACL +func Parse(rules string) (*Policy, error) { + // Decode the rules + p := &Policy{} + if rules == "" { + // Hot path for empty rules + return p, nil + } + + if err := hcl.Decode(p, rules); err != nil { + return nil, fmt.Errorf("Failed to parse ACL rules: %v", err) + } + + // Validate the key policy + for _, kp := range p.Keys { + if !isPolicyValid(kp.Policy) { + return nil, fmt.Errorf("Invalid key policy: %#v", kp) + } + } + + // Validate the service policy + for _, sp := range p.Services { + if !isPolicyValid(sp.Policy) { + return nil, fmt.Errorf("Invalid service policy: %#v", sp) + } + } + + // Validate the user event policies + for _, ep := range p.Events { + if !isPolicyValid(ep.Policy) { + return nil, fmt.Errorf("Invalid event policy: %#v", ep) + } + } + + // Validate the prepared query policies + for _, pq := range p.PreparedQueries { + if !isPolicyValid(pq.Policy) { + return nil, fmt.Errorf("Invalid query policy: %#v", pq) + } + } + + // Validate the keyring policy - this one is allowed to be empty + if p.Keyring != "" && !isPolicyValid(p.Keyring) { + return nil, fmt.Errorf("Invalid keyring policy: %#v", p.Keyring) + } + + // Validate the operator policy - this one is allowed to be empty + if p.Operator != "" && !isPolicyValid(p.Operator) { + return nil, fmt.Errorf("Invalid operator policy: %#v", p.Operator) + } + + return p, nil +} diff --git a/vendor/github.com/hashicorp/consul/api/agent.go b/vendor/github.com/hashicorp/consul/api/agent.go index 67855c67fbc..87a6c10016c 100644 --- a/vendor/github.com/hashicorp/consul/api/agent.go +++ b/vendor/github.com/hashicorp/consul/api/agent.go @@ -62,8 +62,7 @@ type AgentCheckRegistration struct 
{ AgentServiceCheck } -// AgentServiceCheck is used to create an associated -// check for a service +// AgentServiceCheck is used to define a node or service level check type AgentServiceCheck struct { Script string `json:",omitempty"` DockerContainerID string `json:",omitempty"` @@ -74,6 +73,14 @@ type AgentServiceCheck struct { HTTP string `json:",omitempty"` TCP string `json:",omitempty"` Status string `json:",omitempty"` + + // In Consul 0.7 and later, checks that are associated with a service + // may also contain this optional DeregisterCriticalServiceAfter field, + // which is a timeout in the same Go time format as Interval and TTL. If + // a check is in the critical state for more than this configured value, + // then its associated service (and all of its associated checks) will + // automatically be deregistered. + DeregisterCriticalServiceAfter string `json:",omitempty"` } type AgentServiceChecks []*AgentServiceCheck @@ -198,27 +205,42 @@ func (a *Agent) ServiceDeregister(serviceID string) error { return nil } -// PassTTL is used to set a TTL check to the passing state +// PassTTL is used to set a TTL check to the passing state. +// +// DEPRECATION NOTICE: This interface is deprecated in favor of UpdateTTL(). +// The client interface will be removed in 0.8 or changed to use +// UpdateTTL()'s endpoint and the server endpoints will be removed in 0.9. func (a *Agent) PassTTL(checkID, note string) error { return a.updateTTL(checkID, note, "pass") } -// WarnTTL is used to set a TTL check to the warning state +// WarnTTL is used to set a TTL check to the warning state. +// +// DEPRECATION NOTICE: This interface is deprecated in favor of UpdateTTL(). +// The client interface will be removed in 0.8 or changed to use +// UpdateTTL()'s endpoint and the server endpoints will be removed in 0.9. 
func (a *Agent) WarnTTL(checkID, note string) error { return a.updateTTL(checkID, note, "warn") } -// FailTTL is used to set a TTL check to the failing state +// FailTTL is used to set a TTL check to the failing state. +// +// DEPRECATION NOTICE: This interface is deprecated in favor of UpdateTTL(). +// The client interface will be removed in 0.8 or changed to use +// UpdateTTL()'s endpoint and the server endpoints will be removed in 0.9. func (a *Agent) FailTTL(checkID, note string) error { return a.updateTTL(checkID, note, "fail") } // updateTTL is used to update the TTL of a check. This is the internal -// method that uses the old API that's present in Consul versions prior -// to 0.6.4. Since Consul didn't have an analogous "update" API before it -// seemed ok to break this (former) UpdateTTL in favor of the new UpdateTTL -// below, but keep the old Pass/Warn/Fail methods using the old API under the -// hood. +// method that uses the old API that's present in Consul versions prior to +// 0.6.4. Since Consul didn't have an analogous "update" API before it seemed +// ok to break this (former) UpdateTTL in favor of the new UpdateTTL below, +// but keep the old Pass/Warn/Fail methods using the old API under the hood. +// +// DEPRECATION NOTICE: This interface is deprecated in favor of UpdateTTL(). +// The client interface will be removed in 0.8 and the server endpoints will +// be removed in 0.9. func (a *Agent) updateTTL(checkID, note, status string) error { switch status { case "pass": @@ -240,8 +262,9 @@ func (a *Agent) updateTTL(checkID, note, status string) error { // checkUpdate is the payload for a PUT for a check update. type checkUpdate struct { - // Status us one of the structs.Health* states, "passing", "warning", or - // "critical". + // Status is one of the api.Health* states: HealthPassing + // ("passing"), HealthWarning ("warning"), or HealthCritical + // ("critical"). 
Status string // Output is the information to post to the UI for operators as the diff --git a/vendor/github.com/hashicorp/consul/api/api.go b/vendor/github.com/hashicorp/consul/api/api.go index 590b858e1fc..dd811fde4bf 100644 --- a/vendor/github.com/hashicorp/consul/api/api.go +++ b/vendor/github.com/hashicorp/consul/api/api.go @@ -80,6 +80,9 @@ type QueryMeta struct { // How long did the request take RequestTime time.Duration + + // Is address translation enabled for HTTP responses on this agent + AddressTranslationEnabled bool } // WriteMeta is used to return meta data about a write @@ -330,6 +333,7 @@ type request struct { url *url.URL params url.Values body io.Reader + header http.Header obj interface{} } @@ -355,7 +359,7 @@ func (r *request) setQueryOptions(q *QueryOptions) { r.params.Set("wait", durToMsec(q.WaitTime)) } if q.Token != "" { - r.params.Set("token", q.Token) + r.header.Set("X-Consul-Token", q.Token) } if q.Near != "" { r.params.Set("near", q.Near) @@ -399,7 +403,7 @@ func (r *request) setWriteOptions(q *WriteOptions) { r.params.Set("dc", q.Datacenter) } if q.Token != "" { - r.params.Set("token", q.Token) + r.header.Set("X-Consul-Token", q.Token) } } @@ -426,6 +430,7 @@ func (r *request) toHTTP() (*http.Request, error) { req.URL.Host = r.url.Host req.URL.Scheme = r.url.Scheme req.Host = r.url.Host + req.Header = r.header // Setup auth if r.config.HttpAuth != nil { @@ -446,6 +451,7 @@ func (c *Client) newRequest(method, path string) *request { Path: path, }, params: make(map[string][]string), + header: make(http.Header), } if c.config.Datacenter != "" { r.params.Set("dc", c.config.Datacenter) @@ -454,7 +460,7 @@ func (c *Client) newRequest(method, path string) *request { r.params.Set("wait", durToMsec(r.config.WaitTime)) } if c.config.Token != "" { - r.params.Set("token", r.config.Token) + r.header.Set("X-Consul-Token", r.config.Token) } return r } @@ -539,6 +545,15 @@ func parseQueryMeta(resp *http.Response, q *QueryMeta) error { default: 
q.KnownLeader = false } + + // Parse X-Consul-Translate-Addresses + switch header.Get("X-Consul-Translate-Addresses") { + case "true": + q.AddressTranslationEnabled = true + default: + q.AddressTranslationEnabled = false + } + return nil } diff --git a/vendor/github.com/hashicorp/consul/api/catalog.go b/vendor/github.com/hashicorp/consul/api/catalog.go index 52a00b3043f..337772ec0bf 100644 --- a/vendor/github.com/hashicorp/consul/api/catalog.go +++ b/vendor/github.com/hashicorp/consul/api/catalog.go @@ -1,13 +1,15 @@ package api type Node struct { - Node string - Address string + Node string + Address string + TaggedAddresses map[string]string } type CatalogService struct { Node string Address string + TaggedAddresses map[string]string ServiceID string ServiceName string ServiceAddress string @@ -22,11 +24,12 @@ type CatalogNode struct { } type CatalogRegistration struct { - Node string - Address string - Datacenter string - Service *AgentService - Check *AgentCheck + Node string + Address string + TaggedAddresses map[string]string + Datacenter string + Service *AgentService + Check *AgentCheck } type CatalogDeregistration struct { diff --git a/vendor/github.com/hashicorp/consul/api/health.go b/vendor/github.com/hashicorp/consul/api/health.go index 5bb403f554f..74da949c8d1 100644 --- a/vendor/github.com/hashicorp/consul/api/health.go +++ b/vendor/github.com/hashicorp/consul/api/health.go @@ -8,7 +8,6 @@ const ( // HealthAny is special, and is used as a wild card, // not as a specific state. 
HealthAny = "any" - HealthUnknown = "unknown" HealthPassing = "passing" HealthWarning = "warning" HealthCritical = "critical" @@ -122,7 +121,6 @@ func (h *Health) State(state string, q *QueryOptions) ([]*HealthCheck, *QueryMet case HealthWarning: case HealthCritical: case HealthPassing: - case HealthUnknown: default: return nil, nil, fmt.Errorf("Unsupported state: %v", state) } diff --git a/vendor/github.com/hashicorp/consul/api/kv.go b/vendor/github.com/hashicorp/consul/api/kv.go index 688b3a09d2f..3dac2583c12 100644 --- a/vendor/github.com/hashicorp/consul/api/kv.go +++ b/vendor/github.com/hashicorp/consul/api/kv.go @@ -23,6 +23,43 @@ type KVPair struct { // KVPairs is a list of KVPair objects type KVPairs []*KVPair +// KVOp constants give possible operations available in a KVTxn. +type KVOp string + +const ( + KVSet KVOp = "set" + KVDelete = "delete" + KVDeleteCAS = "delete-cas" + KVDeleteTree = "delete-tree" + KVCAS = "cas" + KVLock = "lock" + KVUnlock = "unlock" + KVGet = "get" + KVGetTree = "get-tree" + KVCheckSession = "check-session" + KVCheckIndex = "check-index" +) + +// KVTxnOp defines a single operation inside a transaction. +type KVTxnOp struct { + Verb string + Key string + Value []byte + Flags uint64 + Index uint64 + Session string +} + +// KVTxnOps defines a set of operations to be performed inside a single +// transaction. +type KVTxnOps []*KVTxnOp + +// KVTxnResponse has the outcome of a transaction. +type KVTxnResponse struct { + Results []*KVPair + Errors TxnErrors +} + // KV is used to manipulate the K/V API type KV struct { c *Client @@ -238,3 +275,122 @@ func (k *KV) deleteInternal(key string, params map[string]string, q *WriteOption res := strings.Contains(string(buf.Bytes()), "true") return res, qm, nil } + +// TxnOp is the internal format we send to Consul. It's not specific to KV, +// though currently only KV operations are supported. +type TxnOp struct { + KV *KVTxnOp +} + +// TxnOps is a list of transaction operations. 
+type TxnOps []*TxnOp + +// TxnResult is the internal format we receive from Consul. +type TxnResult struct { + KV *KVPair +} + +// TxnResults is a list of TxnResult objects. +type TxnResults []*TxnResult + +// TxnError is used to return information about an operation in a transaction. +type TxnError struct { + OpIndex int + What string +} + +// TxnErrors is a list of TxnError objects. +type TxnErrors []*TxnError + +// TxnResponse is the internal format we receive from Consul. +type TxnResponse struct { + Results TxnResults + Errors TxnErrors +} + +// Txn is used to apply multiple KV operations in a single, atomic transaction. +// +// Note that Go will perform the required base64 encoding on the values +// automatically because the type is a byte slice. Transactions are defined as a +// list of operations to perform, using the KVOp constants and KVTxnOp structure +// to define operations. If any operation fails, none of the changes are applied +// to the state store. Note that this hides the internal raw transaction interface +// and munges the input and output types into KV-specific ones for ease of use. +// If there are more non-KV operations in the future we may break out a new +// transaction API client, but it will be easy to keep this KV-specific variant +// supported. +// +// Even though this is generally a write operation, we take a QueryOptions input +// and return a QueryMeta output. If the transaction contains only read ops, then +// Consul will fast-path it to a different endpoint internally which supports +// consistency controls, but not blocking. If there are write operations then +// the request will always be routed through raft and any consistency settings +// will be ignored. 
+// +// Here's an example: +// +// ops := KVTxnOps{ +// &KVTxnOp{ +// Verb: KVLock, +// Key: "test/lock", +// Session: "adf4238a-882b-9ddc-4a9d-5b6758e4159e", +// Value: []byte("hello"), +// }, +// &KVTxnOp{ +// Verb: KVGet, +// Key: "another/key", +// }, +// } +// ok, response, _, err := kv.Txn(ops, nil) +// +// If there is a problem making the transaction request then an error will be +// returned. Otherwise, the ok value will be true if the transaction succeeded +// or false if it was rolled back. The response is a structured return value which +// will have the outcome of the transaction. Its Results member will have entries +// for each operation. Deleted keys will have a nil entry in the Results, and to save +// space, the Value of each key in the Results will be nil unless the operation +// is a KVGet. If the transaction was rolled back, the Errors member will have +// entries referencing the index of the operation that failed along with an error +// message. +func (k *KV) Txn(txn KVTxnOps, q *QueryOptions) (bool, *KVTxnResponse, *QueryMeta, error) { + r := k.c.newRequest("PUT", "/v1/txn") + r.setQueryOptions(q) + + // Convert into the internal format since this is an all-KV txn. + ops := make(TxnOps, 0, len(txn)) + for _, kvOp := range txn { + ops = append(ops, &TxnOp{KV: kvOp}) + } + r.obj = ops + rtt, resp, err := k.c.doRequest(r) + if err != nil { + return false, nil, nil, err + } + defer resp.Body.Close() + + qm := &QueryMeta{} + parseQueryMeta(resp, qm) + qm.RequestTime = rtt + + if resp.StatusCode == http.StatusOK || resp.StatusCode == http.StatusConflict { + var txnResp TxnResponse + if err := decodeBody(resp, &txnResp); err != nil { + return false, nil, nil, err + } + + // Convert from the internal format. 
+ kvResp := KVTxnResponse{ + Errors: txnResp.Errors, + } + for _, result := range txnResp.Results { + kvResp.Results = append(kvResp.Results, result.KV) + } + return resp.StatusCode == http.StatusOK, &kvResp, qm, nil + } + + var buf bytes.Buffer + if _, err := io.Copy(&buf, resp.Body); err != nil { + return false, nil, nil, fmt.Errorf("Failed to read response: %v", err) + } + return false, nil, nil, fmt.Errorf("Failed request: %s", buf.String()) +} diff --git a/vendor/github.com/hashicorp/consul/api/operator.go b/vendor/github.com/hashicorp/consul/api/operator.go new file mode 100644 index 00000000000..48d74f3ca6a --- /dev/null +++ b/vendor/github.com/hashicorp/consul/api/operator.go @@ -0,0 +1,81 @@ +package api + +// Operator can be used to perform low-level operator tasks for Consul. +type Operator struct { + c *Client +} + +// Operator returns a handle to the operator endpoints. +func (c *Client) Operator() *Operator { + return &Operator{c} +} + +// RaftServer has information about a server in the Raft configuration. +type RaftServer struct { + // ID is the unique ID for the server. These are currently the same + // as the address, but they will be changed to a real GUID in a future + // release of Consul. + ID string + + // Node is the node name of the server, as known by Consul, or this + // will be set to "(unknown)" otherwise. + Node string + + // Address is the IP:port of the server, used for Raft communications. + Address string + + // Leader is true if this server is the current cluster leader. + Leader bool + + // Voter is true if this server has a vote in the cluster. This might + // be false if the server is staging and still coming online, or if + // it's a non-voting server, which will be added in a future release of + // Consul. + Voter bool +} + +// RaftConfiguration is returned when querying for the current Raft configuration. +type RaftConfiguration struct { + // Servers has the list of servers in the Raft configuration. 
+ Servers []*RaftServer + + // Index has the Raft index of this configuration. + Index uint64 +} + +// RaftGetConfiguration is used to query the current Raft peer set. +func (op *Operator) RaftGetConfiguration(q *QueryOptions) (*RaftConfiguration, error) { + r := op.c.newRequest("GET", "/v1/operator/raft/configuration") + r.setQueryOptions(q) + _, resp, err := requireOK(op.c.doRequest(r)) + if err != nil { + return nil, err + } + defer resp.Body.Close() + + var out RaftConfiguration + if err := decodeBody(resp, &out); err != nil { + return nil, err + } + return &out, nil +} + +// RaftRemovePeerByAddress is used to kick a stale peer (one that is in the Raft +// quorum but no longer known to Serf or the catalog) by address in the form of +// "IP:port". +func (op *Operator) RaftRemovePeerByAddress(address string, q *WriteOptions) error { + r := op.c.newRequest("DELETE", "/v1/operator/raft/peer") + r.setWriteOptions(q) + + // TODO (slackpad) Currently we made address a query parameter. Once + // IDs are in place this will be DELETE /v1/operator/raft/peer/. + r.params.Set("address", string(address)) + + _, resp, err := requireOK(op.c.doRequest(r)) + if err != nil { + return err + } + + resp.Body.Close() + return nil +} diff --git a/vendor/github.com/hashicorp/consul/api/prepared_query.go b/vendor/github.com/hashicorp/consul/api/prepared_query.go index c8141887c46..63e741e050d 100644 --- a/vendor/github.com/hashicorp/consul/api/prepared_query.go +++ b/vendor/github.com/hashicorp/consul/api/prepared_query.go @@ -25,6 +25,11 @@ type ServiceQuery struct { // Service is the service to query. Service string + // Near allows baking in the name of a node to automatically distance- + // sort from. The magic "_agent" value is supported, which sorts near + // the agent which initiated the request by default. + Near string + // Failover controls what we do if there are no healthy nodes in the // local datacenter. 
Failover QueryDatacenterOptions @@ -40,6 +45,17 @@ type ServiceQuery struct { Tags []string } +// QueryTemplate carries the arguments for creating a templated query. +type QueryTemplate struct { + // Type specifies the type of the query template. Currently only + // "name_prefix_match" is supported. This field is required. + Type string + + // Regexp allows specifying a regex pattern to match against the name + // of the query being executed. + Regexp string +} + // PrepatedQueryDefinition defines a complete prepared query. type PreparedQueryDefinition struct { // ID is this UUID-based ID for the query, always generated by Consul. @@ -67,6 +83,11 @@ type PreparedQueryDefinition struct { // DNS has options that control how the results of this query are // served over DNS. DNS QueryDNSOptions + + // Template is used to pass through the arguments for creating a + // prepared query with an attached template. If a template is given, + // interpolations are possible in other struct fields. + Template QueryTemplate } // PreparedQueryExecuteResponse has the results of executing a query. diff --git a/vendor/github.com/hashicorp/consul/consul/structs/operator.go b/vendor/github.com/hashicorp/consul/consul/structs/operator.go new file mode 100644 index 00000000000..d564400bf97 --- /dev/null +++ b/vendor/github.com/hashicorp/consul/consul/structs/operator.go @@ -0,0 +1,57 @@ +package structs + +import ( + "github.com/hashicorp/raft" +) + +// RaftServer has information about a server in the Raft configuration. +type RaftServer struct { + // ID is the unique ID for the server. These are currently the same + // as the address, but they will be changed to a real GUID in a future + // release of Consul. + ID raft.ServerID + + // Node is the node name of the server, as known by Consul, or this + // will be set to "(unknown)" otherwise. + Node string + + // Address is the IP:port of the server, used for Raft communications. 
+ Address raft.ServerAddress + + // Leader is true if this server is the current cluster leader. + Leader bool + + // Voter is true if this server has a vote in the cluster. This might + // be false if the server is staging and still coming online, or if + // it's a non-voting server, which will be added in a future release of + // Consul. + Voter bool +} + +// RaftConfigurationResponse is returned when querying for the current Raft +// configuration. +type RaftConfigurationResponse struct { + // Servers has the list of servers in the Raft configuration. + Servers []*RaftServer + + // Index has the Raft index of this configuration. + Index uint64 +} + +// RaftPeerByAddressRequest is used by the Operator endpoint to apply a Raft +// operation on a specific Raft peer by address in the form of "IP:port". +type RaftPeerByAddressRequest struct { + // Datacenter is the target this request is intended for. + Datacenter string + + // Address is the peer to remove, in the form "IP:port". + Address raft.ServerAddress + + // WriteRequest holds the ACL token to go along with this request. + WriteRequest +} + +// RequestDatacenter returns the datacenter for a given request. +func (op *RaftPeerByAddressRequest) RequestDatacenter() string { + return op.Datacenter +} diff --git a/vendor/github.com/hashicorp/consul/consul/structs/prepared_query.go b/vendor/github.com/hashicorp/consul/consul/structs/prepared_query.go new file mode 100644 index 00000000000..5e9c31847ba --- /dev/null +++ b/vendor/github.com/hashicorp/consul/consul/structs/prepared_query.go @@ -0,0 +1,252 @@ +package structs + +// QueryDatacenterOptions sets options about how we fail over if there are no +// healthy nodes in the local datacenter. +type QueryDatacenterOptions struct { + // NearestN is set to the number of remote datacenters to try, based on + // network coordinates. + NearestN int + + // Datacenters is a fixed list of datacenters to try after NearestN. 
We + // never try a datacenter multiple times, so those are subtracted from + // this list before proceeding. + Datacenters []string +} + +// QueryDNSOptions controls settings when query results are served over DNS. +type QueryDNSOptions struct { + // TTL is the time to live for the served DNS results. + TTL string +} + +// ServiceQuery is used to query for a set of healthy nodes offering a specific +// service. +type ServiceQuery struct { + // Service is the service to query. + Service string + + // Failover controls what we do if there are no healthy nodes in the + // local datacenter. + Failover QueryDatacenterOptions + + // If OnlyPassing is true then we will only include nodes with passing + // health checks (critical AND warning checks will cause a node to be + // discarded) + OnlyPassing bool + + // Near allows the query to always prefer the node nearest the given + // node. If the node does not exist, results are returned in their + // normal randomly-shuffled order. Supplying the magic "_agent" value + // is supported to sort near the agent which initiated the request. + Near string + + // Tags are a set of required and/or disallowed tags. If a tag is in + // this list it must be present. If the tag is preceded with "!" then + // it is disallowed. + Tags []string +} + +const ( + // QueryTemplateTypeNamePrefixMatch uses the Name field of the query as + // a prefix to select the template. + QueryTemplateTypeNamePrefixMatch = "name_prefix_match" +) + +// QueryTemplateOptions controls settings if this query is a template. +type QueryTemplateOptions struct { + // Type, if non-empty, means that this query is a template. This is + // set to one of the QueryTemplateType* constants above. + Type string + + // Regexp is an optional regular expression to use to parse the full + // name, once the prefix match has selected a template. This can be + // used to extract parts of the name and choose a service name, set + // tags, etc. 
+ Regexp string +} + +// PreparedQuery defines a complete prepared query, and is the structure we +// maintain in the state store. +type PreparedQuery struct { + // ID is this UUID-based ID for the query, always generated by Consul. + ID string + + // Name is an optional friendly name for the query supplied by the + // user. NOTE - if this feature is used then it will reduce the security + // of any read ACL associated with this query/service since this name + // can be used to locate nodes without supplying any ACL. + Name string + + // Session is an optional session to tie this query's lifetime to. If + // this is omitted then the query will not expire. + Session string + + // Token is the ACL token used when the query was created, and it is + // used when a query is subsequently executed. This token, or a token + // with management privileges, must be used to change the query later. + Token string + + // Template is used to configure this query as a template, which will + // respond to queries based on the Name, and then will be rendered + // before it is executed. + Template QueryTemplateOptions + + // Service defines a service query (leaving things open for other types + // later). + Service ServiceQuery + + // DNS has options that control how the results of this query are + // served over DNS. + DNS QueryDNSOptions + + RaftIndex +} + +// GetACLPrefix returns the prefix to look up the prepared_query ACL policy for +// this query, and whether the prefix applies to this query. You always need to +// check the ok value before using the prefix. 
+func (pq *PreparedQuery) GetACLPrefix() (string, bool) { + if pq.Name != "" || pq.Template.Type != "" { + return pq.Name, true + } + + return "", false +} + +type PreparedQueries []*PreparedQuery + +type IndexedPreparedQueries struct { + Queries PreparedQueries + QueryMeta +} + +type PreparedQueryOp string + +const ( + PreparedQueryCreate PreparedQueryOp = "create" + PreparedQueryUpdate PreparedQueryOp = "update" + PreparedQueryDelete PreparedQueryOp = "delete" +) + +// PreparedQueryRequest is used to create or change prepared queries. +type PreparedQueryRequest struct { + // Datacenter is the target this request is intended for. + Datacenter string + + // Op is the operation to apply. + Op PreparedQueryOp + + // Query is the query itself. + Query *PreparedQuery + + // WriteRequest holds the ACL token to go along with this request. + WriteRequest +} + +// RequestDatacenter returns the datacenter for a given request. +func (q *PreparedQueryRequest) RequestDatacenter() string { + return q.Datacenter +} + +// PreparedQuerySpecificRequest is used to get information about a prepared +// query. +type PreparedQuerySpecificRequest struct { + // Datacenter is the target this request is intended for. + Datacenter string + + // QueryID is the ID of a query. + QueryID string + + // QueryOptions (unfortunately named here) controls the consistency + // settings for the query lookup itself, as well as the service lookups. + QueryOptions +} + +// RequestDatacenter returns the datacenter for a given request. +func (q *PreparedQuerySpecificRequest) RequestDatacenter() string { + return q.Datacenter +} + +// PreparedQueryExecuteRequest is used to execute a prepared query. +type PreparedQueryExecuteRequest struct { + // Datacenter is the target this request is intended for. + Datacenter string + + // QueryIDOrName is the ID of a query _or_ the name of one, either can + // be provided. + QueryIDOrName string + + // Limit will trim the resulting list down to the given limit. 
+ Limit int + + // Source is used to sort the results relative to a given node using + // network coordinates. + Source QuerySource + + // Agent is used to carry around a reference to the agent which initiated + // the execute request. Used to distance-sort relative to the local node. + Agent QuerySource + + // QueryOptions (unfortunately named here) controls the consistency + // settings for the query lookup itself, as well as the service lookups. + QueryOptions +} + +// RequestDatacenter returns the datacenter for a given request. +func (q *PreparedQueryExecuteRequest) RequestDatacenter() string { + return q.Datacenter +} + +// PreparedQueryExecuteRemoteRequest is used when running a local query in a +// remote datacenter. +type PreparedQueryExecuteRemoteRequest struct { + // Datacenter is the target this request is intended for. + Datacenter string + + // Query is a copy of the query to execute. We have to ship the entire + // query over since it won't be present in the remote state store. + Query PreparedQuery + + // Limit will trim the resulting list down to the given limit. + Limit int + + // QueryOptions (unfortunately named here) controls the consistency + // settings for the service lookups. + QueryOptions +} + +// RequestDatacenter returns the datacenter for a given request. +func (q *PreparedQueryExecuteRemoteRequest) RequestDatacenter() string { + return q.Datacenter +} + +// PreparedQueryExecuteResponse has the results of executing a query. +type PreparedQueryExecuteResponse struct { + // Service is the service that was queried. + Service string + + // Nodes has the nodes that were output by the query. + Nodes CheckServiceNodes + + // DNS has the options for serving these results over DNS. + DNS QueryDNSOptions + + // Datacenter is the datacenter that these results came from. + Datacenter string + + // Failovers is a count of how many times we had to query a remote + // datacenter. 
+ Failovers int + + // QueryMeta has freshness information about the query. + QueryMeta +} + +// PreparedQueryExplainResponse has the results when explaining a query/ +type PreparedQueryExplainResponse struct { + // Query has the fully-rendered query. + Query PreparedQuery + + // QueryMeta has freshness information about the query. + QueryMeta +} diff --git a/vendor/github.com/hashicorp/consul/consul/structs/structs.go b/vendor/github.com/hashicorp/consul/consul/structs/structs.go new file mode 100644 index 00000000000..837d34a8bd4 --- /dev/null +++ b/vendor/github.com/hashicorp/consul/consul/structs/structs.go @@ -0,0 +1,930 @@ +package structs + +import ( + "bytes" + "fmt" + "math/rand" + "reflect" + "time" + + "github.com/hashicorp/consul/acl" + "github.com/hashicorp/consul/types" + "github.com/hashicorp/go-msgpack/codec" + "github.com/hashicorp/serf/coordinate" +) + +var ( + ErrNoLeader = fmt.Errorf("No cluster leader") + ErrNoDCPath = fmt.Errorf("No path to datacenter") + ErrNoServers = fmt.Errorf("No known Consul servers") +) + +type MessageType uint8 + +// RaftIndex is used to track the index used while creating +// or modifying a given struct type. +type RaftIndex struct { + CreateIndex uint64 + ModifyIndex uint64 +} + +const ( + RegisterRequestType MessageType = iota + DeregisterRequestType + KVSRequestType + SessionRequestType + ACLRequestType + TombstoneRequestType + CoordinateBatchUpdateType + PreparedQueryRequestType + TxnRequestType +) + +const ( + // IgnoreUnknownTypeFlag is set along with a MessageType + // to indicate that the message type can be safely ignored + // if it is not recognized. This is for future proofing, so + // that new commands can be added in a way that won't cause + // old servers to crash when the FSM attempts to process them. + IgnoreUnknownTypeFlag MessageType = 128 +) + +const ( + // HealthAny is special, and is used as a wild card, + // not as a specific state. 
+ HealthAny = "any" + HealthPassing = "passing" + HealthWarning = "warning" + HealthCritical = "critical" +) + +func ValidStatus(s string) bool { + return s == HealthPassing || + s == HealthWarning || + s == HealthCritical +} + +const ( + // Client tokens have rules applied + ACLTypeClient = "client" + + // Management tokens have an always allow policy. + // They are used for token management. + ACLTypeManagement = "management" +) + +const ( + // MaxLockDelay provides a maximum LockDelay value for + // a session. Any value above this will not be respected. + MaxLockDelay = 60 * time.Second +) + +// RPCInfo is used to describe common information about query +type RPCInfo interface { + RequestDatacenter() string + IsRead() bool + AllowStaleRead() bool + ACLToken() string +} + +// QueryOptions is used to specify various flags for read queries +type QueryOptions struct { + // Token is the ACL token ID. If not provided, the 'anonymous' + // token is assumed for backwards compatibility. + Token string + + // If set, wait until query exceeds given index. Must be provided + // with MaxQueryTime. + MinQueryIndex uint64 + + // Provided with MinQueryIndex to wait for change. + MaxQueryTime time.Duration + + // If set, any follower can service the request. Results + // may be arbitrarily stale. + AllowStale bool + + // If set, the leader must verify leadership prior to + // servicing the request. Prevents a stale read. + RequireConsistent bool +} + +// QueryOption only applies to reads, so always true +func (q QueryOptions) IsRead() bool { + return true +} + +func (q QueryOptions) AllowStaleRead() bool { + return q.AllowStale +} + +func (q QueryOptions) ACLToken() string { + return q.Token +} + +type WriteRequest struct { + // Token is the ACL token ID. If not provided, the 'anonymous' + // token is assumed for backwards compatibility. 
+ Token string +} + +// WriteRequest only applies to writes, always false +func (w WriteRequest) IsRead() bool { + return false +} + +func (w WriteRequest) AllowStaleRead() bool { + return false +} + +func (w WriteRequest) ACLToken() string { + return w.Token +} + +// QueryMeta allows a query response to include potentially +// useful metadata about a query +type QueryMeta struct { + // This is the index associated with the read + Index uint64 + + // If AllowStale is used, this is time elapsed since + // last contact between the follower and leader. This + // can be used to gauge staleness. + LastContact time.Duration + + // Used to indicate if there is a known leader node + KnownLeader bool +} + +// RegisterRequest is used for the Catalog.Register endpoint +// to register a node as providing a service. If no service +// is provided, the node is registered. +type RegisterRequest struct { + Datacenter string + Node string + Address string + TaggedAddresses map[string]string + Service *NodeService + Check *HealthCheck + Checks HealthChecks + WriteRequest +} + +func (r *RegisterRequest) RequestDatacenter() string { + return r.Datacenter +} + +// DeregisterRequest is used for the Catalog.Deregister endpoint +// to deregister a node as providing a service. If no service is +// provided the entire node is deregistered. +type DeregisterRequest struct { + Datacenter string + Node string + ServiceID string + CheckID types.CheckID + WriteRequest +} + +func (r *DeregisterRequest) RequestDatacenter() string { + return r.Datacenter +} + +// QuerySource is used to pass along information about the source node +// in queries so that we can adjust the response based on its network +// coordinates. 
+type QuerySource struct { + Datacenter string + Node string +} + +// DCSpecificRequest is used to query about a specific DC +type DCSpecificRequest struct { + Datacenter string + Source QuerySource + QueryOptions +} + +func (r *DCSpecificRequest) RequestDatacenter() string { + return r.Datacenter +} + +// ServiceSpecificRequest is used to query about a specific service +type ServiceSpecificRequest struct { + Datacenter string + ServiceName string + ServiceTag string + TagFilter bool // Controls tag filtering + Source QuerySource + QueryOptions +} + +func (r *ServiceSpecificRequest) RequestDatacenter() string { + return r.Datacenter +} + +// NodeSpecificRequest is used to request the information about a single node +type NodeSpecificRequest struct { + Datacenter string + Node string + QueryOptions +} + +func (r *NodeSpecificRequest) RequestDatacenter() string { + return r.Datacenter +} + +// ChecksInStateRequest is used to query for nodes in a state +type ChecksInStateRequest struct { + Datacenter string + State string + Source QuerySource + QueryOptions +} + +func (r *ChecksInStateRequest) RequestDatacenter() string { + return r.Datacenter +} + +// Used to return information about a node +type Node struct { + Node string + Address string + TaggedAddresses map[string]string + + RaftIndex +} +type Nodes []*Node + +// Used to return information about a provided services. +// Maps service name to available tags +type Services map[string][]string + +// ServiceNode represents a node that is part of a service. Address and +// TaggedAddresses are node-related fields that are always empty in the state +// store and are filled in on the way out by parseServiceNodes(). This is also +// why PartialClone() skips them, because we know they are blank already so it +// would be a waste of time to copy them. 
+type ServiceNode struct { + Node string + Address string + TaggedAddresses map[string]string + ServiceID string + ServiceName string + ServiceTags []string + ServiceAddress string + ServicePort int + ServiceEnableTagOverride bool + + RaftIndex +} + +// PartialClone() returns a clone of the given service node, minus the node- +// related fields that get filled in later, Address and TaggedAddresses. +func (s *ServiceNode) PartialClone() *ServiceNode { + tags := make([]string, len(s.ServiceTags)) + copy(tags, s.ServiceTags) + + return &ServiceNode{ + Node: s.Node, + // Skip Address, see above. + // Skip TaggedAddresses, see above. + ServiceID: s.ServiceID, + ServiceName: s.ServiceName, + ServiceTags: tags, + ServiceAddress: s.ServiceAddress, + ServicePort: s.ServicePort, + ServiceEnableTagOverride: s.ServiceEnableTagOverride, + RaftIndex: RaftIndex{ + CreateIndex: s.CreateIndex, + ModifyIndex: s.ModifyIndex, + }, + } +} + +// ToNodeService converts the given service node to a node service. +func (s *ServiceNode) ToNodeService() *NodeService { + return &NodeService{ + ID: s.ServiceID, + Service: s.ServiceName, + Tags: s.ServiceTags, + Address: s.ServiceAddress, + Port: s.ServicePort, + EnableTagOverride: s.ServiceEnableTagOverride, + RaftIndex: RaftIndex{ + CreateIndex: s.CreateIndex, + ModifyIndex: s.ModifyIndex, + }, + } +} + +type ServiceNodes []*ServiceNode + +// NodeService is a service provided by a node +type NodeService struct { + ID string + Service string + Tags []string + Address string + Port int + EnableTagOverride bool + + RaftIndex +} + +// IsSame checks if one NodeService is the same as another, without looking +// at the Raft information (that's why we didn't call it IsEqual). This is +// useful for seeing if an update would be idempotent for all the functional +// parts of the structure. 
+func (s *NodeService) IsSame(other *NodeService) bool { + if s.ID != other.ID || + s.Service != other.Service || + !reflect.DeepEqual(s.Tags, other.Tags) || + s.Address != other.Address || + s.Port != other.Port || + s.EnableTagOverride != other.EnableTagOverride { + return false + } + + return true +} + +// ToServiceNode converts the given node service to a service node. +func (s *NodeService) ToServiceNode(node string) *ServiceNode { + return &ServiceNode{ + Node: node, + // Skip Address, see ServiceNode definition. + // Skip TaggedAddresses, see ServiceNode definition. + ServiceID: s.ID, + ServiceName: s.Service, + ServiceTags: s.Tags, + ServiceAddress: s.Address, + ServicePort: s.Port, + ServiceEnableTagOverride: s.EnableTagOverride, + RaftIndex: RaftIndex{ + CreateIndex: s.CreateIndex, + ModifyIndex: s.ModifyIndex, + }, + } +} + +type NodeServices struct { + Node *Node + Services map[string]*NodeService +} + +// HealthCheck represents a single check on a given node +type HealthCheck struct { + Node string + CheckID types.CheckID // Unique per-node ID + Name string // Check name + Status string // The current check status + Notes string // Additional notes with the status + Output string // Holds output of script runs + ServiceID string // optional associated service + ServiceName string // optional service name + + RaftIndex +} + +// IsSame checks if one HealthCheck is the same as another, without looking +// at the Raft information (that's why we didn't call it IsEqual). This is +// useful for seeing if an update would be idempotent for all the functional +// parts of the structure. 
+func (c *HealthCheck) IsSame(other *HealthCheck) bool { + if c.Node != other.Node || + c.CheckID != other.CheckID || + c.Name != other.Name || + c.Status != other.Status || + c.Notes != other.Notes || + c.Output != other.Output || + c.ServiceID != other.ServiceID || + c.ServiceName != other.ServiceName { + return false + } + + return true +} + +// Clone returns a distinct clone of the HealthCheck. +func (c *HealthCheck) Clone() *HealthCheck { + clone := new(HealthCheck) + *clone = *c + return clone +} + +type HealthChecks []*HealthCheck + +// CheckServiceNode is used to provide the node, its service +// definition, as well as a HealthCheck that is associated. +type CheckServiceNode struct { + Node *Node + Service *NodeService + Checks HealthChecks +} +type CheckServiceNodes []CheckServiceNode + +// Shuffle does an in-place random shuffle using the Fisher-Yates algorithm. +func (nodes CheckServiceNodes) Shuffle() { + for i := len(nodes) - 1; i > 0; i-- { + j := rand.Int31n(int32(i + 1)) + nodes[i], nodes[j] = nodes[j], nodes[i] + } +} + +// Filter removes nodes that are failing health checks (and any non-passing +// check if that option is selected). Note that this returns the filtered +// results AND modifies the receiver for performance. +func (nodes CheckServiceNodes) Filter(onlyPassing bool) CheckServiceNodes { + n := len(nodes) +OUTER: + for i := 0; i < n; i++ { + node := nodes[i] + for _, check := range node.Checks { + if check.Status == HealthCritical || + (onlyPassing && check.Status != HealthPassing) { + nodes[i], nodes[n-1] = nodes[n-1], CheckServiceNode{} + n-- + i-- + continue OUTER + } + } + } + return nodes[:n] +} + +// NodeInfo is used to dump all associated information about +// a node. This is currently used for the UI only, as it is +// rather expensive to generate. 
+type NodeInfo struct { + Node string + Address string + TaggedAddresses map[string]string + Services []*NodeService + Checks []*HealthCheck +} + +// NodeDump is used to dump all the nodes with all their +// associated data. This is currently used for the UI only, +// as it is rather expensive to generate. +type NodeDump []*NodeInfo + +type IndexedNodes struct { + Nodes Nodes + QueryMeta +} + +type IndexedServices struct { + Services Services + QueryMeta +} + +type IndexedServiceNodes struct { + ServiceNodes ServiceNodes + QueryMeta +} + +type IndexedNodeServices struct { + NodeServices *NodeServices + QueryMeta +} + +type IndexedHealthChecks struct { + HealthChecks HealthChecks + QueryMeta +} + +type IndexedCheckServiceNodes struct { + Nodes CheckServiceNodes + QueryMeta +} + +type IndexedNodeDump struct { + Dump NodeDump + QueryMeta +} + +// DirEntry is used to represent a directory entry. This is +// used for values in our Key-Value store. +type DirEntry struct { + LockIndex uint64 + Key string + Flags uint64 + Value []byte + Session string `json:",omitempty"` + + RaftIndex +} + +// Returns a clone of the given directory entry. +func (d *DirEntry) Clone() *DirEntry { + return &DirEntry{ + LockIndex: d.LockIndex, + Key: d.Key, + Flags: d.Flags, + Value: d.Value, + Session: d.Session, + RaftIndex: RaftIndex{ + CreateIndex: d.CreateIndex, + ModifyIndex: d.ModifyIndex, + }, + } +} + +type DirEntries []*DirEntry + +type KVSOp string + +const ( + KVSSet KVSOp = "set" + KVSDelete = "delete" + KVSDeleteCAS = "delete-cas" // Delete with check-and-set + KVSDeleteTree = "delete-tree" + KVSCAS = "cas" // Check-and-set + KVSLock = "lock" // Lock a key + KVSUnlock = "unlock" // Unlock a key + + // The following operations are only available inside of atomic + // transactions via the Txn request. + KVSGet = "get" // Read the key during the transaction. + KVSGetTree = "get-tree" // Read all keys with the given prefix during the transaction. 
+ KVSCheckSession = "check-session" // Check the session holds the key. + KVSCheckIndex = "check-index" // Check the modify index of the key. +) + +// IsWrite returns true if the given operation alters the state store. +func (op KVSOp) IsWrite() bool { + switch op { + case KVSGet, KVSGetTree, KVSCheckSession, KVSCheckIndex: + return false + + default: + return true + } +} + +// KVSRequest is used to operate on the Key-Value store +type KVSRequest struct { + Datacenter string + Op KVSOp // Which operation are we performing + DirEnt DirEntry // Which directory entry + WriteRequest +} + +func (r *KVSRequest) RequestDatacenter() string { + return r.Datacenter +} + +// KeyRequest is used to request a key, or key prefix +type KeyRequest struct { + Datacenter string + Key string + QueryOptions +} + +func (r *KeyRequest) RequestDatacenter() string { + return r.Datacenter +} + +// KeyListRequest is used to list keys +type KeyListRequest struct { + Datacenter string + Prefix string + Seperator string + QueryOptions +} + +func (r *KeyListRequest) RequestDatacenter() string { + return r.Datacenter +} + +type IndexedDirEntries struct { + Entries DirEntries + QueryMeta +} + +type IndexedKeyList struct { + Keys []string + QueryMeta +} + +type SessionBehavior string + +const ( + SessionKeysRelease SessionBehavior = "release" + SessionKeysDelete = "delete" +) + +const ( + SessionTTLMax = 24 * time.Hour + SessionTTLMultiplier = 2 +) + +// Session is used to represent an open session in the KV store. +// This is used to associate node checks with acquired locks. 
type Session struct {
	ID        string
	Name      string
	Node      string
	Checks    []types.CheckID
	LockDelay time.Duration
	Behavior  SessionBehavior // What to do when session is invalidated
	TTL       string

	RaftIndex
}

// Sessions is a list of sessions.
type Sessions []*Session

// SessionOp names the operation carried by a SessionRequest.
type SessionOp string

// NOTE(review): SessionDestroy has no explicit type and is therefore an
// untyped string constant rather than a SessionOp.
const (
	SessionCreate  SessionOp = "create"
	SessionDestroy           = "destroy"
)

// SessionRequest is used to operate on sessions
type SessionRequest struct {
	Datacenter string
	Op         SessionOp // Which operation are we performing
	Session    Session   // Which session
	WriteRequest
}

// RequestDatacenter returns the Datacenter field of the request.
func (r *SessionRequest) RequestDatacenter() string {
	return r.Datacenter
}

// SessionSpecificRequest is used to request a session by ID
type SessionSpecificRequest struct {
	Datacenter string
	Session    string
	QueryOptions
}

// RequestDatacenter returns the Datacenter field of the request.
func (r *SessionSpecificRequest) RequestDatacenter() string {
	return r.Datacenter
}

// IndexedSessions is a Sessions list together with the embedded QueryMeta.
type IndexedSessions struct {
	Sessions Sessions
	QueryMeta
}

// ACL is used to represent a token and its rules
type ACL struct {
	ID    string
	Name  string
	Type  string
	Rules string

	RaftIndex
}

// ACLs is a list of ACL entries.
type ACLs []*ACL

// ACLOp names the operation carried by an ACLRequest.
type ACLOp string

// NOTE(review): only ACLSet is typed as ACLOp; the other two constants are
// untyped string constants.
const (
	ACLSet      ACLOp = "set"
	ACLForceSet       = "force-set" // Deprecated, left to backwards compatibility
	ACLDelete         = "delete"
)

// IsSame checks if one ACL is the same as another, without looking
// at the Raft information (that's why we didn't call it IsEqual). This is
// useful for seeing if an update would be idempotent for all the functional
// parts of the structure.
//
// Both a and other are dereferenced unconditionally, so neither may be nil.
func (a *ACL) IsSame(other *ACL) bool {
	if a.ID != other.ID ||
		a.Name != other.Name ||
		a.Type != other.Type ||
		a.Rules != other.Rules {
		return false
	}

	return true
}

// ACLRequest is used to create, update or delete an ACL
type ACLRequest struct {
	Datacenter string
	Op         ACLOp
	ACL        ACL
	WriteRequest
}

// RequestDatacenter returns the Datacenter field of the request.
func (r *ACLRequest) RequestDatacenter() string {
	return r.Datacenter
}

// ACLRequests is a list of ACL change requests.
type ACLRequests []*ACLRequest

// ACLSpecificRequest is used to request an ACL by ID
type ACLSpecificRequest struct {
	Datacenter string
	ACL        string
	QueryOptions
}

// RequestDatacenter returns the Datacenter field of the request.
func (r *ACLSpecificRequest) RequestDatacenter() string {
	return r.Datacenter
}

// ACLPolicyRequest is used to request an ACL by ID, conditionally
// filtering on an ID
type ACLPolicyRequest struct {
	Datacenter string
	ACL        string
	ETag       string
	QueryOptions
}

// RequestDatacenter returns the Datacenter field of the request.
func (r *ACLPolicyRequest) RequestDatacenter() string {
	return r.Datacenter
}

// IndexedACLs is an ACLs list together with the embedded QueryMeta.
type IndexedACLs struct {
	ACLs ACLs
	QueryMeta
}

// ACLPolicy carries an *acl.Policy along with its ETag, Parent and TTL, plus
// the embedded QueryMeta.
type ACLPolicy struct {
	ETag   string
	Parent string
	Policy *acl.Policy
	TTL    time.Duration
	QueryMeta
}

// ACLReplicationStatus provides information about the health of the ACL
// replication system.
type ACLReplicationStatus struct {
	Enabled          bool
	Running          bool
	SourceDatacenter string
	ReplicatedIndex  uint64
	LastSuccess      time.Time
	LastError        time.Time
}

// Coordinate stores a node name with its associated network coordinate.
type Coordinate struct {
	Node  string
	Coord *coordinate.Coordinate
}

// Coordinates is a list of Coordinate entries.
type Coordinates []*Coordinate

// IndexedCoordinate is used to represent a single node's coordinate from the state
// store.
type IndexedCoordinate struct {
	Coord *coordinate.Coordinate
	QueryMeta
}

// IndexedCoordinates is used to represent a list of nodes and their
// corresponding raw coordinates.
type IndexedCoordinates struct {
	Coordinates Coordinates
	QueryMeta
}

// DatacenterMap is used to represent a list of nodes with their raw coordinates,
// associated with a datacenter.
type DatacenterMap struct {
	Datacenter  string
	Coordinates Coordinates
}

// CoordinateUpdateRequest is used to update the network coordinate of a given
// node.
type CoordinateUpdateRequest struct {
	Datacenter string
	Node       string
	Coord      *coordinate.Coordinate
	WriteRequest
}

// RequestDatacenter returns the datacenter for a given update request.
func (c *CoordinateUpdateRequest) RequestDatacenter() string {
	return c.Datacenter
}

// EventFireRequest is used to ask a server to fire
// a Serf event. It is a bit odd, since it doesn't depend on
// the catalog or leader. Any node can respond, so it's not quite
// like a standard write request. This is used only internally.
type EventFireRequest struct {
	Datacenter string
	Name       string
	Payload    []byte

	// Not using WriteRequest so that any server can process
	// the request. It is a bit unusual...
	QueryOptions
}

// RequestDatacenter returns the Datacenter field of the request.
func (r *EventFireRequest) RequestDatacenter() string {
	return r.Datacenter
}

// EventFireResponse is used to respond to a fire request.
type EventFireResponse struct {
	QueryMeta
}

// TombstoneOp names the operation carried by a TombstoneRequest.
type TombstoneOp string

const (
	TombstoneReap TombstoneOp = "reap"
)

// TombstoneRequest is used to trigger a reaping of the tombstones
type TombstoneRequest struct {
	Datacenter string
	Op         TombstoneOp
	ReapIndex  uint64
	WriteRequest
}

// RequestDatacenter returns the Datacenter field of the request.
func (r *TombstoneRequest) RequestDatacenter() string {
	return r.Datacenter
}

// msgpackHandle is a shared handle for encoding/decoding of structs
var msgpackHandle = &codec.MsgpackHandle{}

// Decode is used to decode a MsgPack encoded object
//
// buf is handed to the decoder as-is; no leading type byte is stripped here.
// NOTE(review): presumably callers remove the prefix written by Encode before
// calling Decode -- confirm against the call sites.
func Decode(buf []byte, out interface{}) error {
	return codec.NewDecoder(bytes.NewReader(buf), msgpackHandle).Decode(out)
}

// Encode is used to encode a MsgPack object with type prefix
//
// The result is one byte holding t followed by the MsgPack encoding of msg.
// On an encoding error the bytes written so far are still returned alongside
// the error.
func Encode(t MessageType, msg interface{}) ([]byte, error) {
	var buf bytes.Buffer
	buf.WriteByte(uint8(t))
	err := codec.NewEncoder(&buf, msgpackHandle).Encode(msg)
	return buf.Bytes(), err
}

// CompoundResponse is an interface for gathering multiple responses. It is
// used in cross-datacenter RPC calls where more than 1 datacenter is
// expected to reply.
type CompoundResponse interface {
	// Add adds a new response to the compound response
	Add(interface{})

	// New returns an empty response object which can be passed around by
	// reference, and then passed to Add() later on.
	New() interface{}
}

// KeyringOp names the operation carried by a KeyringRequest.
type KeyringOp string

// NOTE(review): only KeyringList is typed as KeyringOp; the other constants
// have no explicit type and are untyped string constants.
const (
	KeyringList    KeyringOp = "list"
	KeyringInstall           = "install"
	KeyringUse               = "use"
	KeyringRemove            = "remove"
)

// KeyringRequest encapsulates a request to modify an encryption keyring.
// It can be used for install, remove, or use key type operations.
type KeyringRequest struct {
	Operation  KeyringOp
	Key        string
	Datacenter string
	Forwarded  bool
	QueryOptions
}

// RequestDatacenter returns the Datacenter field of the request.
func (r *KeyringRequest) RequestDatacenter() string {
	return r.Datacenter
}

// KeyringResponse is a unified key response and can be used for install,
// remove, use, as well as listing key queries.
type KeyringResponse struct {
	WAN        bool
	Datacenter string
	Messages   map[string]string
	Keys       map[string]int
	NumNodes   int
	Error      string
}

// KeyringResponses holds multiple responses to keyring queries. Each
// datacenter replies independently, and KeyringResponses is used as a
// container for the set of all responses.
type KeyringResponses struct {
	Responses []*KeyringResponse
	QueryMeta
}

// Add appends the Responses of v to r. v must be a *KeyringResponses; the
// unchecked type assertion panics for any other dynamic type.
func (r *KeyringResponses) Add(v interface{}) {
	val := v.(*KeyringResponses)
	r.Responses = append(r.Responses, val.Responses...)
}

// New returns a fresh, empty *KeyringResponses.
func (r *KeyringResponses) New() interface{} {
	return new(KeyringResponses)
}
diff --git a/vendor/github.com/hashicorp/consul/consul/structs/txn.go b/vendor/github.com/hashicorp/consul/consul/structs/txn.go
new file mode 100644
index 00000000000..3f8035b97ef
--- /dev/null
+++ b/vendor/github.com/hashicorp/consul/consul/structs/txn.go
@@ -0,0 +1,85 @@
package structs

import (
	"fmt"
)

// TxnKVOp is used to define a single operation on the KVS inside a
// transaction
type TxnKVOp struct {
	Verb   KVSOp
	DirEnt DirEntry
}

// TxnKVResult is used to define the result of a single operation on the KVS
// inside a transaction.
type TxnKVResult *DirEntry

// TxnOp is used to define a single operation inside a transaction. Only one
// of the types should be filled out per entry.
type TxnOp struct {
	KV *TxnKVOp
}

// TxnOps is a list of operations within a transaction.
type TxnOps []*TxnOp

// TxnRequest is used to apply multiple operations to the state store in a
// single transaction
type TxnRequest struct {
	Datacenter string
	Ops        TxnOps
	WriteRequest
}

// RequestDatacenter returns the Datacenter field of the request.
func (r *TxnRequest) RequestDatacenter() string {
	return r.Datacenter
}

// TxnReadRequest is used as a fast path for read-only transactions that don't
// modify the state store.
type TxnReadRequest struct {
	Datacenter string
	Ops        TxnOps
	QueryOptions
}

// RequestDatacenter returns the Datacenter field of the request.
func (r *TxnReadRequest) RequestDatacenter() string {
	return r.Datacenter
}

// TxnError is used to return information about an error for a specific
// operation.
type TxnError struct {
	OpIndex int
	What    string
}

// Error returns the string representation of an atomic error.
// The format is "op <OpIndex>: <What>".
func (e TxnError) Error() string {
	return fmt.Sprintf("op %d: %s", e.OpIndex, e.What)
}

// TxnErrors is a list of TxnError entries.
type TxnErrors []*TxnError

// TxnResult is used to define the result of a given operation inside a
// transaction. Only one of the types should be filled out per entry.
+type TxnResult struct { + KV TxnKVResult +} + +// TxnResults is a list of TxnResult entries. +type TxnResults []*TxnResult + +// TxnResponse is the structure returned by a TxnRequest. +type TxnResponse struct { + Results TxnResults + Errors TxnErrors +} + +// TxnReadResponse is the structure returned by a TxnReadRequest. +type TxnReadResponse struct { + TxnResponse + QueryMeta +} diff --git a/vendor/github.com/hashicorp/consul/testutil/README.md b/vendor/github.com/hashicorp/consul/testutil/README.md new file mode 100644 index 00000000000..21eb01d2a7f --- /dev/null +++ b/vendor/github.com/hashicorp/consul/testutil/README.md @@ -0,0 +1,65 @@ +Consul Testing Utilities +======================== + +This package provides some generic helpers to facilitate testing in Consul. + +TestServer +========== + +TestServer is a harness for managing Consul agents and initializing them with +test data. Using it, you can form test clusters, create services, add health +checks, manipulate the K/V store, etc. This test harness is completely decoupled +from Consul's core and API client, meaning it can be easily imported and used in +external unit tests for various applications. It works by invoking the Consul +CLI, which means it is a requirement to have Consul installed in the `$PATH`. + +Following is an example usage: + +```go +package my_program + +import ( + "testing" + + "github.com/hashicorp/consul/consul/structs" + "github.com/hashicorp/consul/testutil" +) + +func TestMain(t *testing.T) { + // Create a test Consul server + srv1 := testutil.NewTestServer(t) + defer srv1.Stop() + + // Create a secondary server, passing in configuration + // to avoid bootstrapping as we are forming a cluster. 
+ srv2 := testutil.NewTestServerConfig(t, func(c *testutil.TestServerConfig) { + c.Bootstrap = false + }) + defer srv2.Stop() + + // Join the servers together + srv1.JoinLAN(srv2.LANAddr) + + // Create a test key/value pair + srv1.SetKV("foo", []byte("bar")) + + // Create lots of test key/value pairs + srv1.PopulateKV(map[string][]byte{ + "bar": []byte("123"), + "baz": []byte("456"), + }) + + // Create a service + srv1.AddService("redis", structs.HealthPassing, []string{"master"}) + + // Create a service check + srv1.AddCheck("service:redis", "redis", structs.HealthPassing) + + // Create a node check + srv1.AddCheck("mem", "", structs.HealthCritical) + + // The HTTPAddr field contains the address of the Consul + // API on the new test server instance. + println(srv1.HTTPAddr) +} +``` diff --git a/vendor/github.com/hashicorp/consul/testutil/server.go b/vendor/github.com/hashicorp/consul/testutil/server.go new file mode 100644 index 00000000000..aad60e3866b --- /dev/null +++ b/vendor/github.com/hashicorp/consul/testutil/server.go @@ -0,0 +1,494 @@ +package testutil + +// TestServer is a test helper. It uses a fork/exec model to create +// a test Consul server instance in the background and initialize it +// with some data and/or services. The test server can then be used +// to run a unit test, and offers an easy API to tear itself down +// when the test has completed. The only prerequisite is to have a consul +// binary available on the $PATH. +// +// This package does not use Consul's official API client. This is +// because we use TestServer to test the API client, which would +// otherwise cause an import cycle. + +import ( + "bytes" + "encoding/base64" + "encoding/json" + "fmt" + "io" + "io/ioutil" + "net" + "net/http" + "os" + "os/exec" + "strings" + "sync/atomic" + + "github.com/hashicorp/consul/consul/structs" + "github.com/hashicorp/go-cleanhttp" +) + +// offset is used to atomically increment the port numbers. 
var offset uint64

// TestPerformanceConfig configures the performance parameters.
type TestPerformanceConfig struct {
	RaftMultiplier uint `json:"raft_multiplier,omitempty"`
}

// TestPortConfig configures the various ports used for services
// provided by the Consul server.
type TestPortConfig struct {
	DNS     int `json:"dns,omitempty"`
	HTTP    int `json:"http,omitempty"`
	RPC     int `json:"rpc,omitempty"`
	SerfLan int `json:"serf_lan,omitempty"`
	SerfWan int `json:"serf_wan,omitempty"`
	Server  int `json:"server,omitempty"`
}

// TestAddressConfig contains the bind addresses for various
// components of the Consul server.
type TestAddressConfig struct {
	HTTP string `json:"http,omitempty"`
}

// TestServerConfig is the main server configuration struct.
//
// The struct tags give the JSON key each field is serialized under. Stdout and
// Stderr are tagged "-" and therefore never appear in the JSON output.
type TestServerConfig struct {
	NodeName          string                 `json:"node_name"`
	Performance       *TestPerformanceConfig `json:"performance,omitempty"`
	Bootstrap         bool                   `json:"bootstrap,omitempty"`
	Server            bool                   `json:"server,omitempty"`
	DataDir           string                 `json:"data_dir,omitempty"`
	Datacenter        string                 `json:"datacenter,omitempty"`
	DisableCheckpoint bool                   `json:"disable_update_check"` // note: JSON key differs from the field name
	LogLevel          string                 `json:"log_level,omitempty"`
	Bind              string                 `json:"bind_addr,omitempty"`
	Addresses         *TestAddressConfig     `json:"addresses,omitempty"`
	Ports             *TestPortConfig        `json:"ports,omitempty"`
	ACLMasterToken    string                 `json:"acl_master_token,omitempty"`
	ACLDatacenter     string                 `json:"acl_datacenter,omitempty"`
	ACLDefaultPolicy  string                 `json:"acl_default_policy,omitempty"`
	Stdout, Stderr    io.Writer              `json:"-"`
}

// ServerConfigCallback is a function interface which can be
// passed to NewTestServerConfig to modify the server config.
type ServerConfigCallback func(c *TestServerConfig)

// defaultServerConfig returns a new TestServerConfig struct
// with all of the listen ports incremented by one.
func defaultServerConfig() *TestServerConfig {
	// Atomically claim the next offset; it is used for both the node name
	// and every port below.
	idx := int(atomic.AddUint64(&offset, 1))

	return &TestServerConfig{
		NodeName:          fmt.Sprintf("node%d", idx),
		DisableCheckpoint: true,
		Performance: &TestPerformanceConfig{
			RaftMultiplier: 1,
		},
		Bootstrap: true,
		Server:    true,
		LogLevel:  "debug",
		Bind:      "127.0.0.1",
		Addresses: &TestAddressConfig{},
		// Each service gets its own base port, 1000 apart, plus idx.
		Ports: &TestPortConfig{
			DNS:     20000 + idx,
			HTTP:    21000 + idx,
			RPC:     22000 + idx,
			SerfLan: 23000 + idx,
			SerfWan: 24000 + idx,
			Server:  25000 + idx,
		},
	}
}

// TestService is used to serialize a service definition.
type TestService struct {
	ID      string   `json:",omitempty"`
	Name    string   `json:",omitempty"`
	Tags    []string `json:",omitempty"`
	Address string   `json:",omitempty"`
	Port    int      `json:",omitempty"`
}

// TestCheck is used to serialize a check definition.
type TestCheck struct {
	ID        string `json:",omitempty"`
	Name      string `json:",omitempty"`
	ServiceID string `json:",omitempty"`
	TTL       string `json:",omitempty"`
}

// TestingT is the subset of *testing.T methods used by this package, so that
// the helpers can be driven by anything that provides them.
type TestingT interface {
	Logf(format string, args ...interface{})
	Errorf(format string, args ...interface{})
	Fatalf(format string, args ...interface{})
	Fatal(args ...interface{})
	Skip(args ...interface{})
}

// TestKVResponse is what we use to decode KV data.
type TestKVResponse struct {
	Value string
}

// TestServer is the main server wrapper struct.
type TestServer struct {
	cmd    *exec.Cmd
	Config *TestServerConfig
	t      TestingT

	HTTPAddr string
	LANAddr  string
	WANAddr  string

	HttpClient *http.Client
}

// NewTestServer is an easy helper method to create a new Consul
// test server with the most basic configuration.
func NewTestServer(t TestingT) *TestServer {
	return NewTestServerConfig(t, nil)
}

// NewTestServerConfig creates a new TestServer, and makes a call to
// an optional callback function to modify the configuration.
+func NewTestServerConfig(t TestingT, cb ServerConfigCallback) *TestServer { + if path, err := exec.LookPath("consul"); err != nil || path == "" { + t.Skip("consul not found on $PATH, skipping") + } + + dataDir, err := ioutil.TempDir("", "consul") + if err != nil { + t.Fatalf("err: %s", err) + } + + configFile, err := ioutil.TempFile(dataDir, "config") + if err != nil { + defer os.RemoveAll(dataDir) + t.Fatalf("err: %s", err) + } + + consulConfig := defaultServerConfig() + consulConfig.DataDir = dataDir + + if cb != nil { + cb(consulConfig) + } + + configContent, err := json.Marshal(consulConfig) + if err != nil { + t.Fatalf("err: %s", err) + } + + if _, err := configFile.Write(configContent); err != nil { + t.Fatalf("err: %s", err) + } + configFile.Close() + + stdout := io.Writer(os.Stdout) + if consulConfig.Stdout != nil { + stdout = consulConfig.Stdout + } + + stderr := io.Writer(os.Stderr) + if consulConfig.Stderr != nil { + stderr = consulConfig.Stderr + } + + // Start the server + cmd := exec.Command("consul", "agent", "-config-file", configFile.Name()) + cmd.Stdout = stdout + cmd.Stderr = stderr + if err := cmd.Start(); err != nil { + t.Fatalf("err: %s", err) + } + + var httpAddr string + var client *http.Client + if strings.HasPrefix(consulConfig.Addresses.HTTP, "unix://") { + httpAddr = consulConfig.Addresses.HTTP + trans := cleanhttp.DefaultTransport() + trans.Dial = func(_, _ string) (net.Conn, error) { + return net.Dial("unix", httpAddr[7:]) + } + client = &http.Client{ + Transport: trans, + } + } else { + httpAddr = fmt.Sprintf("127.0.0.1:%d", consulConfig.Ports.HTTP) + client = cleanhttp.DefaultClient() + } + + server := &TestServer{ + Config: consulConfig, + cmd: cmd, + t: t, + + HTTPAddr: httpAddr, + LANAddr: fmt.Sprintf("127.0.0.1:%d", consulConfig.Ports.SerfLan), + WANAddr: fmt.Sprintf("127.0.0.1:%d", consulConfig.Ports.SerfWan), + + HttpClient: client, + } + + // Wait for the server to be ready + if consulConfig.Bootstrap { + 
server.waitForLeader() + } else { + server.waitForAPI() + } + + return server +} + +// Stop stops the test Consul server, and removes the Consul data +// directory once we are done. +func (s *TestServer) Stop() { + defer os.RemoveAll(s.Config.DataDir) + + if err := s.cmd.Process.Kill(); err != nil { + s.t.Errorf("err: %s", err) + } + + // wait for the process to exit to be sure that the data dir can be + // deleted on all platforms. + s.cmd.Wait() +} + +// waitForAPI waits for only the agent HTTP endpoint to start +// responding. This is an indication that the agent has started, +// but will likely return before a leader is elected. +func (s *TestServer) waitForAPI() { + WaitForResult(func() (bool, error) { + resp, err := s.HttpClient.Get(s.url("/v1/agent/self")) + if err != nil { + return false, err + } + defer resp.Body.Close() + if err := s.requireOK(resp); err != nil { + return false, err + } + return true, nil + }, func(err error) { + defer s.Stop() + s.t.Fatalf("err: %s", err) + }) +} + +// waitForLeader waits for the Consul server's HTTP API to become +// available, and then waits for a known leader and an index of +// 1 or more to be observed to confirm leader election is done. 
+func (s *TestServer) waitForLeader() { + WaitForResult(func() (bool, error) { + // Query the API and check the status code + resp, err := s.HttpClient.Get(s.url("/v1/catalog/nodes")) + if err != nil { + return false, err + } + defer resp.Body.Close() + if err := s.requireOK(resp); err != nil { + return false, err + } + + // Ensure we have a leader and a node registration + if leader := resp.Header.Get("X-Consul-KnownLeader"); leader != "true" { + fmt.Println(leader) + return false, fmt.Errorf("Consul leader status: %#v", leader) + } + if resp.Header.Get("X-Consul-Index") == "0" { + return false, fmt.Errorf("Consul index is 0") + } + return true, nil + }, func(err error) { + defer s.Stop() + s.t.Fatalf("err: %s", err) + }) +} + +// url is a helper function which takes a relative URL and +// makes it into a proper URL against the local Consul server. +func (s *TestServer) url(path string) string { + return fmt.Sprintf("http://127.0.0.1:%d%s", s.Config.Ports.HTTP, path) +} + +// requireOK checks the HTTP response code and ensures it is acceptable. +func (s *TestServer) requireOK(resp *http.Response) error { + if resp.StatusCode != 200 { + return fmt.Errorf("Bad status code: %d", resp.StatusCode) + } + return nil +} + +// put performs a new HTTP PUT request. +func (s *TestServer) put(path string, body io.Reader) *http.Response { + req, err := http.NewRequest("PUT", s.url(path), body) + if err != nil { + s.t.Fatalf("err: %s", err) + } + resp, err := s.HttpClient.Do(req) + if err != nil { + s.t.Fatalf("err: %s", err) + } + if err := s.requireOK(resp); err != nil { + defer resp.Body.Close() + s.t.Fatal(err) + } + return resp +} + +// get performs a new HTTP GET request. 
+func (s *TestServer) get(path string) *http.Response { + resp, err := s.HttpClient.Get(s.url(path)) + if err != nil { + s.t.Fatalf("err: %s", err) + } + if err := s.requireOK(resp); err != nil { + defer resp.Body.Close() + s.t.Fatal(err) + } + return resp +} + +// encodePayload returns a new io.Reader wrapping the encoded contents +// of the payload, suitable for passing directly to a new request. +func (s *TestServer) encodePayload(payload interface{}) io.Reader { + var encoded bytes.Buffer + enc := json.NewEncoder(&encoded) + if err := enc.Encode(payload); err != nil { + s.t.Fatalf("err: %s", err) + } + return &encoded +} + +// JoinLAN is used to join nodes within the same datacenter. +func (s *TestServer) JoinLAN(addr string) { + resp := s.get("/v1/agent/join/" + addr) + resp.Body.Close() +} + +// JoinWAN is used to join remote datacenters together. +func (s *TestServer) JoinWAN(addr string) { + resp := s.get("/v1/agent/join/" + addr + "?wan=1") + resp.Body.Close() +} + +// SetKV sets an individual key in the K/V store. +func (s *TestServer) SetKV(key string, val []byte) { + resp := s.put("/v1/kv/"+key, bytes.NewBuffer(val)) + resp.Body.Close() +} + +// GetKV retrieves a single key and returns its value +func (s *TestServer) GetKV(key string) []byte { + resp := s.get("/v1/kv/" + key) + defer resp.Body.Close() + + raw, err := ioutil.ReadAll(resp.Body) + if err != nil { + s.t.Fatalf("err: %s", err) + } + + var result []*TestKVResponse + if err := json.Unmarshal(raw, &result); err != nil { + s.t.Fatalf("err: %s", err) + } + if len(result) < 1 { + s.t.Fatalf("key does not exist: %s", key) + } + + v, err := base64.StdEncoding.DecodeString(result[0].Value) + if err != nil { + s.t.Fatalf("err: %s", err) + } + + return v +} + +// PopulateKV fills the Consul KV with data from a generic map. +func (s *TestServer) PopulateKV(data map[string][]byte) { + for k, v := range data { + s.SetKV(k, v) + } +} + +// ListKV returns a list of keys present in the KV store. 
This will list all +// keys under the given prefix recursively and return them as a slice. +func (s *TestServer) ListKV(prefix string) []string { + resp := s.get("/v1/kv/" + prefix + "?keys") + defer resp.Body.Close() + + raw, err := ioutil.ReadAll(resp.Body) + if err != nil { + s.t.Fatalf("err: %s", err) + } + + var result []string + if err := json.Unmarshal(raw, &result); err != nil { + s.t.Fatalf("err: %s", err) + } + return result +} + +// AddService adds a new service to the Consul instance. It also +// automatically adds a health check with the given status, which +// can be one of "passing", "warning", or "critical". +func (s *TestServer) AddService(name, status string, tags []string) { + svc := &TestService{ + Name: name, + Tags: tags, + } + payload := s.encodePayload(svc) + s.put("/v1/agent/service/register", payload) + + chkName := "service:" + name + chk := &TestCheck{ + Name: chkName, + ServiceID: name, + TTL: "10m", + } + payload = s.encodePayload(chk) + s.put("/v1/agent/check/register", payload) + + switch status { + case structs.HealthPassing: + s.put("/v1/agent/check/pass/"+chkName, nil) + case structs.HealthWarning: + s.put("/v1/agent/check/warn/"+chkName, nil) + case structs.HealthCritical: + s.put("/v1/agent/check/fail/"+chkName, nil) + default: + s.t.Fatalf("Unrecognized status: %s", status) + } +} + +// AddCheck adds a check to the Consul instance. If the serviceID is +// left empty (""), then the check will be associated with the node. +// The check status may be "passing", "warning", or "critical". 
+func (s *TestServer) AddCheck(name, serviceID, status string) { + chk := &TestCheck{ + ID: name, + Name: name, + TTL: "10m", + } + if serviceID != "" { + chk.ServiceID = serviceID + } + + payload := s.encodePayload(chk) + s.put("/v1/agent/check/register", payload) + + switch status { + case structs.HealthPassing: + s.put("/v1/agent/check/pass/"+name, nil) + case structs.HealthWarning: + s.put("/v1/agent/check/warn/"+name, nil) + case structs.HealthCritical: + s.put("/v1/agent/check/fail/"+name, nil) + default: + s.t.Fatalf("Unrecognized status: %s", status) + } +} diff --git a/vendor/github.com/hashicorp/consul/testutil/wait.go b/vendor/github.com/hashicorp/consul/testutil/wait.go new file mode 100644 index 00000000000..ae2439437ba --- /dev/null +++ b/vendor/github.com/hashicorp/consul/testutil/wait.go @@ -0,0 +1,44 @@ +package testutil + +import ( + "github.com/hashicorp/consul/consul/structs" + "testing" + "time" +) + +type testFn func() (bool, error) +type errorFn func(error) + +func WaitForResult(test testFn, error errorFn) { + retries := 1000 + + for retries > 0 { + time.Sleep(10 * time.Millisecond) + retries-- + + success, err := test() + if success { + return + } + + if retries == 0 { + error(err) + } + } +} + +type rpcFn func(string, interface{}, interface{}) error + +func WaitForLeader(t *testing.T, rpc rpcFn, dc string) structs.IndexedNodes { + var out structs.IndexedNodes + WaitForResult(func() (bool, error) { + args := &structs.DCSpecificRequest{ + Datacenter: dc, + } + err := rpc("Catalog.ListNodes", args, &out) + return out.QueryMeta.KnownLeader && out.Index > 0, err + }, func(err error) { + t.Fatalf("failed to find leader: %v", err) + }) + return out +} diff --git a/vendor/github.com/hashicorp/consul/types/README.md b/vendor/github.com/hashicorp/consul/types/README.md new file mode 100644 index 00000000000..da662f4a1c3 --- /dev/null +++ b/vendor/github.com/hashicorp/consul/types/README.md @@ -0,0 +1,39 @@ +# Consul `types` Package + +The Go language 
has a strong type system built into the language. The
`types` package corrals named types into a single package that is terminal in
`go`'s import graph. The `types` package should not have any downstream
dependencies. Each subsystem that defines its own set of types exists in its
own file, but all types are defined in the same package.

# Why

> Everything should be made as simple as possible, but not simpler.

`string` is a useful container and underlying type for identifiers, however
the `string` type is effectively opaque to the compiler in terms of how a
given string is intended to be used. For instance, there is nothing
preventing the following from happening:

```go
// `map` of Widgets, looked up by ID
var widgetLookup map[string]*Widget
// ...
var widgetID string = "widgetID"
w, found := widgetLookup[widgetID]

// Bad!
var widgetName string = "name of widget"
w, found := widgetLookup[widgetName]
```

but this class of problem is entirely preventable:

```go
type WidgetID string
var widgetLookup map[WidgetID]*Widget
var widgetName string = "name of widget"

// Does not compile: widgetName is a string, not a WidgetID.
w, found := widgetLookup[widgetName]
```

TL;DR: intentions and idioms aren't statically checked by compilers. The
`types` package uses Go's strong type system to prevent this class of bug.
diff --git a/vendor/github.com/hashicorp/consul/types/checks.go b/vendor/github.com/hashicorp/consul/types/checks.go
new file mode 100644
index 00000000000..25a136b4f4b
--- /dev/null
+++ b/vendor/github.com/hashicorp/consul/types/checks.go
@@ -0,0 +1,5 @@
package types

// CheckID is a strongly typed string used to uniquely represent a Consul
// Check on an Agent (a CheckID is not globally unique).
+type CheckID string diff --git a/vendor/github.com/hashicorp/consul/vendor/github.com/hashicorp/raft/LICENSE b/vendor/github.com/hashicorp/consul/vendor/github.com/hashicorp/raft/LICENSE new file mode 100644 index 00000000000..c33dcc7c928 --- /dev/null +++ b/vendor/github.com/hashicorp/consul/vendor/github.com/hashicorp/raft/LICENSE @@ -0,0 +1,354 @@ +Mozilla Public License, version 2.0 + +1. Definitions + +1.1. “Contributor” + + means each individual or legal entity that creates, contributes to the + creation of, or owns Covered Software. + +1.2. “Contributor Version” + + means the combination of the Contributions of others (if any) used by a + Contributor and that particular Contributor’s Contribution. + +1.3. “Contribution” + + means Covered Software of a particular Contributor. + +1.4. “Covered Software” + + means Source Code Form to which the initial Contributor has attached the + notice in Exhibit A, the Executable Form of such Source Code Form, and + Modifications of such Source Code Form, in each case including portions + thereof. + +1.5. “Incompatible With Secondary Licenses” + means + + a. that the initial Contributor has attached the notice described in + Exhibit B to the Covered Software; or + + b. that the Covered Software was made available under the terms of version + 1.1 or earlier of the License, but not also under the terms of a + Secondary License. + +1.6. “Executable Form” + + means any form of the work other than Source Code Form. + +1.7. “Larger Work” + + means a work that combines Covered Software with other material, in a separate + file or files, that is not Covered Software. + +1.8. “License” + + means this document. + +1.9. “Licensable” + + means having the right to grant, to the maximum extent possible, whether at the + time of the initial grant or subsequently, any and all of the rights conveyed by + this License. + +1.10. “Modifications” + + means any of the following: + + a. 
any file in Source Code Form that results from an addition to, deletion + from, or modification of the contents of Covered Software; or + + b. any new file in Source Code Form that contains any Covered Software. + +1.11. “Patent Claims” of a Contributor + + means any patent claim(s), including without limitation, method, process, + and apparatus claims, in any patent Licensable by such Contributor that + would be infringed, but for the grant of the License, by the making, + using, selling, offering for sale, having made, import, or transfer of + either its Contributions or its Contributor Version. + +1.12. “Secondary License” + + means either the GNU General Public License, Version 2.0, the GNU Lesser + General Public License, Version 2.1, the GNU Affero General Public + License, Version 3.0, or any later versions of those licenses. + +1.13. “Source Code Form” + + means the form of the work preferred for making modifications. + +1.14. “You” (or “Your”) + + means an individual or a legal entity exercising rights under this + License. For legal entities, “You” includes any entity that controls, is + controlled by, or is under common control with You. For purposes of this + definition, “control” means (a) the power, direct or indirect, to cause + the direction or management of such entity, whether by contract or + otherwise, or (b) ownership of more than fifty percent (50%) of the + outstanding shares or beneficial ownership of such entity. + + +2. License Grants and Conditions + +2.1. Grants + + Each Contributor hereby grants You a world-wide, royalty-free, + non-exclusive license: + + a. under intellectual property rights (other than patent or trademark) + Licensable by such Contributor to use, reproduce, make available, + modify, display, perform, distribute, and otherwise exploit its + Contributions, either on an unmodified basis, with Modifications, or as + part of a Larger Work; and + + b. 
under Patent Claims of such Contributor to make, use, sell, offer for + sale, have made, import, and otherwise transfer either its Contributions + or its Contributor Version. + +2.2. Effective Date + + The licenses granted in Section 2.1 with respect to any Contribution become + effective for each Contribution on the date the Contributor first distributes + such Contribution. + +2.3. Limitations on Grant Scope + + The licenses granted in this Section 2 are the only rights granted under this + License. No additional rights or licenses will be implied from the distribution + or licensing of Covered Software under this License. Notwithstanding Section + 2.1(b) above, no patent license is granted by a Contributor: + + a. for any code that a Contributor has removed from Covered Software; or + + b. for infringements caused by: (i) Your and any other third party’s + modifications of Covered Software, or (ii) the combination of its + Contributions with other software (except as part of its Contributor + Version); or + + c. under Patent Claims infringed by Covered Software in the absence of its + Contributions. + + This License does not grant any rights in the trademarks, service marks, or + logos of any Contributor (except as may be necessary to comply with the + notice requirements in Section 3.4). + +2.4. Subsequent Licenses + + No Contributor makes additional grants as a result of Your choice to + distribute the Covered Software under a subsequent version of this License + (see Section 10.2) or under the terms of a Secondary License (if permitted + under the terms of Section 3.3). + +2.5. Representation + + Each Contributor represents that the Contributor believes its Contributions + are its original creation(s) or it has sufficient rights to grant the + rights to its Contributions conveyed by this License. + +2.6. 
Fair Use + + This License is not intended to limit any rights You have under applicable + copyright doctrines of fair use, fair dealing, or other equivalents. + +2.7. Conditions + + Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted in + Section 2.1. + + +3. Responsibilities + +3.1. Distribution of Source Form + + All distribution of Covered Software in Source Code Form, including any + Modifications that You create or to which You contribute, must be under the + terms of this License. You must inform recipients that the Source Code Form + of the Covered Software is governed by the terms of this License, and how + they can obtain a copy of this License. You may not attempt to alter or + restrict the recipients’ rights in the Source Code Form. + +3.2. Distribution of Executable Form + + If You distribute Covered Software in Executable Form then: + + a. such Covered Software must also be made available in Source Code Form, + as described in Section 3.1, and You must inform recipients of the + Executable Form how they can obtain a copy of such Source Code Form by + reasonable means in a timely manner, at a charge no more than the cost + of distribution to the recipient; and + + b. You may distribute such Executable Form under the terms of this License, + or sublicense it under different terms, provided that the license for + the Executable Form does not attempt to limit or alter the recipients’ + rights in the Source Code Form under this License. + +3.3. Distribution of a Larger Work + + You may create and distribute a Larger Work under terms of Your choice, + provided that You also comply with the requirements of this License for the + Covered Software. 
If the Larger Work is a combination of Covered Software + with a work governed by one or more Secondary Licenses, and the Covered + Software is not Incompatible With Secondary Licenses, this License permits + You to additionally distribute such Covered Software under the terms of + such Secondary License(s), so that the recipient of the Larger Work may, at + their option, further distribute the Covered Software under the terms of + either this License or such Secondary License(s). + +3.4. Notices + + You may not remove or alter the substance of any license notices (including + copyright notices, patent notices, disclaimers of warranty, or limitations + of liability) contained within the Source Code Form of the Covered + Software, except that You may alter any license notices to the extent + required to remedy known factual inaccuracies. + +3.5. Application of Additional Terms + + You may choose to offer, and to charge a fee for, warranty, support, + indemnity or liability obligations to one or more recipients of Covered + Software. However, You may do so only on Your own behalf, and not on behalf + of any Contributor. You must make it absolutely clear that any such + warranty, support, indemnity, or liability obligation is offered by You + alone, and You hereby agree to indemnify every Contributor for any + liability incurred by such Contributor as a result of warranty, support, + indemnity or liability terms You offer. You may include additional + disclaimers of warranty and limitations of liability specific to any + jurisdiction. + +4. Inability to Comply Due to Statute or Regulation + + If it is impossible for You to comply with any of the terms of this License + with respect to some or all of the Covered Software due to statute, judicial + order, or regulation then You must: (a) comply with the terms of this License + to the maximum extent possible; and (b) describe the limitations and the code + they affect. 
Such description must be placed in a text file included with all + distributions of the Covered Software under this License. Except to the + extent prohibited by statute or regulation, such description must be + sufficiently detailed for a recipient of ordinary skill to be able to + understand it. + +5. Termination + +5.1. The rights granted under this License will terminate automatically if You + fail to comply with any of its terms. However, if You become compliant, + then the rights granted under this License from a particular Contributor + are reinstated (a) provisionally, unless and until such Contributor + explicitly and finally terminates Your grants, and (b) on an ongoing basis, + if such Contributor fails to notify You of the non-compliance by some + reasonable means prior to 60 days after You have come back into compliance. + Moreover, Your grants from a particular Contributor are reinstated on an + ongoing basis if such Contributor notifies You of the non-compliance by + some reasonable means, this is the first time You have received notice of + non-compliance with this License from such Contributor, and You become + compliant prior to 30 days after Your receipt of the notice. + +5.2. If You initiate litigation against any entity by asserting a patent + infringement claim (excluding declaratory judgment actions, counter-claims, + and cross-claims) alleging that a Contributor Version directly or + indirectly infringes any patent, then the rights granted to You by any and + all Contributors for the Covered Software under Section 2.1 of this License + shall terminate. + +5.3. In the event of termination under Sections 5.1 or 5.2 above, all end user + license agreements (excluding distributors and resellers) which have been + validly granted by You or Your distributors under this License prior to + termination shall survive termination. + +6. 
Disclaimer of Warranty + + Covered Software is provided under this License on an “as is” basis, without + warranty of any kind, either expressed, implied, or statutory, including, + without limitation, warranties that the Covered Software is free of defects, + merchantable, fit for a particular purpose or non-infringing. The entire + risk as to the quality and performance of the Covered Software is with You. + Should any Covered Software prove defective in any respect, You (not any + Contributor) assume the cost of any necessary servicing, repair, or + correction. This disclaimer of warranty constitutes an essential part of this + License. No use of any Covered Software is authorized under this License + except under this disclaimer. + +7. Limitation of Liability + + Under no circumstances and under no legal theory, whether tort (including + negligence), contract, or otherwise, shall any Contributor, or anyone who + distributes Covered Software as permitted above, be liable to You for any + direct, indirect, special, incidental, or consequential damages of any + character including, without limitation, damages for lost profits, loss of + goodwill, work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses, even if such party shall have been + informed of the possibility of such damages. This limitation of liability + shall not apply to liability for death or personal injury resulting from such + party’s negligence to the extent applicable law prohibits such limitation. + Some jurisdictions do not allow the exclusion or limitation of incidental or + consequential damages, so this exclusion and limitation may not apply to You. + +8. Litigation + + Any litigation relating to this License may be brought only in the courts of + a jurisdiction where the defendant maintains its principal place of business + and such litigation shall be governed by laws of that jurisdiction, without + reference to its conflict-of-law provisions. 
Nothing in this Section shall + prevent a party’s ability to bring cross-claims or counter-claims. + +9. Miscellaneous + + This License represents the complete agreement concerning the subject matter + hereof. If any provision of this License is held to be unenforceable, such + provision shall be reformed only to the extent necessary to make it + enforceable. Any law or regulation which provides that the language of a + contract shall be construed against the drafter shall not be used to construe + this License against a Contributor. + + +10. Versions of the License + +10.1. New Versions + + Mozilla Foundation is the license steward. Except as provided in Section + 10.3, no one other than the license steward has the right to modify or + publish new versions of this License. Each version will be given a + distinguishing version number. + +10.2. Effect of New Versions + + You may distribute the Covered Software under the terms of the version of + the License under which You originally received the Covered Software, or + under the terms of any subsequent version published by the license + steward. + +10.3. Modified Versions + + If you create software not governed by this License, and you want to + create a new license for such software, you may create and use a modified + version of this License if you rename the license and remove any + references to the name of the license steward (except to note that such + modified license differs from this License). + +10.4. Distributing Source Code Form that is Incompatible With Secondary Licenses + If You choose to distribute Source Code Form that is Incompatible With + Secondary Licenses under the terms of this version of the License, the + notice described in Exhibit B of this License must be attached. + +Exhibit A - Source Code Form License Notice + + This Source Code Form is subject to the + terms of the Mozilla Public License, v. + 2.0. 
If a copy of the MPL was not + distributed with this file, You can + obtain one at + http://mozilla.org/MPL/2.0/. + +If it is not possible or desirable to put the notice in a particular file, then +You may include the notice in a location (such as a LICENSE file in a relevant +directory) where a recipient would be likely to look for such a notice. + +You may add additional accurate notices of copyright ownership. + +Exhibit B - “Incompatible With Secondary Licenses” Notice + + This Source Code Form is “Incompatible + With Secondary Licenses”, as defined by + the Mozilla Public License, v. 2.0. + diff --git a/vendor/github.com/hashicorp/consul/vendor/github.com/hashicorp/raft/Makefile b/vendor/github.com/hashicorp/consul/vendor/github.com/hashicorp/raft/Makefile new file mode 100644 index 00000000000..61499c50760 --- /dev/null +++ b/vendor/github.com/hashicorp/consul/vendor/github.com/hashicorp/raft/Makefile @@ -0,0 +1,17 @@ +DEPS = $(go list -f '{{range .TestImports}}{{.}} {{end}}' ./...) + +test: + go test -timeout=45s ./... + +integ: test + INTEG_TESTS=yes go test -timeout=3s -run=Integ ./... + +deps: + go get -d -v ./... + echo $(DEPS) | xargs -n1 go get -d + +cov: + INTEG_TESTS=yes gocov test github.com/hashicorp/raft | gocov-html > /tmp/coverage.html + open /tmp/coverage.html + +.PHONY: test cov integ deps diff --git a/vendor/github.com/hashicorp/consul/vendor/github.com/hashicorp/raft/README.md b/vendor/github.com/hashicorp/consul/vendor/github.com/hashicorp/raft/README.md new file mode 100644 index 00000000000..8778b13dc5c --- /dev/null +++ b/vendor/github.com/hashicorp/consul/vendor/github.com/hashicorp/raft/README.md @@ -0,0 +1,89 @@ +raft [![Build Status](https://travis-ci.org/hashicorp/raft.png)](https://travis-ci.org/hashicorp/raft) +==== + +raft is a [Go](http://www.golang.org) library that manages a replicated +log and can be used with an FSM to manage replicated state machines. 
It +is library for providing [consensus](http://en.wikipedia.org/wiki/Consensus_(computer_science)). + +The use cases for such a library are far-reaching as replicated state +machines are a key component of many distributed systems. They enable +building Consistent, Partition Tolerant (CP) systems, with limited +fault tolerance as well. + +## Building + +If you wish to build raft you'll need Go version 1.2+ installed. + +Please check your installation with: + +``` +go version +``` + +## Documentation + +For complete documentation, see the associated [Godoc](http://godoc.org/github.com/hashicorp/raft). + +To prevent complications with cgo, the primary backend `MDBStore` is in a separate repository, +called [raft-mdb](http://github.com/hashicorp/raft-mdb). That is the recommended implementation +for the `LogStore` and `StableStore`. + +A pure Go backend using [BoltDB](https://github.com/boltdb/bolt) is also available called +[raft-boltdb](https://github.com/hashicorp/raft-boltdb). It can also be used as a `LogStore` +and `StableStore`. + +## Protocol + +raft is based on ["Raft: In Search of an Understandable Consensus Algorithm"](https://ramcloud.stanford.edu/wiki/download/attachments/11370504/raft.pdf) + +A high level overview of the Raft protocol is described below, but for details please read the full +[Raft paper](https://ramcloud.stanford.edu/wiki/download/attachments/11370504/raft.pdf) +followed by the raft source. Any questions about the raft protocol should be sent to the +[raft-dev mailing list](https://groups.google.com/forum/#!forum/raft-dev). + +### Protocol Description + +Raft nodes are always in one of three states: follower, candidate or leader. All +nodes initially start out as a follower. In this state, nodes can accept log entries +from a leader and cast votes. If no entries are received for some time, nodes +self-promote to the candidate state. In the candidate state nodes request votes from +their peers. 
If a candidate receives a quorum of votes, then it is promoted to a leader. +The leader must accept new log entries and replicate to all the other followers. +In addition, if stale reads are not acceptable, all queries must also be performed on +the leader. + +Once a cluster has a leader, it is able to accept new log entries. A client can +request that a leader append a new log entry, which is an opaque binary blob to +Raft. The leader then writes the entry to durable storage and attempts to replicate +to a quorum of followers. Once the log entry is considered *committed*, it can be +*applied* to a finite state machine. The finite state machine is application specific, +and is implemented using an interface. + +An obvious question relates to the unbounded nature of a replicated log. Raft provides +a mechanism by which the current state is snapshotted, and the log is compacted. Because +of the FSM abstraction, restoring the state of the FSM must result in the same state +as a replay of old logs. This allows Raft to capture the FSM state at a point in time, +and then remove all the logs that were used to reach that state. This is performed automatically +without user intervention, and prevents unbounded disk usage as well as minimizing +time spent replaying logs. + +Lastly, there is the issue of updating the peer set when new servers are joining +or existing servers are leaving. As long as a quorum of nodes is available, this +is not an issue as Raft provides mechanisms to dynamically update the peer set. +If a quorum of nodes is unavailable, then this becomes a very challenging issue. +For example, suppose there are only 2 peers, A and B. The quorum size is also +2, meaning both nodes must agree to commit a log entry. If either A or B fails, +it is now impossible to reach quorum. This means the cluster is unable to add, +or remove a node, or commit any additional log entries. This results in *unavailability*. 
+At this point, manual intervention would be required to remove either A or B, +and to restart the remaining node in bootstrap mode. + +A Raft cluster of 3 nodes can tolerate a single node failure, while a cluster +of 5 can tolerate 2 node failures. The recommended configuration is to either +run 3 or 5 raft servers. This maximizes availability without +greatly sacrificing performance. + +In terms of performance, Raft is comparable to Paxos. Assuming stable leadership, +committing a log entry requires a single round trip to half of the cluster. +Thus performance is bound by disk I/O and network latency. + diff --git a/vendor/github.com/hashicorp/consul/vendor/github.com/hashicorp/raft/api.go b/vendor/github.com/hashicorp/consul/vendor/github.com/hashicorp/raft/api.go new file mode 100644 index 00000000000..ff14131c4aa --- /dev/null +++ b/vendor/github.com/hashicorp/consul/vendor/github.com/hashicorp/raft/api.go @@ -0,0 +1,935 @@ +package raft + +import ( + "errors" + "fmt" + "log" + "os" + "strconv" + "sync" + "time" + + "github.com/armon/go-metrics" +) + +var ( + // ErrLeader is returned when an operation can't be completed on a + // leader node. + ErrLeader = errors.New("node is the leader") + + // ErrNotLeader is returned when an operation can't be completed on a + // follower or candidate node. + ErrNotLeader = errors.New("node is not the leader") + + // ErrLeadershipLost is returned when a leader fails to commit a log entry + // because it's been deposed in the process. + ErrLeadershipLost = errors.New("leadership lost while committing log") + + // ErrRaftShutdown is returned when operations are requested against an + // inactive Raft. + ErrRaftShutdown = errors.New("raft is already shutdown") + + // ErrEnqueueTimeout is returned when a command fails due to a timeout. 
+ ErrEnqueueTimeout = errors.New("timed out enqueuing operation") + + // ErrNothingNewToSnapshot is returned when trying to create a snapshot + // but there's nothing new committed to the FSM since we started. + ErrNothingNewToSnapshot = errors.New("nothing new to snapshot") + + // ErrUnsupportedProtocol is returned when an operation is attempted + // that's not supported by the current protocol version. + ErrUnsupportedProtocol = errors.New("operation not supported with current protocol version") + + // ErrCantBootstrap is returned when an attempt is made to bootstrap a + // cluster that already has state present. + ErrCantBootstrap = errors.New("bootstrap only works on new clusters") +) + +// Raft implements a Raft node. +type Raft struct { + raftState + + // protocolVersion is used to inter-operate with Raft servers running + // different versions of the library. See comments in config.go for more + // details. + protocolVersion ProtocolVersion + + // applyCh is used to async send logs to the main thread to + // be committed and applied to the FSM. + applyCh chan *logFuture + + // Configuration provided at Raft initialization + conf Config + + // FSM is the client state machine to apply commands to + fsm FSM + + // fsmCommitCh is used to trigger async application of logs to the fsm + fsmCommitCh chan commitTuple + + // fsmRestoreCh is used to trigger a restore from snapshot + fsmRestoreCh chan *restoreFuture + + // fsmSnapshotCh is used to trigger a new snapshot being taken + fsmSnapshotCh chan *reqSnapshotFuture + + // lastContact is the last time we had contact from the + // leader node. This can be used to gauge staleness.
+ lastContact time.Time + lastContactLock sync.RWMutex + + // Leader is the current cluster leader + leader ServerAddress + leaderLock sync.RWMutex + + // leaderCh is used to notify of leadership changes + leaderCh chan bool + + // leaderState used only while state is leader + leaderState leaderState + + // Stores our local server ID, used to avoid sending RPCs to ourself + localID ServerID + + // Stores our local addr + localAddr ServerAddress + + // Used for our logging + logger *log.Logger + + // LogStore provides durable storage for logs + logs LogStore + + // Used to request the leader to make configuration changes. + configurationChangeCh chan *configurationChangeFuture + + // Tracks the latest configuration and latest committed configuration from + // the log/snapshot. + configurations configurations + + // RPC chan comes from the transport layer + rpcCh <-chan RPC + + // Shutdown channel to exit, protected to prevent concurrent exits + shutdown bool + shutdownCh chan struct{} + shutdownLock sync.Mutex + + // snapshots is used to store and retrieve snapshots + snapshots SnapshotStore + + // snapshotCh is used for user triggered snapshots + snapshotCh chan *snapshotFuture + + // stable is a StableStore implementation for durable state + // It provides stable storage for many fields in raftState + stable StableStore + + // The transport layer we use + trans Transport + + // verifyCh is used to async send verify futures to the main thread + // to verify we are still the leader + verifyCh chan *verifyFuture + + // configurationsCh is used to get the configuration data safely from + // outside of the main thread. + configurationsCh chan *configurationsFuture + + // bootstrapCh is used to attempt an initial bootstrap from outside of + // the main thread. + bootstrapCh chan *bootstrapFuture + + // List of observers and the mutex that protects them. The observers list + // is indexed by an artificial ID which is used for deregistration. 
+ observersLock sync.RWMutex + observers map[uint64]*Observer +} + +// BootstrapCluster initializes a server's storage with the given cluster +// configuration. This should only be called at the beginning of time for the +// cluster, and you absolutely must make sure that you call it with the same +// configuration on all the Voter servers. There is no need to bootstrap +// Nonvoter and Staging servers. +// +// One sane approach is to bootstrap a single server with a configuration +// listing just itself as a Voter, then invoke AddVoter() on it to add other +// servers to the cluster. +func BootstrapCluster(conf *Config, logs LogStore, stable StableStore, + snaps SnapshotStore, trans Transport, configuration Configuration) error { + // Validate the Raft server config. + if err := ValidateConfig(conf); err != nil { + return err + } + + // Sanity check the Raft peer configuration. + if err := checkConfiguration(configuration); err != nil { + return err + } + + // Make sure the cluster is in a clean state. + hasState, err := HasExistingState(logs, stable, snaps) + if err != nil { + return fmt.Errorf("failed to check for existing state: %v", err) + } + if hasState { + return ErrCantBootstrap + } + + // Set current term to 1. + if err := stable.SetUint64(keyCurrentTerm, 1); err != nil { + return fmt.Errorf("failed to save current term: %v", err) + } + + // Append configuration entry to log.
+ entry := &Log{ + Index: 1, + Term: 1, + } + if conf.ProtocolVersion < 3 { + entry.Type = LogRemovePeerDeprecated + entry.Data = encodePeers(configuration, trans) + } else { + entry.Type = LogConfiguration + entry.Data = encodeConfiguration(configuration) + } + if err := logs.StoreLog(entry); err != nil { + return fmt.Errorf("failed to append configuration entry to log: %v", err) + } + + return nil +} + +// RecoverCluster is used to manually force a new configuration in order to +// recover from a loss of quorum where the current configuration cannot be +// restored, such as when several servers die at the same time. This works by +// reading all the current state for this server, creating a snapshot with the +// supplied configuration, and then truncating the Raft log. This is the only +// safe way to force a given configuration without actually altering the log to +// insert any new entries, which could cause conflicts with other servers with +// different state. +// +// WARNING! This operation implicitly commits all entries in the Raft log, so +// in general this is an extremely unsafe operation. If you've lost your other +// servers and are performing a manual recovery, then you've also lost the +// commit information, so this is likely the best you can do, but you should be +// aware that calling this can cause Raft log entries that were in the process +// of being replicated but not yet committed to be committed. +// +// Note the FSM passed here is used for the snapshot operations and will be +// left in a state that should not be used by the application. Be sure to +// discard this FSM and any associated state and provide a fresh one when +// calling NewRaft later. +// +// A typical way to recover the cluster is to shut down all servers and then +// run RecoverCluster on every server using an identical configuration. When +// the cluster is then restarted, an election should occur and then Raft will +// resume normal operation.
If it's desired to make a particular server the +// leader, this can be used to inject a new configuration with that server as +// the sole voter, and then join up other new clean-state peer servers using +// the usual APIs in order to bring the cluster back into a known state. +func RecoverCluster(conf *Config, fsm FSM, logs LogStore, stable StableStore, + snaps SnapshotStore, trans Transport, configuration Configuration) error { + // Validate the Raft server config. + if err := ValidateConfig(conf); err != nil { + return err + } + + // Sanity check the Raft peer configuration. + if err := checkConfiguration(configuration); err != nil { + return err + } + + // Refuse to recover if there's no existing state. This would be safe to + // do, but it is likely an indication of an operator error where they + // expect data to be there and it's not. By refusing, we force them + // to show intent to start a cluster fresh by explicitly doing a + // bootstrap, rather than quietly fire up a fresh cluster here. + hasState, err := HasExistingState(logs, stable, snaps) + if err != nil { + return fmt.Errorf("failed to check for existing state: %v", err) + } + if !hasState { + return fmt.Errorf("refused to recover cluster with no initial state, this is probably an operator error") + } + + // Attempt to restore any snapshots we find, newest to oldest. + var snapshotIndex uint64 + var snapshotTerm uint64 + snapshots, err := snaps.List() + if err != nil { + return fmt.Errorf("failed to list snapshots: %v", err) + } + for _, snapshot := range snapshots { + _, source, err := snaps.Open(snapshot.ID) + if err != nil { + // Skip this one and try the next. We will detect if we + // couldn't open any snapshots. + continue + } + defer source.Close() + + if err := fsm.Restore(source); err != nil { + // Same here, skip and try the next one. 
+ continue + } + + snapshotIndex = snapshot.Index + snapshotTerm = snapshot.Term + break + } + if len(snapshots) > 0 && (snapshotIndex == 0 || snapshotTerm == 0) { + return fmt.Errorf("failed to restore any of the available snapshots") + } + + // The snapshot information is the best known end point for the data + // until we play back the Raft log entries. + lastIndex := snapshotIndex + lastTerm := snapshotTerm + + // Apply any Raft log entries past the snapshot. + lastLogIndex, err := logs.LastIndex() + if err != nil { + return fmt.Errorf("failed to find last log: %v", err) + } + for index := snapshotIndex + 1; index <= lastLogIndex; index++ { + var entry Log + if err := logs.GetLog(index, &entry); err != nil { + return fmt.Errorf("failed to get log at index %d: %v", index, err) + } + if entry.Type == LogCommand { + _ = fsm.Apply(&entry) + } + lastIndex = entry.Index + lastTerm = entry.Term + } + + // Create a new snapshot, placing the configuration in as if it was + // committed at index 1. + snapshot, err := fsm.Snapshot() + if err != nil { + return fmt.Errorf("failed to snapshot FSM: %v", err) + } + version := getSnapshotVersion(conf.ProtocolVersion) + sink, err := snaps.Create(version, lastIndex, lastTerm, configuration, 1, trans) + if err != nil { + return fmt.Errorf("failed to create snapshot: %v", err) + } + if err := snapshot.Persist(sink); err != nil { + return fmt.Errorf("failed to persist snapshot: %v", err) + } + if err := sink.Close(); err != nil { + return fmt.Errorf("failed to finalize snapshot: %v", err) + } + + // Compact the log so that we don't get bad interference from any + // configuration change log entries that might be there. 
+ firstLogIndex, err := logs.FirstIndex() + if err != nil { + return fmt.Errorf("failed to get first log index: %v", err) + } + if err := logs.DeleteRange(firstLogIndex, lastLogIndex); err != nil { + return fmt.Errorf("log compaction failed: %v", err) + } + + return nil +} + +// HasExistingState returns true if the server has any existing state (logs, +// knowledge of a current term, or any snapshots). +func HasExistingState(logs LogStore, stable StableStore, snaps SnapshotStore) (bool, error) { + // Make sure we don't have a current term. + currentTerm, err := stable.GetUint64(keyCurrentTerm) + if err == nil { + if currentTerm > 0 { + return true, nil + } + } else { + if err.Error() != "not found" { + return false, fmt.Errorf("failed to read current term: %v", err) + } + } + + // Make sure we have an empty log. + lastIndex, err := logs.LastIndex() + if err != nil { + return false, fmt.Errorf("failed to get last log index: %v", err) + } + if lastIndex > 0 { + return true, nil + } + + // Make sure we have no snapshots + snapshots, err := snaps.List() + if err != nil { + return false, fmt.Errorf("failed to list snapshots: %v", err) + } + if len(snapshots) > 0 { + return true, nil + } + + return false, nil +} + +// NewRaft is used to construct a new Raft node. It takes a configuration, as well +// as implementations of various interfaces that are required. If we have any +// old state, such as snapshots, logs, peers, etc, all those will be restored +// when creating the Raft node. +func NewRaft(conf *Config, fsm FSM, logs LogStore, stable StableStore, snaps SnapshotStore, trans Transport) (*Raft, error) { + // Validate the configuration. + if err := ValidateConfig(conf); err != nil { + return nil, err + } + + // Ensure we have a LogOutput. 
+ var logger *log.Logger + if conf.Logger != nil { + logger = conf.Logger + } else { + if conf.LogOutput == nil { + conf.LogOutput = os.Stderr + } + logger = log.New(conf.LogOutput, "", log.LstdFlags) + } + + // Try to restore the current term. + currentTerm, err := stable.GetUint64(keyCurrentTerm) + if err != nil && err.Error() != "not found" { + return nil, fmt.Errorf("failed to load current term: %v", err) + } + + // Read the index of the last log entry. + lastIndex, err := logs.LastIndex() + if err != nil { + return nil, fmt.Errorf("failed to find last log: %v", err) + } + + // Get the last log entry. + var lastLog Log + if lastIndex > 0 { + if err = logs.GetLog(lastIndex, &lastLog); err != nil { + return nil, fmt.Errorf("failed to get last log at index %d: %v", lastIndex, err) + } + } + + // Make sure we have a valid server address and ID. + protocolVersion := conf.ProtocolVersion + localAddr := ServerAddress(trans.LocalAddr()) + localID := conf.LocalID + + // TODO (slackpad) - When we deprecate protocol version 2, remove this + // along with the AddPeer() and RemovePeer() APIs. + if protocolVersion < 3 && string(localID) != string(localAddr) { + return nil, fmt.Errorf("when running with ProtocolVersion < 3, LocalID must be set to the network address") + } + + // Create Raft struct. 
+ r := &Raft{ + protocolVersion: protocolVersion, + applyCh: make(chan *logFuture), + conf: *conf, + fsm: fsm, + fsmCommitCh: make(chan commitTuple, 128), + fsmRestoreCh: make(chan *restoreFuture), + fsmSnapshotCh: make(chan *reqSnapshotFuture), + leaderCh: make(chan bool), + localID: localID, + localAddr: localAddr, + logger: logger, + logs: logs, + configurationChangeCh: make(chan *configurationChangeFuture), + configurations: configurations{}, + rpcCh: trans.Consumer(), + snapshots: snaps, + snapshotCh: make(chan *snapshotFuture), + shutdownCh: make(chan struct{}), + stable: stable, + trans: trans, + verifyCh: make(chan *verifyFuture, 64), + configurationsCh: make(chan *configurationsFuture, 8), + bootstrapCh: make(chan *bootstrapFuture), + observers: make(map[uint64]*Observer), + } + + // Initialize as a follower. + r.setState(Follower) + + // Start as leader if specified. This should only be used + // for testing purposes. + if conf.StartAsLeader { + r.setState(Leader) + r.setLeader(r.localAddr) + } + + // Restore the current term and the last log. + r.setCurrentTerm(currentTerm) + r.setLastLog(lastLog.Index, lastLog.Term) + + // Attempt to restore a snapshot if there are any. + if err := r.restoreSnapshot(); err != nil { + return nil, err + } + + // Scan through the log for any configuration change entries. + snapshotIndex, _ := r.getLastSnapshot() + for index := snapshotIndex + 1; index <= lastLog.Index; index++ { + var entry Log + if err := r.logs.GetLog(index, &entry); err != nil { + r.logger.Printf("[ERR] raft: Failed to get log at %d: %v", index, err) + panic(err) + } + r.processConfigurationLogEntry(&entry) + } + r.logger.Printf("[INFO] raft: Initial configuration (index=%d): %+v", + r.configurations.latestIndex, r.configurations.latest.Servers) + + // Setup a heartbeat fast-path to avoid head-of-line + // blocking where possible. It MUST be safe for this + // to be called concurrently with a blocking RPC. 
+ trans.SetHeartbeatHandler(r.processHeartbeat) + + // Start the background work. + r.goFunc(r.run) + r.goFunc(r.runFSM) + r.goFunc(r.runSnapshots) + return r, nil +} + +// restoreSnapshot attempts to restore the latest snapshots, and fails if none +// of them can be restored. This is called at initialization time, and is +// completely unsafe to call at any other time. +func (r *Raft) restoreSnapshot() error { + snapshots, err := r.snapshots.List() + if err != nil { + r.logger.Printf("[ERR] raft: Failed to list snapshots: %v", err) + return err + } + + // Try to load in order of newest to oldest + for _, snapshot := range snapshots { + _, source, err := r.snapshots.Open(snapshot.ID) + if err != nil { + r.logger.Printf("[ERR] raft: Failed to open snapshot %v: %v", snapshot.ID, err) + continue + } + defer source.Close() + + if err := r.fsm.Restore(source); err != nil { + r.logger.Printf("[ERR] raft: Failed to restore snapshot %v: %v", snapshot.ID, err) + continue + } + + // Log success + r.logger.Printf("[INFO] raft: Restored from snapshot %v", snapshot.ID) + + // Update the lastApplied so we don't replay old logs + r.setLastApplied(snapshot.Index) + + // Update the last stable snapshot info + r.setLastSnapshot(snapshot.Index, snapshot.Term) + + // Update the configuration + if snapshot.Version > 0 { + r.configurations.committed = snapshot.Configuration + r.configurations.committedIndex = snapshot.ConfigurationIndex + r.configurations.latest = snapshot.Configuration + r.configurations.latestIndex = snapshot.ConfigurationIndex + } else { + configuration := decodePeers(snapshot.Peers, r.trans) + r.configurations.committed = configuration + r.configurations.committedIndex = snapshot.Index + r.configurations.latest = configuration + r.configurations.latestIndex = snapshot.Index + } + + // Success! 
+ return nil + } + + // If we had snapshots and failed to load them, it's an error + if len(snapshots) > 0 { + return fmt.Errorf("failed to load any existing snapshots") + } + return nil +} + +// BootstrapCluster is equivalent to non-member BootstrapCluster but can be +// called on an un-bootstrapped Raft instance after it has been created. This +// should only be called at the beginning of time for the cluster, and you +// absolutely must make sure that you call it with the same configuration on all +// the Voter servers. There is no need to bootstrap Nonvoter and Staging +// servers. +func (r *Raft) BootstrapCluster(configuration Configuration) Future { + bootstrapReq := &bootstrapFuture{} + bootstrapReq.init() + bootstrapReq.configuration = configuration + select { + case <-r.shutdownCh: + return errorFuture{ErrRaftShutdown} + case r.bootstrapCh <- bootstrapReq: + return bootstrapReq + } +} + +// Leader is used to return the current leader of the cluster. +// It may return an empty string if there is no current leader +// or the leader is unknown. +func (r *Raft) Leader() ServerAddress { + r.leaderLock.RLock() + leader := r.leader + r.leaderLock.RUnlock() + return leader +} + +// Apply is used to apply a command to the FSM in a highly consistent +// manner. This returns a future that can be used to wait on the application. +// An optional timeout can be provided to limit the amount of time we wait +// for the command to be started. This must be run on the leader or it +// will fail.
+func (r *Raft) Apply(cmd []byte, timeout time.Duration) ApplyFuture { + metrics.IncrCounter([]string{"raft", "apply"}, 1) + var timer <-chan time.Time + if timeout > 0 { + timer = time.After(timeout) + } + + // Create a log future, no index or term yet + logFuture := &logFuture{ + log: Log{ + Type: LogCommand, + Data: cmd, + }, + } + logFuture.init() + + select { + case <-timer: + return errorFuture{ErrEnqueueTimeout} + case <-r.shutdownCh: + return errorFuture{ErrRaftShutdown} + case r.applyCh <- logFuture: + return logFuture + } +} + +// Barrier is used to issue a command that blocks until all preceding +// operations have been applied to the FSM. It can be used to ensure the +// FSM reflects all queued writes. An optional timeout can be provided to +// limit the amount of time we wait for the command to be started. This +// must be run on the leader or it will fail. +func (r *Raft) Barrier(timeout time.Duration) Future { + metrics.IncrCounter([]string{"raft", "barrier"}, 1) + var timer <-chan time.Time + if timeout > 0 { + timer = time.After(timeout) + } + + // Create a log future, no index or term yet + logFuture := &logFuture{ + log: Log{ + Type: LogBarrier, + }, + } + logFuture.init() + + select { + case <-timer: + return errorFuture{ErrEnqueueTimeout} + case <-r.shutdownCh: + return errorFuture{ErrRaftShutdown} + case r.applyCh <- logFuture: + return logFuture + } +} + +// VerifyLeader is used to ensure the current node is still +// the leader. This can be done to prevent stale reads when a +// new leader has potentially been elected. +func (r *Raft) VerifyLeader() Future { + metrics.IncrCounter([]string{"raft", "verify_leader"}, 1) + verifyFuture := &verifyFuture{} + verifyFuture.init() + select { + case <-r.shutdownCh: + return errorFuture{ErrRaftShutdown} + case r.verifyCh <- verifyFuture: + return verifyFuture + } +} + +// GetConfiguration returns the latest configuration and its associated index +// currently in use. This may not yet be committed.
This must not be called on +// the main thread (which can access the information directly). +func (r *Raft) GetConfiguration() ConfigurationFuture { + configReq := &configurationsFuture{} + configReq.init() + select { + case <-r.shutdownCh: + configReq.respond(ErrRaftShutdown) + return configReq + case r.configurationsCh <- configReq: + return configReq + } +} + +// AddPeer (deprecated) is used to add a new peer into the cluster. This must be +// run on the leader or it will fail. Use AddVoter/AddNonvoter instead. +func (r *Raft) AddPeer(peer ServerAddress) Future { + if r.protocolVersion > 2 { + return errorFuture{ErrUnsupportedProtocol} + } + + return r.requestConfigChange(configurationChangeRequest{ + command: AddStaging, + serverID: ServerID(peer), + serverAddress: peer, + prevIndex: 0, + }, 0) +} + +// RemovePeer (deprecated) is used to remove a peer from the cluster. If the +// current leader is being removed, it will cause a new election +// to occur. This must be run on the leader or it will fail. +// Use RemoveServer instead. +func (r *Raft) RemovePeer(peer ServerAddress) Future { + if r.protocolVersion > 2 { + return errorFuture{ErrUnsupportedProtocol} + } + + return r.requestConfigChange(configurationChangeRequest{ + command: RemoveServer, + serverID: ServerID(peer), + prevIndex: 0, + }, 0) +} + +// AddVoter will add the given server to the cluster as a staging server. If the +// server is already in the cluster as a voter, this does nothing. This must be +// run on the leader or it will fail. The leader will promote the staging server +// to a voter once that server is ready. If nonzero, prevIndex is the index of +// the only configuration upon which this change may be applied; if another +// configuration entry has been added in the meantime, this request will fail. +// If nonzero, timeout is how long this server should wait before the +// configuration change log entry is appended. 
+func (r *Raft) AddVoter(id ServerID, address ServerAddress, prevIndex uint64, timeout time.Duration) IndexFuture { + if r.protocolVersion < 2 { + return errorFuture{ErrUnsupportedProtocol} + } + + return r.requestConfigChange(configurationChangeRequest{ + command: AddStaging, + serverID: id, + serverAddress: address, + prevIndex: prevIndex, + }, timeout) +} + +// AddNonvoter will add the given server to the cluster but won't assign it a +// vote. The server will receive log entries, but it won't participate in +// elections or log entry commitment. If the server is already in the cluster as +// a staging server or voter, this does nothing. This must be run on the leader +// or it will fail. For prevIndex and timeout, see AddVoter. +func (r *Raft) AddNonvoter(id ServerID, address ServerAddress, prevIndex uint64, timeout time.Duration) IndexFuture { + if r.protocolVersion < 3 { + return errorFuture{ErrUnsupportedProtocol} + } + + return r.requestConfigChange(configurationChangeRequest{ + command: AddNonvoter, + serverID: id, + serverAddress: address, + prevIndex: prevIndex, + }, timeout) +} + +// RemoveServer will remove the given server from the cluster. If the current +// leader is being removed, it will cause a new election to occur. This must be +// run on the leader or it will fail. For prevIndex and timeout, see AddVoter. +func (r *Raft) RemoveServer(id ServerID, prevIndex uint64, timeout time.Duration) IndexFuture { + if r.protocolVersion < 2 { + return errorFuture{ErrUnsupportedProtocol} + } + + return r.requestConfigChange(configurationChangeRequest{ + command: RemoveServer, + serverID: id, + prevIndex: prevIndex, + }, timeout) +} + +// DemoteVoter will take away a server's vote, if it has one. If present, the +// server will continue to receive log entries, but it won't participate in +// elections or log entry commitment. If the server is not in the cluster, this +// does nothing. This must be run on the leader or it will fail. 
For prevIndex +// and timeout, see AddVoter. +func (r *Raft) DemoteVoter(id ServerID, prevIndex uint64, timeout time.Duration) IndexFuture { + if r.protocolVersion < 3 { + return errorFuture{ErrUnsupportedProtocol} + } + + return r.requestConfigChange(configurationChangeRequest{ + command: DemoteVoter, + serverID: id, + prevIndex: prevIndex, + }, timeout) +} + +// Shutdown is used to stop the Raft background routines. +// This is not a graceful operation. Provides a future that +// can be used to block until all background routines have exited. +func (r *Raft) Shutdown() Future { + r.shutdownLock.Lock() + defer r.shutdownLock.Unlock() + + if !r.shutdown { + close(r.shutdownCh) + r.shutdown = true + r.setState(Shutdown) + return &shutdownFuture{r} + } + + // avoid closing transport twice + return &shutdownFuture{nil} +} + +// Snapshot is used to manually force Raft to take a snapshot. +// Returns a future that can be used to block until complete. +func (r *Raft) Snapshot() Future { + snapFuture := &snapshotFuture{} + snapFuture.init() + select { + case r.snapshotCh <- snapFuture: + return snapFuture + case <-r.shutdownCh: + return errorFuture{ErrRaftShutdown} + } + +} + +// State is used to return the current raft state. +func (r *Raft) State() RaftState { + return r.getState() +} + +// LeaderCh is used to get a channel which delivers signals on +// acquiring or losing leadership. It sends true if we become +// the leader, and false if we lose it. The channel is not buffered, +// and does not block on writes. +func (r *Raft) LeaderCh() <-chan bool { + return r.leaderCh +} + +// String returns a string representation of this Raft node. +func (r *Raft) String() string { + return fmt.Sprintf("Node at %s [%v]", r.localAddr, r.getState()) +} + +// LastContact returns the time of last contact by a leader. +// This only makes sense if we are currently a follower. 
+func (r *Raft) LastContact() time.Time { + r.lastContactLock.RLock() + last := r.lastContact + r.lastContactLock.RUnlock() + return last +} + +// Stats is used to return a map of various internal stats. This +// should only be used for informational purposes or debugging. +// +// Keys include: "state", "term", "last_log_index", "last_log_term", +// "commit_index", "applied_index", "fsm_pending", +// "last_snapshot_index", "last_snapshot_term", +// "latest_configuration", "last_contact", and "num_peers". +// +// The value of "state" is the string form (String()) of the +// current RaftState. +// +// The value of "latest_configuration" is a string which contains +// the id of each server, its suffrage status, and its address. +// +// The value of "last_contact" is either "never" if there +// has been no contact with a leader, "0" if the node is in the +// leader state, or the time since last contact with a leader +// formatted as a string. +// +// The value of "num_peers" is the number of other voting servers in the +// cluster, not including this node. If this node isn't part of the +// configuration then this will be "0". +// +// All other values are uint64s, formatted as strings.
+func (r *Raft) Stats() map[string]string { + toString := func(v uint64) string { + return strconv.FormatUint(v, 10) + } + lastLogIndex, lastLogTerm := r.getLastLog() + lastSnapIndex, lastSnapTerm := r.getLastSnapshot() + s := map[string]string{ + "state": r.getState().String(), + "term": toString(r.getCurrentTerm()), + "last_log_index": toString(lastLogIndex), + "last_log_term": toString(lastLogTerm), + "commit_index": toString(r.getCommitIndex()), + "applied_index": toString(r.getLastApplied()), + "fsm_pending": toString(uint64(len(r.fsmCommitCh))), + "last_snapshot_index": toString(lastSnapIndex), + "last_snapshot_term": toString(lastSnapTerm), + "protocol_version": toString(uint64(r.protocolVersion)), + "protocol_version_min": toString(uint64(ProtocolVersionMin)), + "protocol_version_max": toString(uint64(ProtocolVersionMax)), + "snapshot_version_min": toString(uint64(SnapshotVersionMin)), + "snapshot_version_max": toString(uint64(SnapshotVersionMax)), + } + + future := r.GetConfiguration() + if err := future.Error(); err != nil { + r.logger.Printf("[WARN] raft: could not get configuration for Stats: %v", err) + } else { + configuration := future.Configuration() + s["latest_configuration_index"] = toString(future.Index()) + s["latest_configuration"] = fmt.Sprintf("%+v", configuration.Servers) + + // This is a legacy metric that we've seen people use in the wild. 
+ hasUs := false + numPeers := 0 + for _, server := range configuration.Servers { + if server.Suffrage == Voter { + if server.ID == r.localID { + hasUs = true + } else { + numPeers++ + } + } + } + if !hasUs { + numPeers = 0 + } + s["num_peers"] = toString(uint64(numPeers)) + } + + last := r.LastContact() + if last.IsZero() { + s["last_contact"] = "never" + } else if r.getState() == Leader { + s["last_contact"] = "0" + } else { + s["last_contact"] = fmt.Sprintf("%v", time.Now().Sub(last)) + } + return s +} + +// LastIndex returns the last index in stable storage, +// either from the last log or from the last snapshot. +func (r *Raft) LastIndex() uint64 { + return r.getLastIndex() +} + +// AppliedIndex returns the last index applied to the FSM. This is generally +// lagging behind the last index, especially for indexes that are persisted but +// have not yet been considered committed by the leader. NOTE - this reflects +// the last index that was sent to the application's FSM over the apply channel +// but DOES NOT mean that the application's FSM has yet consumed it and applied +// it to its internal state. Thus, the application's state may lag behind this +// index. +func (r *Raft) AppliedIndex() uint64 { + return r.getLastApplied() +} diff --git a/vendor/github.com/hashicorp/consul/vendor/github.com/hashicorp/raft/commands.go b/vendor/github.com/hashicorp/consul/vendor/github.com/hashicorp/raft/commands.go new file mode 100644 index 00000000000..5d89e7bcdb1 --- /dev/null +++ b/vendor/github.com/hashicorp/consul/vendor/github.com/hashicorp/raft/commands.go @@ -0,0 +1,151 @@ +package raft + +// RPCHeader is a common sub-structure used to pass along protocol version and +// other information about the cluster. For older Raft implementations before +// versioning was added this will default to a zero-valued structure when read +// by newer Raft versions. +type RPCHeader struct { + // ProtocolVersion is the version of the protocol the sender is + // speaking. 
+ ProtocolVersion ProtocolVersion +} + +// WithRPCHeader is an interface that exposes the RPC header. +type WithRPCHeader interface { + GetRPCHeader() RPCHeader +} + +// AppendEntriesRequest is the command used to append entries to the +// replicated log. +type AppendEntriesRequest struct { + RPCHeader + + // Provide the current term and leader + Term uint64 + Leader []byte + + // Provide the previous entries for integrity checking + PrevLogEntry uint64 + PrevLogTerm uint64 + + // New entries to commit + Entries []*Log + + // Commit index on the leader + LeaderCommitIndex uint64 +} + +// See WithRPCHeader. +func (r *AppendEntriesRequest) GetRPCHeader() RPCHeader { + return r.RPCHeader +} + +// AppendEntriesResponse is the response returned from an +// AppendEntriesRequest. +type AppendEntriesResponse struct { + RPCHeader + + // Newer term if leader is out of date + Term uint64 + + // Last Log is a hint to help accelerate rebuilding slow nodes + LastLog uint64 + + // We may not succeed if we have a conflicting entry + Success bool + + // There are scenarios where this request didn't succeed + // but there's no need to wait/back-off the next attempt. + NoRetryBackoff bool +} + +// See WithRPCHeader. +func (r *AppendEntriesResponse) GetRPCHeader() RPCHeader { + return r.RPCHeader +} + +// RequestVoteRequest is the command used by a candidate to ask a Raft peer +// for a vote in an election. +type RequestVoteRequest struct { + RPCHeader + + // Provide the term and our id + Term uint64 + Candidate []byte + + // Used to ensure safety + LastLogIndex uint64 + LastLogTerm uint64 +} + +// See WithRPCHeader. +func (r *RequestVoteRequest) GetRPCHeader() RPCHeader { + return r.RPCHeader +} + +// RequestVoteResponse is the response returned from a RequestVoteRequest. +type RequestVoteResponse struct { + RPCHeader + + // Newer term if leader is out of date. + Term uint64 + + // Peers is deprecated, but required by servers that only understand + // protocol version 0. 
This is not populated in protocol version 2 + // and later. + Peers []byte + + // Is the vote granted. + Granted bool +} + +// See WithRPCHeader. +func (r *RequestVoteResponse) GetRPCHeader() RPCHeader { + return r.RPCHeader +} + +// InstallSnapshotRequest is the command sent to a Raft peer to bootstrap its +// log (and state machine) from a snapshot on another peer. +type InstallSnapshotRequest struct { + RPCHeader + SnapshotVersion SnapshotVersion + + Term uint64 + Leader []byte + + // These are the last index/term included in the snapshot + LastLogIndex uint64 + LastLogTerm uint64 + + // Peer Set in the snapshot. This is deprecated in favor of Configuration + // but remains here in case we receive an InstallSnapshot from a leader + // that's running old code. + Peers []byte + + // Cluster membership. + Configuration []byte + // Log index where 'Configuration' entry was originally written. + ConfigurationIndex uint64 + + // Size of the snapshot + Size int64 +} + +// See WithRPCHeader. +func (r *InstallSnapshotRequest) GetRPCHeader() RPCHeader { + return r.RPCHeader +} + +// InstallSnapshotResponse is the response returned from an +// InstallSnapshotRequest. +type InstallSnapshotResponse struct { + RPCHeader + + Term uint64 + Success bool +} + +// See WithRPCHeader. +func (r *InstallSnapshotResponse) GetRPCHeader() RPCHeader { + return r.RPCHeader +} diff --git a/vendor/github.com/hashicorp/consul/vendor/github.com/hashicorp/raft/commitment.go b/vendor/github.com/hashicorp/consul/vendor/github.com/hashicorp/raft/commitment.go new file mode 100644 index 00000000000..b5ba2634ef2 --- /dev/null +++ b/vendor/github.com/hashicorp/consul/vendor/github.com/hashicorp/raft/commitment.go @@ -0,0 +1,101 @@ +package raft + +import ( + "sort" + "sync" +) + +// Commitment is used to advance the leader's commit index. The leader and +// replication goroutines report in newly written entries with Match(), and +// this notifies on commitCh when the commit index has advanced. 
+type commitment struct { + // protects matchIndexes and commitIndex + sync.Mutex + // notified when commitIndex increases + commitCh chan struct{} + // voter ID to log index: the server stores up through this log entry + matchIndexes map[ServerID]uint64 + // a quorum stores up through this log entry. monotonically increases. + commitIndex uint64 + // the first index of this leader's term: this needs to be replicated to a + // majority of the cluster before this leader may mark anything committed + // (per Raft's commitment rule) + startIndex uint64 +} + +// newCommitment returns a commitment struct that notifies the provided +// channel when log entries have been committed. A new commitment struct is +// created each time this server becomes leader for a particular term. +// 'configuration' is the servers in the cluster. +// 'startIndex' is the first index created in this term (see +// its description above). +func newCommitment(commitCh chan struct{}, configuration Configuration, startIndex uint64) *commitment { + matchIndexes := make(map[ServerID]uint64) + for _, server := range configuration.Servers { + if server.Suffrage == Voter { + matchIndexes[server.ID] = 0 + } + } + return &commitment{ + commitCh: commitCh, + matchIndexes: matchIndexes, + commitIndex: 0, + startIndex: startIndex, + } +} + +// setConfiguration is called when a new cluster membership configuration is +// created: it will be used to determine commitment from now on. 'configuration' +// is the servers in the cluster.
+func (c *commitment) setConfiguration(configuration Configuration) { + c.Lock() + defer c.Unlock() + oldMatchIndexes := c.matchIndexes + c.matchIndexes = make(map[ServerID]uint64) + for _, server := range configuration.Servers { + if server.Suffrage == Voter { + c.matchIndexes[server.ID] = oldMatchIndexes[server.ID] // defaults to 0 + } + } + c.recalculate() +} + +// getCommitIndex returns the current commitIndex; called by leader after commitCh is notified +func (c *commitment) getCommitIndex() uint64 { + c.Lock() + defer c.Unlock() + return c.commitIndex +} + +// match is called once a server completes writing entries to disk: either the +// leader has written the new entry or a follower has replied to an +// AppendEntries RPC. The given server's disk agrees with this server's log up +// through the given index. Servers without a vote and non-advancing indexes are ignored. +func (c *commitment) match(server ServerID, matchIndex uint64) { + c.Lock() + defer c.Unlock() + if prev, hasVote := c.matchIndexes[server]; hasVote && matchIndex > prev { + c.matchIndexes[server] = matchIndex + c.recalculate() + } +} + +// recalculate is an internal helper that computes the new commitIndex from matchIndexes. +// Must be called with lock held.
+func (c *commitment) recalculate() { + if len(c.matchIndexes) == 0 { + return + } + + matched := make([]uint64, 0, len(c.matchIndexes)) + for _, idx := range c.matchIndexes { + matched = append(matched, idx) + } + sort.Sort(uint64Slice(matched)) + quorumMatchIndex := matched[(len(matched)-1)/2] + + if quorumMatchIndex > c.commitIndex && quorumMatchIndex >= c.startIndex { + c.commitIndex = quorumMatchIndex + asyncNotifyCh(c.commitCh) + } +} diff --git a/vendor/github.com/hashicorp/consul/vendor/github.com/hashicorp/raft/config.go b/vendor/github.com/hashicorp/consul/vendor/github.com/hashicorp/raft/config.go new file mode 100644 index 00000000000..c1ce03ac22b --- /dev/null +++ b/vendor/github.com/hashicorp/consul/vendor/github.com/hashicorp/raft/config.go @@ -0,0 +1,258 @@ +package raft + +import ( + "fmt" + "io" + "log" + "time" +) + +// These are the versions of the protocol (which includes RPC messages as +// well as Raft-specific log entries) that this server can _understand_. Use +// the ProtocolVersion member of the Config object to control the version of +// the protocol to use when _speaking_ to other servers. Note that depending on +// the protocol version being spoken, some otherwise understood RPC messages +// may be refused. See dispositionRPC for details of this logic. +// +// There are notes about the upgrade path in the description of the versions +// below. If you are starting a fresh cluster then there's no reason not to +// jump right to the latest protocol version. If you need to interoperate with +// older, version 0 Raft servers you'll need to drive the cluster through the +// different versions in order. +// +// The version details are complicated, but here's a summary of what's required +// to get from a version 0 cluster to version 3: +// +// 1. In version N of your app that starts using the new Raft library with +// versioning, set ProtocolVersion to 1. +// 2. 
Make version N+1 of your app require version N as a prerequisite (all +// servers must be upgraded). For version N+1 of your app set ProtocolVersion +// to 2. +// 3. Similarly, make version N+2 of your app require version N+1 as a +// prerequisite. For version N+2 of your app, set ProtocolVersion to 3. +// +// During this upgrade, older cluster members will still have Server IDs equal +// to their network addresses. To upgrade an older member and give it an ID, it +// needs to leave the cluster and re-enter: +// +// 1. Remove the server from the cluster with RemoveServer, using its network +// address as its ServerID. +// 2. Update the server's config to a better ID (restarting the server). +// 3. Add the server back to the cluster with AddVoter, using its new ID. +// +// You can do this during the rolling upgrade from N+1 to N+2 of your app, or +// as a rolling change at any time after the upgrade. +// +// Version History +// +// 0: Original Raft library before versioning was added. Servers running this +// version of the Raft library use AddPeerDeprecated/RemovePeerDeprecated +// for all configuration changes, and have no support for LogConfiguration. +// 1: First versioned protocol, used to interoperate with old servers, and begin +// the migration path to newer versions of the protocol. Under this version +// all configuration changes are propagated using the now-deprecated +// RemovePeerDeprecated Raft log entry. This means that server IDs are always +// set to be the same as the server addresses (since the old log entry type +// cannot transmit an ID), and only AddPeer/RemovePeer APIs are supported. +// Servers running this version of the protocol can understand the new +// LogConfiguration Raft log entry but will never generate one so they can +// remain compatible with version 0 Raft servers in the cluster. +// 2: Transitional protocol used when migrating an existing cluster to the new +// server ID system. 
Server IDs are still set to be the same as server +// addresses, but all configuration changes are propagated using the new +// LogConfiguration Raft log entry type, which can carry full ID information. +// This version supports the old AddPeer/RemovePeer APIs as well as the new +// ID-based AddVoter/RemoveServer APIs which should be used when adding +// version 3 servers to the cluster later. This version sheds all +// interoperability with version 0 servers, but can interoperate with newer +// Raft servers running with protocol version 1 since they can understand the +// new LogConfiguration Raft log entry, and this version can still understand +// their RemovePeerDeprecated Raft log entries. We need this protocol version +// as an intermediate step between 1 and 3 so that servers will propagate the +// ID information that will come from newly-added (or -rolled) servers using +// protocol version 3, but since they are still using their address-based IDs +// from the previous step they will still be able to track commitments and +// their own voting status properly. If we skipped this step, servers would +// be started with their new IDs, but they wouldn't see themselves in the old +// address-based configuration, so none of the servers would think they had a +// vote. +// 3: Protocol adding full support for server IDs and new ID-based server APIs +// (AddVoter, AddNonvoter, etc.), old AddPeer/RemovePeer APIs are no longer +// supported. Version 2 servers should be swapped out by removing them from +// the cluster one-by-one and re-adding them with updated configuration for +// this protocol version, along with their server ID. The remove/add cycle +// is required to populate their server ID. Note that removing must be done +// by ID, which will be the old server's address. +type ProtocolVersion int + +const ( + ProtocolVersionMin ProtocolVersion = 0 + ProtocolVersionMax = 3 +) + +// These are versions of snapshots that this server can _understand_. 
Currently, +// it is always assumed that this server generates the latest version, though +// this may be changed in the future to include a configurable version. +// +// Version History +// +// 0: Original Raft library before versioning was added. The peers portion of +// these snapshots is encoded in the legacy format which requires decodePeers +// to parse. This version of snapshots should only be produced by the +// unversioned Raft library. +// 1: New format which adds support for a full configuration structure and its +// associated log index, with support for server IDs and non-voting server +// modes. To ease upgrades, this also includes the legacy peers structure but +// that will never be used by servers that understand version 1 snapshots. +// Since the original Raft library didn't enforce any versioning, we must +// include the legacy peers structure for this version, but we can deprecate +// it in the next snapshot version. +type SnapshotVersion int + +const ( + SnapshotVersionMin SnapshotVersion = 0 + SnapshotVersionMax = 1 +) + +// Config provides any necessary configuration for the Raft server. +type Config struct { + // ProtocolVersion allows a Raft server to inter-operate with older + // Raft servers running an older version of the code. This is used to + // version the wire protocol as well as Raft-specific log entries that + // the server uses when _speaking_ to other servers. There is currently + // no auto-negotiation of versions so all servers must be manually + // configured with compatible versions. See ProtocolVersionMin and + // ProtocolVersionMax for the versions of the protocol that this server + // can _understand_. + ProtocolVersion ProtocolVersion + + // HeartbeatTimeout specifies the time in follower state without + // a leader before we attempt an election. + HeartbeatTimeout time.Duration + + // ElectionTimeout specifies the time in candidate state without + // a leader before we attempt an election. 
+ ElectionTimeout time.Duration + + // CommitTimeout controls the time without an Apply() operation + // before we heartbeat to ensure a timely commit. Due to random + // staggering, may be delayed as much as 2x this value. + CommitTimeout time.Duration + + // MaxAppendEntries controls the maximum number of append entries + // to send at once. We want to strike a balance between efficiency + // and avoiding waste if the follower is going to reject because of + // an inconsistent log. + MaxAppendEntries int + + // If we are a member of a cluster, and RemovePeer is invoked for the + // local node, then we forget all peers and transition into the follower state. + // If ShutdownOnRemove is set, we additionally shut down Raft. Otherwise, + // we can become a leader of a cluster containing only this node. + ShutdownOnRemove bool + + // TrailingLogs controls how many logs we leave after a snapshot. This is + // used so that we can quickly replay logs on a follower instead of being + // forced to send an entire snapshot. + TrailingLogs uint64 + + // SnapshotInterval controls how often we check if we should perform a snapshot. + // We randomly stagger between this value and 2x this value to avoid the entire + // cluster from performing a snapshot at once. + SnapshotInterval time.Duration + + // SnapshotThreshold controls how many outstanding logs there must be before + // we perform a snapshot. This is to prevent excessive snapshots when we can + // just replay a small set of logs. + SnapshotThreshold uint64 + + // LeaderLeaseTimeout is used to control how long the "lease" lasts + // for being the leader without being able to contact a quorum + // of nodes. If we reach this interval without contact, we will + // step down as leader. + LeaderLeaseTimeout time.Duration + + // StartAsLeader forces Raft to start in the leader state. This should + // never be used except for testing purposes, as it can cause a split-brain.
+ StartAsLeader bool + + // The unique ID for this server across all time. When running with + // ProtocolVersion < 3, you must set this to be the same as the network + // address of your transport. + LocalID ServerID + + // NotifyCh is used to provide a channel that will be notified of leadership + // changes. Raft will block writing to this channel, so it should either be + // buffered or aggressively consumed. + NotifyCh chan<- bool + + // LogOutput is used as a sink for logs, unless Logger is specified. + // Defaults to os.Stderr. + LogOutput io.Writer + + // Logger is a user-provided logger. If nil, a logger writing to LogOutput + // is used. + Logger *log.Logger +} + +// DefaultConfig returns a Config with usable defaults. +func DefaultConfig() *Config { + return &Config{ + ProtocolVersion: ProtocolVersionMax, + HeartbeatTimeout: 1000 * time.Millisecond, + ElectionTimeout: 1000 * time.Millisecond, + CommitTimeout: 50 * time.Millisecond, + MaxAppendEntries: 64, + ShutdownOnRemove: true, + TrailingLogs: 10240, + SnapshotInterval: 120 * time.Second, + SnapshotThreshold: 8192, + LeaderLeaseTimeout: 500 * time.Millisecond, + } +} + +// ValidateConfig is used to validate a sane configuration +func ValidateConfig(config *Config) error { + // We don't actually support running as 0 in the library any more, but + // we do understand it. 
+ protocolMin := ProtocolVersionMin + if protocolMin == 0 { + protocolMin = 1 + } + if config.ProtocolVersion < protocolMin || + config.ProtocolVersion > ProtocolVersionMax { + return fmt.Errorf("Protocol version %d must be >= %d and <= %d", + config.ProtocolVersion, protocolMin, ProtocolVersionMax) + } + if len(config.LocalID) == 0 { + return fmt.Errorf("LocalID cannot be empty") + } + if config.HeartbeatTimeout < 5*time.Millisecond { + return fmt.Errorf("Heartbeat timeout is too low") + } + if config.ElectionTimeout < 5*time.Millisecond { + return fmt.Errorf("Election timeout is too low") + } + if config.CommitTimeout < time.Millisecond { + return fmt.Errorf("Commit timeout is too low") + } + if config.MaxAppendEntries <= 0 { + return fmt.Errorf("MaxAppendEntries must be positive") + } + if config.MaxAppendEntries > 1024 { + return fmt.Errorf("MaxAppendEntries is too large") + } + if config.SnapshotInterval < 5*time.Millisecond { + return fmt.Errorf("Snapshot interval is too low") + } + if config.LeaderLeaseTimeout < 5*time.Millisecond { + return fmt.Errorf("Leader lease timeout is too low") + } + if config.LeaderLeaseTimeout > config.HeartbeatTimeout { + return fmt.Errorf("Leader lease timeout cannot be larger than heartbeat timeout") + } + if config.ElectionTimeout < config.HeartbeatTimeout { + return fmt.Errorf("Election timeout must be equal or greater than Heartbeat Timeout") + } + return nil +} diff --git a/vendor/github.com/hashicorp/consul/vendor/github.com/hashicorp/raft/configuration.go b/vendor/github.com/hashicorp/consul/vendor/github.com/hashicorp/raft/configuration.go new file mode 100644 index 00000000000..74508c5e530 --- /dev/null +++ b/vendor/github.com/hashicorp/consul/vendor/github.com/hashicorp/raft/configuration.go @@ -0,0 +1,343 @@ +package raft + +import "fmt" + +// ServerSuffrage determines whether a Server in a Configuration gets a vote. +type ServerSuffrage int + +// Note: Don't renumber these, since the numbers are written into the log. 
+const ( + // Voter is a server whose vote is counted in elections and whose match index + // is used in advancing the leader's commit index. + Voter ServerSuffrage = iota + // Nonvoter is a server that receives log entries but is not considered for + // elections or commitment purposes. + Nonvoter + // Staging is a server that acts like a nonvoter with one exception: once a + // staging server receives enough log entries to be sufficiently caught up to + // the leader's log, the leader will invoke a membership change to change + // the Staging server to a Voter. + Staging +) + +func (s ServerSuffrage) String() string { + switch s { + case Voter: + return "Voter" + case Nonvoter: + return "Nonvoter" + case Staging: + return "Staging" + } + return "ServerSuffrage" +} + +// ServerID is a unique string identifying a server for all time. +type ServerID string + +// ServerAddress is a network address for a server that a transport can contact. +type ServerAddress string + +// Server tracks the information about a single server in a configuration. +type Server struct { + // Suffrage determines whether the server gets a vote. + Suffrage ServerSuffrage + // ID is a unique string identifying this server for all time. + ID ServerID + // Address is its network address that a transport can contact. + Address ServerAddress +} + +// Configuration tracks which servers are in the cluster, and whether they have +// votes. This should include the local server, if it's a member of the cluster. +// The servers are listed in no particular order, but each should only appear once. +// These entries are appended to the log during membership changes. +type Configuration struct { + Servers []Server +} + +// Clone makes a deep copy of a Configuration. +func (c *Configuration) Clone() (copy Configuration) { + copy.Servers = append(copy.Servers, c.Servers...) + return +} + +// ConfigurationChangeCommand is the different ways to change the cluster +// configuration.
+type ConfigurationChangeCommand uint8 + +const ( + // AddStaging makes a server Staging unless it's a Voter. + AddStaging ConfigurationChangeCommand = iota + // AddNonvoter makes a server Nonvoter unless it's Staging or a Voter. + AddNonvoter + // DemoteVoter makes a server Nonvoter unless it's absent. + DemoteVoter + // RemoveServer removes a server entirely from the cluster membership. + RemoveServer + // Promote is created automatically by a leader; it turns a Staging server + // into a Voter. + Promote +) + +func (c ConfigurationChangeCommand) String() string { + switch c { + case AddStaging: + return "AddStaging" + case AddNonvoter: + return "AddNonvoter" + case DemoteVoter: + return "DemoteVoter" + case RemoveServer: + return "RemoveServer" + case Promote: + return "Promote" + } + return "ConfigurationChangeCommand" +} + +// configurationChangeRequest describes a change that a leader would like to +// make to its current configuration. It's used only within a single server +// (never serialized into the log), as part of `configurationChangeFuture`. +type configurationChangeRequest struct { + command ConfigurationChangeCommand + serverID ServerID + serverAddress ServerAddress // only present for AddStaging, AddNonvoter + // prevIndex, if nonzero, is the index of the only configuration upon which + // this change may be applied; if another configuration entry has been + // added in the meantime, this request will fail. + prevIndex uint64 +} + +// configurations is state tracked on every server about its Configurations. +// Note that, per Diego's dissertation, there can be at most one uncommitted +// configuration at a time (the next configuration may not be created until the +// prior one has been committed). +// +// One downside to storing just two configurations is that if you try to take a +// snapshot when your state machine hasn't yet applied the committedIndex, we +// have no record of the configuration that would logically fit into that +// snapshot.
We disallow snapshots in that case now. An alternative approach, +// which LogCabin uses, is to track every configuration change in the +// log. +type configurations struct { + // committed is the latest configuration in the log/snapshot that has been + // committed (the one with the largest index). + committed Configuration + // committedIndex is the log index where 'committed' was written. + committedIndex uint64 + // latest is the latest configuration in the log/snapshot (may be committed + // or uncommitted) + latest Configuration + // latestIndex is the log index where 'latest' was written. + latestIndex uint64 +} + +// Clone makes a deep copy of a configurations object. +func (c *configurations) Clone() (copy configurations) { + copy.committed = c.committed.Clone() + copy.committedIndex = c.committedIndex + copy.latest = c.latest.Clone() + copy.latestIndex = c.latestIndex + return +} + +// hasVote returns true if the server identified by 'id' is a Voter in the +// provided Configuration. +func hasVote(configuration Configuration, id ServerID) bool { + for _, server := range configuration.Servers { + if server.ID == id { + return server.Suffrage == Voter + } + } + return false +} + +// checkConfiguration tests a cluster membership configuration for common +// errors. 
+func checkConfiguration(configuration Configuration) error { + idSet := make(map[ServerID]bool) + addressSet := make(map[ServerAddress]bool) + var voters int + for _, server := range configuration.Servers { + if server.ID == "" { + return fmt.Errorf("Empty ID in configuration: %v", configuration) + } + if server.Address == "" { + return fmt.Errorf("Empty address in configuration: %v", server) + } + if idSet[server.ID] { + return fmt.Errorf("Found duplicate ID in configuration: %v", server.ID) + } + idSet[server.ID] = true + if addressSet[server.Address] { + return fmt.Errorf("Found duplicate address in configuration: %v", server.Address) + } + addressSet[server.Address] = true + if server.Suffrage == Voter { + voters++ + } + } + if voters == 0 { + return fmt.Errorf("Need at least one voter in configuration: %v", configuration) + } + return nil +} + +// nextConfiguration generates a new Configuration from the current one and a +// configuration change request. It's split from appendConfigurationEntry so +// that it can be unit tested easily. +func nextConfiguration(current Configuration, currentIndex uint64, change configurationChangeRequest) (Configuration, error) { + if change.prevIndex > 0 && change.prevIndex != currentIndex { + return Configuration{}, fmt.Errorf("Configuration changed since %v (latest is %v)", change.prevIndex, currentIndex) + } + + configuration := current.Clone() + switch change.command { + case AddStaging: + // TODO: barf on new address? + newServer := Server{ + // TODO: This should add the server as Staging, to be automatically + // promoted to Voter later. However, the promotion to Voter is not yet + // implemented, and doing so is not trivial with the way the leader loop + // coordinates with the replication goroutines today. So, for now, the + // server will have a vote right away, and the Promote case below is + // unused.
+ Suffrage: Voter, + ID: change.serverID, + Address: change.serverAddress, + } + found := false + for i, server := range configuration.Servers { + if server.ID == change.serverID { + if server.Suffrage == Voter { + configuration.Servers[i].Address = change.serverAddress + } else { + configuration.Servers[i] = newServer + } + found = true + break + } + } + if !found { + configuration.Servers = append(configuration.Servers, newServer) + } + case AddNonvoter: + newServer := Server{ + Suffrage: Nonvoter, + ID: change.serverID, + Address: change.serverAddress, + } + found := false + for i, server := range configuration.Servers { + if server.ID == change.serverID { + if server.Suffrage != Nonvoter { + configuration.Servers[i].Address = change.serverAddress + } else { + configuration.Servers[i] = newServer + } + found = true + break + } + } + if !found { + configuration.Servers = append(configuration.Servers, newServer) + } + case DemoteVoter: + for i, server := range configuration.Servers { + if server.ID == change.serverID { + configuration.Servers[i].Suffrage = Nonvoter + break + } + } + case RemoveServer: + for i, server := range configuration.Servers { + if server.ID == change.serverID { + configuration.Servers = append(configuration.Servers[:i], configuration.Servers[i+1:]...) + break + } + } + case Promote: + for i, server := range configuration.Servers { + if server.ID == change.serverID && server.Suffrage == Staging { + configuration.Servers[i].Suffrage = Voter + break + } + } + } + + // Make sure we didn't do something bad like remove the last voter + if err := checkConfiguration(configuration); err != nil { + return Configuration{}, err + } + + return configuration, nil +} + +// encodePeers is used to serialize a Configuration into the old peers format. +// This is here for backwards compatibility when operating with a mix of old +// servers and should be removed once we deprecate support for protocol version 1. 
+func encodePeers(configuration Configuration, trans Transport) []byte { + // Gather up all the voters, other suffrage types are not supported by + // this data format. + var encPeers [][]byte + for _, server := range configuration.Servers { + if server.Suffrage == Voter { + encPeers = append(encPeers, trans.EncodePeer(server.Address)) + } + } + + // Encode the entire array. + buf, err := encodeMsgPack(encPeers) + if err != nil { + panic(fmt.Errorf("failed to encode peers: %v", err)) + } + + return buf.Bytes() +} + +// decodePeers is used to deserialize an old list of peers into a Configuration. +// This is here for backwards compatibility with old log entries and snapshots; +// it should be removed eventually. +func decodePeers(buf []byte, trans Transport) Configuration { + // Decode the buffer first. + var encPeers [][]byte + if err := decodeMsgPack(buf, &encPeers); err != nil { + panic(fmt.Errorf("failed to decode peers: %v", err)) + } + + // Deserialize each peer. + var servers []Server + for _, enc := range encPeers { + p := trans.DecodePeer(enc) + servers = append(servers, Server{ + Suffrage: Voter, + ID: ServerID(p), + Address: ServerAddress(p), + }) + } + + return Configuration{ + Servers: servers, + } +} + +// encodeConfiguration serializes a Configuration using MsgPack, or panics on +// errors. +func encodeConfiguration(configuration Configuration) []byte { + buf, err := encodeMsgPack(configuration) + if err != nil { + panic(fmt.Errorf("failed to encode configuration: %v", err)) + } + return buf.Bytes() +} + +// decodeConfiguration deserializes a Configuration using MsgPack, or panics on +// errors. 
+func decodeConfiguration(buf []byte) Configuration { + var configuration Configuration + if err := decodeMsgPack(buf, &configuration); err != nil { + panic(fmt.Errorf("failed to decode configuration: %v", err)) + } + return configuration +} diff --git a/vendor/github.com/hashicorp/consul/vendor/github.com/hashicorp/raft/discard_snapshot.go b/vendor/github.com/hashicorp/consul/vendor/github.com/hashicorp/raft/discard_snapshot.go new file mode 100644 index 00000000000..5e93a9fe01f --- /dev/null +++ b/vendor/github.com/hashicorp/consul/vendor/github.com/hashicorp/raft/discard_snapshot.go @@ -0,0 +1,49 @@ +package raft + +import ( + "fmt" + "io" +) + +// DiscardSnapshotStore is used to successfully snapshot while +// always discarding the snapshot. This is useful for when the +// log should be truncated but no snapshot should be retained. +// This should never be used for production use, and is only +// suitable for testing. +type DiscardSnapshotStore struct{} + +type DiscardSnapshotSink struct{} + +// NewDiscardSnapshotStore is used to create a new DiscardSnapshotStore. 
+func NewDiscardSnapshotStore() *DiscardSnapshotStore { + return &DiscardSnapshotStore{} +} + +func (d *DiscardSnapshotStore) Create(version SnapshotVersion, index, term uint64, + configuration Configuration, configurationIndex uint64, trans Transport) (SnapshotSink, error) { + return &DiscardSnapshotSink{}, nil +} + +func (d *DiscardSnapshotStore) List() ([]*SnapshotMeta, error) { + return nil, nil +} + +func (d *DiscardSnapshotStore) Open(id string) (*SnapshotMeta, io.ReadCloser, error) { + return nil, nil, fmt.Errorf("open is not supported") +} + +func (d *DiscardSnapshotSink) Write(b []byte) (int, error) { + return len(b), nil +} + +func (d *DiscardSnapshotSink) Close() error { + return nil +} + +func (d *DiscardSnapshotSink) ID() string { + return "discard" +} + +func (d *DiscardSnapshotSink) Cancel() error { + return nil +} diff --git a/vendor/github.com/hashicorp/consul/vendor/github.com/hashicorp/raft/file_snapshot.go b/vendor/github.com/hashicorp/consul/vendor/github.com/hashicorp/raft/file_snapshot.go new file mode 100644 index 00000000000..17d080134a3 --- /dev/null +++ b/vendor/github.com/hashicorp/consul/vendor/github.com/hashicorp/raft/file_snapshot.go @@ -0,0 +1,494 @@ +package raft + +import ( + "bufio" + "bytes" + "encoding/json" + "fmt" + "hash" + "hash/crc64" + "io" + "io/ioutil" + "log" + "os" + "path/filepath" + "sort" + "strings" + "time" +) + +const ( + testPath = "permTest" + snapPath = "snapshots" + metaFilePath = "meta.json" + stateFilePath = "state.bin" + tmpSuffix = ".tmp" +) + +// FileSnapshotStore implements the SnapshotStore interface and allows +// snapshots to be made on the local disk. +type FileSnapshotStore struct { + path string + retain int + logger *log.Logger +} + +type snapMetaSlice []*fileSnapshotMeta + +// FileSnapshotSink implements SnapshotSink with a file. 
+type FileSnapshotSink struct { + store *FileSnapshotStore + logger *log.Logger + dir string + meta fileSnapshotMeta + + stateFile *os.File + stateHash hash.Hash64 + buffered *bufio.Writer + + closed bool +} + +// fileSnapshotMeta is stored on disk. We also put a CRC +// on disk so that we can verify the snapshot. +type fileSnapshotMeta struct { + SnapshotMeta + CRC []byte +} + +// bufferedFile is returned when we open a snapshot. This way +// reads are buffered and the file still gets closed. +type bufferedFile struct { + bh *bufio.Reader + fh *os.File +} + +func (b *bufferedFile) Read(p []byte) (n int, err error) { + return b.bh.Read(p) +} + +func (b *bufferedFile) Close() error { + return b.fh.Close() +} + +// NewFileSnapshotStoreWithLogger creates a new FileSnapshotStore based +// on a base directory. The `retain` parameter controls how many +// snapshots are retained. Must be at least 1. +func NewFileSnapshotStoreWithLogger(base string, retain int, logger *log.Logger) (*FileSnapshotStore, error) { + if retain < 1 { + return nil, fmt.Errorf("must retain at least one snapshot") + } + if logger == nil { + logger = log.New(os.Stderr, "", log.LstdFlags) + } + + // Ensure our path exists + path := filepath.Join(base, snapPath) + if err := os.MkdirAll(path, 0755); err != nil && !os.IsExist(err) { + return nil, fmt.Errorf("snapshot path not accessible: %v", err) + } + + // Setup the store + store := &FileSnapshotStore{ + path: path, + retain: retain, + logger: logger, + } + + // Do a permissions test + if err := store.testPermissions(); err != nil { + return nil, fmt.Errorf("permissions test failed: %v", err) + } + return store, nil +} + +// NewFileSnapshotStore creates a new FileSnapshotStore based +// on a base directory. The `retain` parameter controls how many +// snapshots are retained. Must be at least 1. 
+func NewFileSnapshotStore(base string, retain int, logOutput io.Writer) (*FileSnapshotStore, error) { + if logOutput == nil { + logOutput = os.Stderr + } + return NewFileSnapshotStoreWithLogger(base, retain, log.New(logOutput, "", log.LstdFlags)) +} + +// testPermissions tries to touch a file in our path to see if it works. +func (f *FileSnapshotStore) testPermissions() error { + path := filepath.Join(f.path, testPath) + fh, err := os.Create(path) + if err != nil { + return err + } + + if err = fh.Close(); err != nil { + return err + } + + if err = os.Remove(path); err != nil { + return err + } + return nil +} + +// snapshotName generates a name for the snapshot. +func snapshotName(term, index uint64) string { + now := time.Now() + msec := now.UnixNano() / int64(time.Millisecond) + return fmt.Sprintf("%d-%d-%d", term, index, msec) +} + +// Create is used to start a new snapshot +func (f *FileSnapshotStore) Create(version SnapshotVersion, index, term uint64, + configuration Configuration, configurationIndex uint64, trans Transport) (SnapshotSink, error) { + // We only support version 1 snapshots at this time. 
+ if version != 1 { + return nil, fmt.Errorf("unsupported snapshot version %d", version) + } + + // Create a new path + name := snapshotName(term, index) + path := filepath.Join(f.path, name+tmpSuffix) + f.logger.Printf("[INFO] snapshot: Creating new snapshot at %s", path) + + // Make the directory + if err := os.MkdirAll(path, 0755); err != nil { + f.logger.Printf("[ERR] snapshot: Failed to make snapshot directory: %v", err) + return nil, err + } + + // Create the sink + sink := &FileSnapshotSink{ + store: f, + logger: f.logger, + dir: path, + meta: fileSnapshotMeta{ + SnapshotMeta: SnapshotMeta{ + Version: version, + ID: name, + Index: index, + Term: term, + Peers: encodePeers(configuration, trans), + Configuration: configuration, + ConfigurationIndex: configurationIndex, + }, + CRC: nil, + }, + } + + // Write out the meta data + if err := sink.writeMeta(); err != nil { + f.logger.Printf("[ERR] snapshot: Failed to write metadata: %v", err) + return nil, err + } + + // Open the state file + statePath := filepath.Join(path, stateFilePath) + fh, err := os.Create(statePath) + if err != nil { + f.logger.Printf("[ERR] snapshot: Failed to create state file: %v", err) + return nil, err + } + sink.stateFile = fh + + // Create a CRC64 hash + sink.stateHash = crc64.New(crc64.MakeTable(crc64.ECMA)) + + // Wrap both the hash and file in a MultiWriter with buffering + multi := io.MultiWriter(sink.stateFile, sink.stateHash) + sink.buffered = bufio.NewWriter(multi) + + // Done + return sink, nil +} + +// List returns available snapshots in the store. 
+func (f *FileSnapshotStore) List() ([]*SnapshotMeta, error) { + // Get the eligible snapshots + snapshots, err := f.getSnapshots() + if err != nil { + f.logger.Printf("[ERR] snapshot: Failed to get snapshots: %v", err) + return nil, err + } + + var snapMeta []*SnapshotMeta + for _, meta := range snapshots { + snapMeta = append(snapMeta, &meta.SnapshotMeta) + if len(snapMeta) == f.retain { + break + } + } + return snapMeta, nil +} + +// getSnapshots returns all the known snapshots. +func (f *FileSnapshotStore) getSnapshots() ([]*fileSnapshotMeta, error) { + // Get the eligible snapshots + snapshots, err := ioutil.ReadDir(f.path) + if err != nil { + f.logger.Printf("[ERR] snapshot: Failed to scan snapshot dir: %v", err) + return nil, err + } + + // Populate the metadata + var snapMeta []*fileSnapshotMeta + for _, snap := range snapshots { + // Ignore any files + if !snap.IsDir() { + continue + } + + // Ignore any temporary snapshots + dirName := snap.Name() + if strings.HasSuffix(dirName, tmpSuffix) { + f.logger.Printf("[WARN] snapshot: Found temporary snapshot: %v", dirName) + continue + } + + // Try to read the meta data + meta, err := f.readMeta(dirName) + if err != nil { + f.logger.Printf("[WARN] snapshot: Failed to read metadata for %v: %v", dirName, err) + continue + } + + // Make sure we can understand this version. 
+ if meta.Version < SnapshotVersionMin || meta.Version > SnapshotVersionMax { + f.logger.Printf("[WARN] snapshot: Snapshot version for %v not supported: %d", dirName, meta.Version) + continue + } + + // Append, but only return up to the retain count + snapMeta = append(snapMeta, meta) + } + + // Sort the snapshot, reverse so we get new -> old + sort.Sort(sort.Reverse(snapMetaSlice(snapMeta))) + + return snapMeta, nil +} + +// readMeta is used to read the meta data for a given named backup +func (f *FileSnapshotStore) readMeta(name string) (*fileSnapshotMeta, error) { + // Open the meta file + metaPath := filepath.Join(f.path, name, metaFilePath) + fh, err := os.Open(metaPath) + if err != nil { + return nil, err + } + defer fh.Close() + + // Buffer the file IO + buffered := bufio.NewReader(fh) + + // Read in the JSON + meta := &fileSnapshotMeta{} + dec := json.NewDecoder(buffered) + if err := dec.Decode(meta); err != nil { + return nil, err + } + return meta, nil +} + +// Open takes a snapshot ID and returns a ReadCloser for that snapshot. 
+func (f *FileSnapshotStore) Open(id string) (*SnapshotMeta, io.ReadCloser, error) { + // Get the metadata + meta, err := f.readMeta(id) + if err != nil { + f.logger.Printf("[ERR] snapshot: Failed to get meta data to open snapshot: %v", err) + return nil, nil, err + } + + // Open the state file + statePath := filepath.Join(f.path, id, stateFilePath) + fh, err := os.Open(statePath) + if err != nil { + f.logger.Printf("[ERR] snapshot: Failed to open state file: %v", err) + return nil, nil, err + } + + // Create a CRC64 hash + stateHash := crc64.New(crc64.MakeTable(crc64.ECMA)) + + // Compute the hash + _, err = io.Copy(stateHash, fh) + if err != nil { + f.logger.Printf("[ERR] snapshot: Failed to read state file: %v", err) + fh.Close() + return nil, nil, err + } + + // Verify the hash + computed := stateHash.Sum(nil) + if bytes.Compare(meta.CRC, computed) != 0 { + f.logger.Printf("[ERR] snapshot: CRC checksum failed (stored: %v computed: %v)", + meta.CRC, computed) + fh.Close() + return nil, nil, fmt.Errorf("CRC mismatch") + } + + // Seek to the start + if _, err := fh.Seek(0, 0); err != nil { + f.logger.Printf("[ERR] snapshot: State file seek failed: %v", err) + fh.Close() + return nil, nil, err + } + + // Return a buffered file + buffered := &bufferedFile{ + bh: bufio.NewReader(fh), + fh: fh, + } + + return &meta.SnapshotMeta, buffered, nil +} + +// ReapSnapshots reaps any snapshots beyond the retain count. 
+func (f *FileSnapshotStore) ReapSnapshots() error { + snapshots, err := f.getSnapshots() + if err != nil { + f.logger.Printf("[ERR] snapshot: Failed to get snapshots: %v", err) + return err + } + + for i := f.retain; i < len(snapshots); i++ { + path := filepath.Join(f.path, snapshots[i].ID) + f.logger.Printf("[INFO] snapshot: reaping snapshot %v", path) + if err := os.RemoveAll(path); err != nil { + f.logger.Printf("[ERR] snapshot: Failed to reap snapshot %v: %v", path, err) + return err + } + } + return nil +} + +// ID returns the ID of the snapshot, can be used with Open() +// after the snapshot is finalized. +func (s *FileSnapshotSink) ID() string { + return s.meta.ID +} + +// Write is used to append to the state file. We write to the +// buffered IO object to reduce the amount of context switches. +func (s *FileSnapshotSink) Write(b []byte) (int, error) { + return s.buffered.Write(b) +} + +// Close is used to indicate a successful end. +func (s *FileSnapshotSink) Close() error { + // Make sure close is idempotent + if s.closed { + return nil + } + s.closed = true + + // Close the open handles + if err := s.finalize(); err != nil { + s.logger.Printf("[ERR] snapshot: Failed to finalize snapshot: %v", err) + return err + } + + // Write out the meta data + if err := s.writeMeta(); err != nil { + s.logger.Printf("[ERR] snapshot: Failed to write metadata: %v", err) + return err + } + + // Move the directory into place + newPath := strings.TrimSuffix(s.dir, tmpSuffix) + if err := os.Rename(s.dir, newPath); err != nil { + s.logger.Printf("[ERR] snapshot: Failed to move snapshot into place: %v", err) + return err + } + + // Reap any old snapshots + if err := s.store.ReapSnapshots(); err != nil { + return err + } + + return nil +} + +// Cancel is used to indicate an unsuccessful end. 
+func (s *FileSnapshotSink) Cancel() error { + // Make sure close is idempotent + if s.closed { + return nil + } + s.closed = true + + // Close the open handles + if err := s.finalize(); err != nil { + s.logger.Printf("[ERR] snapshot: Failed to finalize snapshot: %v", err) + return err + } + + // Attempt to remove all artifacts + return os.RemoveAll(s.dir) +} + +// finalize is used to close all of our resources. +func (s *FileSnapshotSink) finalize() error { + // Flush any remaining data + if err := s.buffered.Flush(); err != nil { + return err + } + + // Get the file size + stat, statErr := s.stateFile.Stat() + + // Close the file + if err := s.stateFile.Close(); err != nil { + return err + } + + // Set the file size, check after we close + if statErr != nil { + return statErr + } + s.meta.Size = stat.Size() + + // Set the CRC + s.meta.CRC = s.stateHash.Sum(nil) + return nil +} + +// writeMeta is used to write out the metadata we have. +func (s *FileSnapshotSink) writeMeta() error { + // Open the meta file + metaPath := filepath.Join(s.dir, metaFilePath) + fh, err := os.Create(metaPath) + if err != nil { + return err + } + defer fh.Close() + + // Buffer the file IO + buffered := bufio.NewWriter(fh) + defer buffered.Flush() + + // Write out as JSON + enc := json.NewEncoder(buffered) + if err := enc.Encode(&s.meta); err != nil { + return err + } + return nil +} + +// Implement the sort interface for []*fileSnapshotMeta. 
+func (s snapMetaSlice) Len() int { + return len(s) +} + +func (s snapMetaSlice) Less(i, j int) bool { + if s[i].Term != s[j].Term { + return s[i].Term < s[j].Term + } + if s[i].Index != s[j].Index { + return s[i].Index < s[j].Index + } + return s[i].ID < s[j].ID +} + +func (s snapMetaSlice) Swap(i, j int) { + s[i], s[j] = s[j], s[i] +} diff --git a/vendor/github.com/hashicorp/consul/vendor/github.com/hashicorp/raft/fsm.go b/vendor/github.com/hashicorp/consul/vendor/github.com/hashicorp/raft/fsm.go new file mode 100644 index 00000000000..23da1e99b38 --- /dev/null +++ b/vendor/github.com/hashicorp/consul/vendor/github.com/hashicorp/raft/fsm.go @@ -0,0 +1,116 @@ +package raft + +import ( + "fmt" + "io" + "time" + + "github.com/armon/go-metrics" +) + +// FSM provides an interface that can be implemented by +// clients to make use of the replicated log. +type FSM interface { + // Apply log is invoked once a log entry is committed. + // It returns a value which will be made available in the + // ApplyFuture returned by Raft.Apply method if that + // method was called on the same Raft node as the FSM. + Apply(*Log) interface{} + + // Snapshot is used to support log compaction. This call should + // return an FSMSnapshot which can be used to save a point-in-time + // snapshot of the FSM. Apply and Snapshot are not called in multiple + // threads, but Apply will be called concurrently with Persist. This means + // the FSM should be implemented in a fashion that allows for concurrent + // updates while a snapshot is happening. + Snapshot() (FSMSnapshot, error) + + // Restore is used to restore an FSM from a snapshot. It is not called + // concurrently with any other command. The FSM must discard all previous + // state. + Restore(io.ReadCloser) error +} + +// FSMSnapshot is returned by an FSM in response to a Snapshot +// It must be safe to invoke FSMSnapshot methods with concurrent +// calls to Apply. 
+type FSMSnapshot interface { + // Persist should dump all necessary state to the WriteCloser 'sink', + // and call sink.Close() when finished or call sink.Cancel() on error. + Persist(sink SnapshotSink) error + + // Release is invoked when we are finished with the snapshot. + Release() +} + +// runFSM is a long running goroutine responsible for applying logs +// to the FSM. This is done async of other logs since we don't want +// the FSM to block our internal operations. +func (r *Raft) runFSM() { + var lastIndex, lastTerm uint64 + for { + select { + case req := <-r.fsmRestoreCh: + // Open the snapshot + meta, source, err := r.snapshots.Open(req.ID) + if err != nil { + req.respond(fmt.Errorf("failed to open snapshot %v: %v", req.ID, err)) + continue + } + + // Attempt to restore + start := time.Now() + if err := r.fsm.Restore(source); err != nil { + req.respond(fmt.Errorf("failed to restore snapshot %v: %v", req.ID, err)) + source.Close() + continue + } + source.Close() + metrics.MeasureSince([]string{"raft", "fsm", "restore"}, start) + + // Update the last index and term + lastIndex = meta.Index + lastTerm = meta.Term + req.respond(nil) + + case req := <-r.fsmSnapshotCh: + // Is there something to snapshot? 
+ if lastIndex == 0 { + req.respond(ErrNothingNewToSnapshot) + continue + } + + // Start a snapshot + start := time.Now() + snap, err := r.fsm.Snapshot() + metrics.MeasureSince([]string{"raft", "fsm", "snapshot"}, start) + + // Respond to the request + req.index = lastIndex + req.term = lastTerm + req.snapshot = snap + req.respond(err) + + case commitEntry := <-r.fsmCommitCh: + // Apply the log if a command + var resp interface{} + if commitEntry.log.Type == LogCommand { + start := time.Now() + resp = r.fsm.Apply(commitEntry.log) + metrics.MeasureSince([]string{"raft", "fsm", "apply"}, start) + } + + // Update the indexes + lastIndex = commitEntry.log.Index + lastTerm = commitEntry.log.Term + + // Invoke the future if given + if commitEntry.future != nil { + commitEntry.future.response = resp + commitEntry.future.respond(nil) + } + case <-r.shutdownCh: + return + } + } +} diff --git a/vendor/github.com/hashicorp/consul/vendor/github.com/hashicorp/raft/future.go b/vendor/github.com/hashicorp/consul/vendor/github.com/hashicorp/raft/future.go new file mode 100644 index 00000000000..67c74fc42ee --- /dev/null +++ b/vendor/github.com/hashicorp/consul/vendor/github.com/hashicorp/raft/future.go @@ -0,0 +1,245 @@ +package raft + +import ( + "sync" + "time" +) + +// Future is used to represent an action that may occur in the future. +type Future interface { + // Error blocks until the future arrives and then + // returns the error status of the future. + // This may be called any number of times - all + // calls will return the same value. + // Note that it is not OK to call this method + // twice concurrently on the same Future instance. + Error() error +} + +// IndexFuture is used for future actions that can result in a raft log entry +// being created. +type IndexFuture interface { + Future + + // Index holds the index of the newly applied log entry. + // This must not be called until after the Error method has returned. 
+ Index() uint64 +} + +// ApplyFuture is used for Apply and can return the FSM response. +type ApplyFuture interface { + IndexFuture + + // Response returns the FSM response as returned + // by the FSM.Apply method. This must not be called + // until after the Error method has returned. + Response() interface{} +} + +// ConfigurationFuture is used for GetConfiguration and can return the +// latest configuration in use by Raft. +type ConfigurationFuture interface { + IndexFuture + + // Configuration contains the latest configuration. This must + // not be called until after the Error method has returned. + Configuration() Configuration +} + +// errorFuture is used to return a static error. +type errorFuture struct { + err error +} + +func (e errorFuture) Error() error { + return e.err +} + +func (e errorFuture) Response() interface{} { + return nil +} + +func (e errorFuture) Index() uint64 { + return 0 +} + +// deferError can be embedded to allow a future +// to provide an error in the future. +type deferError struct { + err error + errCh chan error + responded bool +} + +func (d *deferError) init() { + d.errCh = make(chan error, 1) +} + +func (d *deferError) Error() error { + if d.err != nil { + // Note that when we've received a nil error, this + // won't trigger, but the channel is closed after + // send so we'll still return nil below. + return d.err + } + if d.errCh == nil { + panic("waiting for response on nil channel") + } + d.err = <-d.errCh + return d.err +} + +func (d *deferError) respond(err error) { + if d.errCh == nil { + return + } + if d.responded { + return + } + d.errCh <- err + close(d.errCh) + d.responded = true +} + +// There are several types of requests that cause a configuration entry to +// be appended to the log. These are encoded here for leaderLoop() to process. +// This is internal to a single server. 
+type configurationChangeFuture struct { + logFuture + req configurationChangeRequest +} + +// bootstrapFuture is used to attempt a live bootstrap of the cluster. See the +// Raft object's BootstrapCluster member function for more details. +type bootstrapFuture struct { + deferError + + // configuration is the proposed bootstrap configuration to apply. + configuration Configuration +} + +// logFuture is used to apply a log entry and waits until +// the log is considered committed. +type logFuture struct { + deferError + log Log + response interface{} + dispatch time.Time +} + +func (l *logFuture) Response() interface{} { + return l.response +} + +func (l *logFuture) Index() uint64 { + return l.log.Index +} + +type shutdownFuture struct { + raft *Raft +} + +func (s *shutdownFuture) Error() error { + if s.raft == nil { + return nil + } + s.raft.waitShutdown() + if closeable, ok := s.raft.trans.(WithClose); ok { + closeable.Close() + } + return nil +} + +// snapshotFuture is used for waiting on a snapshot to complete. +type snapshotFuture struct { + deferError +} + +// reqSnapshotFuture is used for requesting a snapshot start. +// It is only used internally. +type reqSnapshotFuture struct { + deferError + + // snapshot details provided by the FSM runner before responding + index uint64 + term uint64 + snapshot FSMSnapshot +} + +// restoreFuture is used for requesting an FSM to perform a +// snapshot restore. Used internally only. +type restoreFuture struct { + deferError + ID string +} + +// verifyFuture is used to verify the current node is still +// the leader. This is to prevent a stale read. +type verifyFuture struct { + deferError + notifyCh chan *verifyFuture + quorumSize int + votes int + voteLock sync.Mutex +} + +// configurationsFuture is used to retrieve the current configurations. This is +// used to allow safe access to this information outside of the main thread. 
+type configurationsFuture struct { + deferError + configurations configurations +} + +// Configuration returns the latest configuration in use by Raft. +func (c *configurationsFuture) Configuration() Configuration { + return c.configurations.latest +} + +// Index returns the index of the latest configuration in use by Raft. +func (c *configurationsFuture) Index() uint64 { + return c.configurations.latestIndex +} + +// vote is used to respond to a verifyFuture. +// This may block when responding on the notifyCh. +func (v *verifyFuture) vote(leader bool) { + v.voteLock.Lock() + defer v.voteLock.Unlock() + + // Guard against having notified already + if v.notifyCh == nil { + return + } + + if leader { + v.votes++ + if v.votes >= v.quorumSize { + v.notifyCh <- v + v.notifyCh = nil + } + } else { + v.notifyCh <- v + v.notifyCh = nil + } +} + +// appendFuture is used for waiting on a pipelined append +// entries RPC. +type appendFuture struct { + deferError + start time.Time + args *AppendEntriesRequest + resp *AppendEntriesResponse +} + +func (a *appendFuture) Start() time.Time { + return a.start +} + +func (a *appendFuture) Request() *AppendEntriesRequest { + return a.args +} + +func (a *appendFuture) Response() *AppendEntriesResponse { + return a.resp +} diff --git a/vendor/github.com/hashicorp/consul/vendor/github.com/hashicorp/raft/inmem_store.go b/vendor/github.com/hashicorp/consul/vendor/github.com/hashicorp/raft/inmem_store.go new file mode 100644 index 00000000000..e5d579e1b31 --- /dev/null +++ b/vendor/github.com/hashicorp/consul/vendor/github.com/hashicorp/raft/inmem_store.go @@ -0,0 +1,125 @@ +package raft + +import ( + "sync" +) + +// InmemStore implements the LogStore and StableStore interface. +// It should NOT EVER be used for production. It is used only for +// unit tests. Use the MDBStore implementation instead. 
+type InmemStore struct { + l sync.RWMutex + lowIndex uint64 + highIndex uint64 + logs map[uint64]*Log + kv map[string][]byte + kvInt map[string]uint64 +} + +// NewInmemStore returns a new in-memory backend. Do not ever +// use for production. Only for testing. +func NewInmemStore() *InmemStore { + i := &InmemStore{ + logs: make(map[uint64]*Log), + kv: make(map[string][]byte), + kvInt: make(map[string]uint64), + } + return i +} + +// FirstIndex implements the LogStore interface. +func (i *InmemStore) FirstIndex() (uint64, error) { + i.l.RLock() + defer i.l.RUnlock() + return i.lowIndex, nil +} + +// LastIndex implements the LogStore interface. +func (i *InmemStore) LastIndex() (uint64, error) { + i.l.RLock() + defer i.l.RUnlock() + return i.highIndex, nil +} + +// GetLog implements the LogStore interface. +func (i *InmemStore) GetLog(index uint64, log *Log) error { + i.l.RLock() + defer i.l.RUnlock() + l, ok := i.logs[index] + if !ok { + return ErrLogNotFound + } + *log = *l + return nil +} + +// StoreLog implements the LogStore interface. +func (i *InmemStore) StoreLog(log *Log) error { + return i.StoreLogs([]*Log{log}) +} + +// StoreLogs implements the LogStore interface. +func (i *InmemStore) StoreLogs(logs []*Log) error { + i.l.Lock() + defer i.l.Unlock() + for _, l := range logs { + i.logs[l.Index] = l + if i.lowIndex == 0 { + i.lowIndex = l.Index + } + if l.Index > i.highIndex { + i.highIndex = l.Index + } + } + return nil +} + +// DeleteRange implements the LogStore interface. +func (i *InmemStore) DeleteRange(min, max uint64) error { + i.l.Lock() + defer i.l.Unlock() + for j := min; j <= max; j++ { + delete(i.logs, j) + } + if min <= i.lowIndex { + i.lowIndex = max + 1 + } + if max >= i.highIndex { + i.highIndex = min - 1 + } + if i.lowIndex > i.highIndex { + i.lowIndex = 0 + i.highIndex = 0 + } + return nil +} + +// Set implements the StableStore interface. 
+func (i *InmemStore) Set(key []byte, val []byte) error { + i.l.Lock() + defer i.l.Unlock() + i.kv[string(key)] = val + return nil +} + +// Get implements the StableStore interface. +func (i *InmemStore) Get(key []byte) ([]byte, error) { + i.l.RLock() + defer i.l.RUnlock() + return i.kv[string(key)], nil +} + +// SetUint64 implements the StableStore interface. +func (i *InmemStore) SetUint64(key []byte, val uint64) error { + i.l.Lock() + defer i.l.Unlock() + i.kvInt[string(key)] = val + return nil +} + +// GetUint64 implements the StableStore interface. +func (i *InmemStore) GetUint64(key []byte) (uint64, error) { + i.l.RLock() + defer i.l.RUnlock() + return i.kvInt[string(key)], nil +} diff --git a/vendor/github.com/hashicorp/consul/vendor/github.com/hashicorp/raft/inmem_transport.go b/vendor/github.com/hashicorp/consul/vendor/github.com/hashicorp/raft/inmem_transport.go new file mode 100644 index 00000000000..3693cd5ad1e --- /dev/null +++ b/vendor/github.com/hashicorp/consul/vendor/github.com/hashicorp/raft/inmem_transport.go @@ -0,0 +1,322 @@ +package raft + +import ( + "fmt" + "io" + "sync" + "time" +) + +// NewInmemAddr returns a new in-memory addr with +// a randomly generate UUID as the ID. +func NewInmemAddr() ServerAddress { + return ServerAddress(generateUUID()) +} + +// inmemPipeline is used to pipeline requests for the in-mem transport. +type inmemPipeline struct { + trans *InmemTransport + peer *InmemTransport + peerAddr ServerAddress + + doneCh chan AppendFuture + inprogressCh chan *inmemPipelineInflight + + shutdown bool + shutdownCh chan struct{} + shutdownLock sync.Mutex +} + +type inmemPipelineInflight struct { + future *appendFuture + respCh <-chan RPCResponse +} + +// InmemTransport Implements the Transport interface, to allow Raft to be +// tested in-memory without going over a network. 
+type InmemTransport struct { + sync.RWMutex + consumerCh chan RPC + localAddr ServerAddress + peers map[ServerAddress]*InmemTransport + pipelines []*inmemPipeline + timeout time.Duration +} + +// NewInmemTransport is used to initialize a new transport +// and generates a random local address if none is specified +func NewInmemTransport(addr ServerAddress) (ServerAddress, *InmemTransport) { + if string(addr) == "" { + addr = NewInmemAddr() + } + trans := &InmemTransport{ + consumerCh: make(chan RPC, 16), + localAddr: addr, + peers: make(map[ServerAddress]*InmemTransport), + timeout: 50 * time.Millisecond, + } + return addr, trans +} + +// SetHeartbeatHandler is used to set optional fast-path for +// heartbeats, not supported for this transport. +func (i *InmemTransport) SetHeartbeatHandler(cb func(RPC)) { +} + +// Consumer implements the Transport interface. +func (i *InmemTransport) Consumer() <-chan RPC { + return i.consumerCh +} + +// LocalAddr implements the Transport interface. +func (i *InmemTransport) LocalAddr() ServerAddress { + return i.localAddr +} + +// AppendEntriesPipeline returns an interface that can be used to pipeline +// AppendEntries requests. +func (i *InmemTransport) AppendEntriesPipeline(target ServerAddress) (AppendPipeline, error) { + i.RLock() + peer, ok := i.peers[target] + i.RUnlock() + if !ok { + return nil, fmt.Errorf("failed to connect to peer: %v", target) + } + pipeline := newInmemPipeline(i, peer, target) + i.Lock() + i.pipelines = append(i.pipelines, pipeline) + i.Unlock() + return pipeline, nil +} + +// AppendEntries implements the Transport interface. +func (i *InmemTransport) AppendEntries(target ServerAddress, args *AppendEntriesRequest, resp *AppendEntriesResponse) error { + rpcResp, err := i.makeRPC(target, args, nil, i.timeout) + if err != nil { + return err + } + + // Copy the result back + out := rpcResp.Response.(*AppendEntriesResponse) + *resp = *out + return nil +} + +// RequestVote implements the Transport interface. 
+func (i *InmemTransport) RequestVote(target ServerAddress, args *RequestVoteRequest, resp *RequestVoteResponse) error { + rpcResp, err := i.makeRPC(target, args, nil, i.timeout) + if err != nil { + return err + } + + // Copy the result back + out := rpcResp.Response.(*RequestVoteResponse) + *resp = *out + return nil +} + +// InstallSnapshot implements the Transport interface. +func (i *InmemTransport) InstallSnapshot(target ServerAddress, args *InstallSnapshotRequest, resp *InstallSnapshotResponse, data io.Reader) error { + rpcResp, err := i.makeRPC(target, args, data, 10*i.timeout) + if err != nil { + return err + } + + // Copy the result back + out := rpcResp.Response.(*InstallSnapshotResponse) + *resp = *out + return nil +} + +func (i *InmemTransport) makeRPC(target ServerAddress, args interface{}, r io.Reader, timeout time.Duration) (rpcResp RPCResponse, err error) { + i.RLock() + peer, ok := i.peers[target] + i.RUnlock() + + if !ok { + err = fmt.Errorf("failed to connect to peer: %v", target) + return + } + + // Send the RPC over + respCh := make(chan RPCResponse) + peer.consumerCh <- RPC{ + Command: args, + Reader: r, + RespChan: respCh, + } + + // Wait for a response + select { + case rpcResp = <-respCh: + if rpcResp.Error != nil { + err = rpcResp.Error + } + case <-time.After(timeout): + err = fmt.Errorf("command timed out") + } + return +} + +// EncodePeer implements the Transport interface. +func (i *InmemTransport) EncodePeer(p ServerAddress) []byte { + return []byte(p) +} + +// DecodePeer implements the Transport interface. +func (i *InmemTransport) DecodePeer(buf []byte) ServerAddress { + return ServerAddress(buf) +} + +// Connect is used to connect this transport to another transport for +// a given peer name. This allows for local routing. 
+func (i *InmemTransport) Connect(peer ServerAddress, t Transport) { + trans := t.(*InmemTransport) + i.Lock() + defer i.Unlock() + i.peers[peer] = trans +} + +// Disconnect is used to remove the ability to route to a given peer. +func (i *InmemTransport) Disconnect(peer ServerAddress) { + i.Lock() + defer i.Unlock() + delete(i.peers, peer) + + // Disconnect any pipelines + n := len(i.pipelines) + for idx := 0; idx < n; idx++ { + if i.pipelines[idx].peerAddr == peer { + i.pipelines[idx].Close() + i.pipelines[idx], i.pipelines[n-1] = i.pipelines[n-1], nil + idx-- + n-- + } + } + i.pipelines = i.pipelines[:n] +} + +// DisconnectAll is used to remove all routes to peers. +func (i *InmemTransport) DisconnectAll() { + i.Lock() + defer i.Unlock() + i.peers = make(map[ServerAddress]*InmemTransport) + + // Handle pipelines + for _, pipeline := range i.pipelines { + pipeline.Close() + } + i.pipelines = nil +} + +// Close is used to permanently disable the transport +func (i *InmemTransport) Close() error { + i.DisconnectAll() + return nil +} + +func newInmemPipeline(trans *InmemTransport, peer *InmemTransport, addr ServerAddress) *inmemPipeline { + i := &inmemPipeline{ + trans: trans, + peer: peer, + peerAddr: addr, + doneCh: make(chan AppendFuture, 16), + inprogressCh: make(chan *inmemPipelineInflight, 16), + shutdownCh: make(chan struct{}), + } + go i.decodeResponses() + return i +} + +func (i *inmemPipeline) decodeResponses() { + timeout := i.trans.timeout + for { + select { + case inp := <-i.inprogressCh: + var timeoutCh <-chan time.Time + if timeout > 0 { + timeoutCh = time.After(timeout) + } + + select { + case rpcResp := <-inp.respCh: + // Copy the result back + *inp.future.resp = *rpcResp.Response.(*AppendEntriesResponse) + inp.future.respond(rpcResp.Error) + + select { + case i.doneCh <- inp.future: + case <-i.shutdownCh: + return + } + + case <-timeoutCh: + inp.future.respond(fmt.Errorf("command timed out")) + select { + case i.doneCh <- inp.future: + case 
<-i.shutdownCh: + return + } + + case <-i.shutdownCh: + return + } + case <-i.shutdownCh: + return + } + } +} + +func (i *inmemPipeline) AppendEntries(args *AppendEntriesRequest, resp *AppendEntriesResponse) (AppendFuture, error) { + // Create a new future + future := &appendFuture{ + start: time.Now(), + args: args, + resp: resp, + } + future.init() + + // Handle a timeout + var timeout <-chan time.Time + if i.trans.timeout > 0 { + timeout = time.After(i.trans.timeout) + } + + // Send the RPC over + respCh := make(chan RPCResponse, 1) + rpc := RPC{ + Command: args, + RespChan: respCh, + } + select { + case i.peer.consumerCh <- rpc: + case <-timeout: + return nil, fmt.Errorf("command enqueue timeout") + case <-i.shutdownCh: + return nil, ErrPipelineShutdown + } + + // Send to be decoded + select { + case i.inprogressCh <- &inmemPipelineInflight{future, respCh}: + return future, nil + case <-i.shutdownCh: + return nil, ErrPipelineShutdown + } +} + +func (i *inmemPipeline) Consumer() <-chan AppendFuture { + return i.doneCh +} + +func (i *inmemPipeline) Close() error { + i.shutdownLock.Lock() + defer i.shutdownLock.Unlock() + if i.shutdown { + return nil + } + + i.shutdown = true + close(i.shutdownCh) + return nil +} diff --git a/vendor/github.com/hashicorp/consul/vendor/github.com/hashicorp/raft/log.go b/vendor/github.com/hashicorp/consul/vendor/github.com/hashicorp/raft/log.go new file mode 100644 index 00000000000..4ade38ecc12 --- /dev/null +++ b/vendor/github.com/hashicorp/consul/vendor/github.com/hashicorp/raft/log.go @@ -0,0 +1,72 @@ +package raft + +// LogType describes various types of log entries. +type LogType uint8 + +const ( + // LogCommand is applied to a user FSM. + LogCommand LogType = iota + + // LogNoop is used to assert leadership. + LogNoop + + // LogAddPeer is used to add a new peer. This should only be used with + // older protocol versions designed to be compatible with unversioned + // Raft servers. See comments in config.go for details. 
+ LogAddPeerDeprecated + + // LogRemovePeer is used to remove an existing peer. This should only be + // used with older protocol versions designed to be compatible with + // unversioned Raft servers. See comments in config.go for details. + LogRemovePeerDeprecated + + // LogBarrier is used to ensure all preceding operations have been + // applied to the FSM. It is similar to LogNoop, but instead of returning + // once committed, it only returns once the FSM manager acks it. Otherwise + // it is possible there are operations committed but not yet applied to + // the FSM. + LogBarrier + + // LogConfiguration establishes a membership change configuration. It is + // created when a server is added, removed, promoted, etc. Only used + // when protocol version 1 or greater is in use. + LogConfiguration +) + +// Log entries are replicated to all members of the Raft cluster +// and form the heart of the replicated state machine. +type Log struct { + // Index holds the index of the log entry. + Index uint64 + + // Term holds the election term of the log entry. + Term uint64 + + // Type holds the type of the log entry. + Type LogType + + // Data holds the log entry's type-specific data. + Data []byte +} + +// LogStore is used to provide an interface for storing +// and retrieving logs in a durable fashion. +type LogStore interface { + // FirstIndex returns the first index written. 0 for no entries. + FirstIndex() (uint64, error) + + // LastIndex returns the last index written. 0 for no entries. + LastIndex() (uint64, error) + + // GetLog gets a log entry at a given index. + GetLog(index uint64, log *Log) error + + // StoreLog stores a log entry. + StoreLog(log *Log) error + + // StoreLogs stores multiple log entries. + StoreLogs(logs []*Log) error + + // DeleteRange deletes a range of log entries. The range is inclusive. 
+ DeleteRange(min, max uint64) error +} diff --git a/vendor/github.com/hashicorp/consul/vendor/github.com/hashicorp/raft/log_cache.go b/vendor/github.com/hashicorp/consul/vendor/github.com/hashicorp/raft/log_cache.go new file mode 100644 index 00000000000..952e98c2282 --- /dev/null +++ b/vendor/github.com/hashicorp/consul/vendor/github.com/hashicorp/raft/log_cache.go @@ -0,0 +1,79 @@ +package raft + +import ( + "fmt" + "sync" +) + +// LogCache wraps any LogStore implementation to provide an +// in-memory ring buffer. This is used to cache access to +// the recently written entries. For implementations that do not +// cache themselves, this can provide a substantial boost by +// avoiding disk I/O on recent entries. +type LogCache struct { + store LogStore + + cache []*Log + l sync.RWMutex +} + +// NewLogCache is used to create a new LogCache with the +// given capacity and backend store. +func NewLogCache(capacity int, store LogStore) (*LogCache, error) { + if capacity <= 0 { + return nil, fmt.Errorf("capacity must be positive") + } + c := &LogCache{ + store: store, + cache: make([]*Log, capacity), + } + return c, nil +} + +func (c *LogCache) GetLog(idx uint64, log *Log) error { + // Check the buffer for an entry + c.l.RLock() + cached := c.cache[idx%uint64(len(c.cache))] + c.l.RUnlock() + + // Check if entry is valid + if cached != nil && cached.Index == idx { + *log = *cached + return nil + } + + // Forward request on cache miss + return c.store.GetLog(idx, log) +} + +func (c *LogCache) StoreLog(log *Log) error { + return c.StoreLogs([]*Log{log}) +} + +func (c *LogCache) StoreLogs(logs []*Log) error { + // Insert the logs into the ring buffer + c.l.Lock() + for _, l := range logs { + c.cache[l.Index%uint64(len(c.cache))] = l + } + c.l.Unlock() + + return c.store.StoreLogs(logs) +} + +func (c *LogCache) FirstIndex() (uint64, error) { + return c.store.FirstIndex() +} + +func (c *LogCache) LastIndex() (uint64, error) { + return c.store.LastIndex() +} + +func (c 
*LogCache) DeleteRange(min, max uint64) error { + // Invalidate the cache on deletes + c.l.Lock() + c.cache = make([]*Log, len(c.cache)) + c.l.Unlock() + + return c.store.DeleteRange(min, max) +} diff --git a/vendor/github.com/hashicorp/consul/vendor/github.com/hashicorp/raft/membership.md b/vendor/github.com/hashicorp/consul/vendor/github.com/hashicorp/raft/membership.md new file mode 100644 index 00000000000..df1f83e27f6 --- /dev/null +++ b/vendor/github.com/hashicorp/consul/vendor/github.com/hashicorp/raft/membership.md @@ -0,0 +1,83 @@ +Simon (@superfell) and I (@ongardie) talked through reworking this library's cluster membership changes last Friday. We don't see a way to split this into independent patches, so we're taking the next best approach: submitting the plan here for review, then working on an enormous PR. Your feedback would be appreciated. (@superfell is out this week, however, so don't expect him to respond quickly.) + +These are the main goals: + - Bringing things in line with the description in my PhD dissertation; + - Catching up new servers prior to granting them a vote, as well as allowing permanent non-voting members; and + - Eliminating the `peers.json` file, to avoid issues of consistency between that and the log/snapshot. + +## Data-centric view + +We propose to re-define a *configuration* as a set of servers, where each server includes an address (as it does today) and a mode that is either: + - *Voter*: a server whose vote is counted in elections and whose match index is used in advancing the leader's commit index. + - *Nonvoter*: a server that receives log entries but is not considered for elections or commitment purposes. + - *Staging*: a server that acts like a nonvoter with one exception: once a staging server receives enough log entries to catch up sufficiently to the leader's log, the leader will invoke a membership change to change the staging server to a voter. 
+ +All changes to the configuration will be done by writing a new configuration to the log. The new configuration will be in effect as soon as it is appended to the log (not when it is committed like a normal state machine command). Note that, per my dissertation, there can be at most one uncommitted configuration at a time (the next configuration may not be created until the prior one has been committed). It's not strictly necessary to follow these same rules for the nonvoter/staging servers, but we think it's best to treat all changes uniformly. + +Each server will track two configurations: + 1. its *committed configuration*: the latest configuration in the log/snapshot that has been committed, along with its index. + 2. its *latest configuration*: the latest configuration in the log/snapshot (may be committed or uncommitted), along with its index. + +When there's no membership change happening, these two will be the same. The latest configuration is almost always the one used, except: + - When followers truncate the suffix of their logs, they may need to fall back to the committed configuration. + - When snapshotting, the committed configuration is written, to correspond with the committed log prefix that is being snapshotted. + + +## Application API + +We propose the following operations for clients to manipulate the cluster configuration: + - AddVoter: server becomes staging unless voter, + - AddNonvoter: server becomes nonvoter unless staging or voter, + - DemoteVoter: server becomes nonvoter unless absent, + - RemovePeer: server removed from configuration, + - GetConfiguration: waits for latest config to commit, returns committed config. 
+ +This diagram, of which I'm quite proud, shows the possible transitions: +``` ++-----------------------------------------------------------------------------+ +| | +| Start -> +--------+ | +| ,------<------------| | | +| / | absent | | +| / RemovePeer--> | | <---RemovePeer | +| / | +--------+ \ | +| / | | \ | +| AddNonvoter | AddVoter \ | +| | ,->---' `--<-. | \ | +| v / \ v \ | +| +----------+ +----------+ +----------+ | +| | | ---AddVoter--> | | -log caught up --> | | | +| | nonvoter | | staging | | voter | | +| | | <-DemoteVoter- | | ,- | | | +| +----------+ \ +----------+ / +----------+ | +| \ / | +| `--------------<---------------' | +| | ++-----------------------------------------------------------------------------+ +``` + +While these operations aren't quite symmetric, we think they're a good set to capture +the possible intent of the user. For example, if I want to make sure a server doesn't have a vote, but the server isn't part of the configuration at all, it probably shouldn't be added as a nonvoting server. + +Each of these application-level operations will be interpreted by the leader and, if it has an effect, will cause the leader to write a new configuration entry to its log. Which particular application-level operation caused the log entry to be written need not be part of the log entry. + +## Code implications + +This is a non-exhaustive list, but we came up with a few things: +- Remove the PeerStore: the `peers.json` file introduces the possibility of getting out of sync with the log and snapshot, and it's hard to maintain this atomically as the log changes. It's not clear whether it's meant to track the committed or latest configuration, either. +- Servers will have to search their snapshot and log to find the committed configuration and the latest configuration on startup. +- Bootstrap will no longer use `peers.json` but should initialize the log or snapshot with an application-provided configuration entry. 
+- Snapshots should store the index of their configuration along with the configuration itself. In my experience with LogCabin, the original log index of the configuration is very useful to include in debug log messages. +- As noted in hashicorp/raft#84, configuration change requests should come in via a separate channel, and one may not proceed until the last has been committed. +- As to deciding when a log is sufficiently caught up, implementing a sophisticated algorithm *is* something that can be done in a separate PR. An easy and decent placeholder is: once the staging server has reached 95% of the leader's commit index, promote it. + +## Feedback + +Again, we're looking for feedback here before we start working on this. Here are some questions to think about: + - Does this seem like where we want things to go? + - Is there anything here that should be left out? + - Is there anything else we're forgetting about? + - Is there a good way to break this up? + - What do we need to worry about in terms of backwards compatibility? + - What implication will this have on current tests? + - What's the best way to test this code, in particular the small changes that will be sprinkled all over the library? diff --git a/vendor/github.com/hashicorp/consul/vendor/github.com/hashicorp/raft/net_transport.go b/vendor/github.com/hashicorp/consul/vendor/github.com/hashicorp/raft/net_transport.go new file mode 100644 index 00000000000..7c55ac5371f --- /dev/null +++ b/vendor/github.com/hashicorp/consul/vendor/github.com/hashicorp/raft/net_transport.go @@ -0,0 +1,622 @@ +package raft + +import ( + "bufio" + "errors" + "fmt" + "io" + "log" + "net" + "os" + "sync" + "time" + + "github.com/hashicorp/go-msgpack/codec" +) + +const ( + rpcAppendEntries uint8 = iota + rpcRequestVote + rpcInstallSnapshot + + // DefaultTimeoutScale is the default TimeoutScale in a NetworkTransport. 
+ DefaultTimeoutScale = 256 * 1024 // 256KB + + // rpcMaxPipeline controls the maximum number of outstanding + // AppendEntries RPC calls. + rpcMaxPipeline = 128 +) + +var ( + // ErrTransportShutdown is returned when operations on a transport are + // invoked after it's been terminated. + ErrTransportShutdown = errors.New("transport shutdown") + + // ErrPipelineShutdown is returned when the pipeline is closed. + ErrPipelineShutdown = errors.New("append pipeline closed") +) + +/* + +NetworkTransport provides a network based transport that can be +used to communicate with Raft on remote machines. It requires +an underlying stream layer to provide a stream abstraction, which can +be simple TCP, TLS, etc. + +This transport is very simple and lightweight. Each RPC request is +framed by sending a byte that indicates the message type, followed +by the MsgPack encoded request. + +The response is an error string followed by the response object, +both are encoded using MsgPack. + +InstallSnapshot is special, in that after the RPC request we stream +the entire state. That socket is not re-used as the connection state +is not known if there is an error. + +*/ +type NetworkTransport struct { + connPool map[ServerAddress][]*netConn + connPoolLock sync.Mutex + + consumeCh chan RPC + + heartbeatFn func(RPC) + heartbeatFnLock sync.Mutex + + logger *log.Logger + + maxPool int + + shutdown bool + shutdownCh chan struct{} + shutdownLock sync.Mutex + + stream StreamLayer + + timeout time.Duration + TimeoutScale int +} + +// StreamLayer is used with the NetworkTransport to provide +// the low level stream abstraction. 
+type StreamLayer interface { + net.Listener + + // Dial is used to create a new outgoing connection + Dial(address ServerAddress, timeout time.Duration) (net.Conn, error) +} + +type netConn struct { + target ServerAddress + conn net.Conn + r *bufio.Reader + w *bufio.Writer + dec *codec.Decoder + enc *codec.Encoder +} + +func (n *netConn) Release() error { + return n.conn.Close() +} + +type netPipeline struct { + conn *netConn + trans *NetworkTransport + + doneCh chan AppendFuture + inprogressCh chan *appendFuture + + shutdown bool + shutdownCh chan struct{} + shutdownLock sync.Mutex +} + +// NewNetworkTransport creates a new network transport with the given dialer +// and listener. The maxPool controls how many connections we will pool. The +// timeout is used to apply I/O deadlines. For InstallSnapshot, we multiply +// the timeout by (SnapshotSize / TimeoutScale). +func NewNetworkTransport( + stream StreamLayer, + maxPool int, + timeout time.Duration, + logOutput io.Writer, +) *NetworkTransport { + if logOutput == nil { + logOutput = os.Stderr + } + return NewNetworkTransportWithLogger(stream, maxPool, timeout, log.New(logOutput, "", log.LstdFlags)) +} + +// NewNetworkTransportWithLogger creates a new network transport with the given dialer +// and listener. The maxPool controls how many connections we will pool. The +// timeout is used to apply I/O deadlines. For InstallSnapshot, we multiply +// the timeout by (SnapshotSize / TimeoutScale). 
+func NewNetworkTransportWithLogger( + stream StreamLayer, + maxPool int, + timeout time.Duration, + logger *log.Logger, +) *NetworkTransport { + if logger == nil { + logger = log.New(os.Stderr, "", log.LstdFlags) + } + trans := &NetworkTransport{ + connPool: make(map[ServerAddress][]*netConn), + consumeCh: make(chan RPC), + logger: logger, + maxPool: maxPool, + shutdownCh: make(chan struct{}), + stream: stream, + timeout: timeout, + TimeoutScale: DefaultTimeoutScale, + } + go trans.listen() + return trans +} + +// SetHeartbeatHandler is used to setup a heartbeat handler +// as a fast-pass. This is to avoid head-of-line blocking from +// disk IO. +func (n *NetworkTransport) SetHeartbeatHandler(cb func(rpc RPC)) { + n.heartbeatFnLock.Lock() + defer n.heartbeatFnLock.Unlock() + n.heartbeatFn = cb +} + +// Close is used to stop the network transport. +func (n *NetworkTransport) Close() error { + n.shutdownLock.Lock() + defer n.shutdownLock.Unlock() + + if !n.shutdown { + close(n.shutdownCh) + n.stream.Close() + n.shutdown = true + } + return nil +} + +// Consumer implements the Transport interface. +func (n *NetworkTransport) Consumer() <-chan RPC { + return n.consumeCh +} + +// LocalAddr implements the Transport interface. +func (n *NetworkTransport) LocalAddr() ServerAddress { + return ServerAddress(n.stream.Addr().String()) +} + +// IsShutdown is used to check if the transport is shutdown. +func (n *NetworkTransport) IsShutdown() bool { + select { + case <-n.shutdownCh: + return true + default: + return false + } +} + +// getExistingConn is used to grab a pooled connection. 
+func (n *NetworkTransport) getPooledConn(target ServerAddress) *netConn { + n.connPoolLock.Lock() + defer n.connPoolLock.Unlock() + + conns, ok := n.connPool[target] + if !ok || len(conns) == 0 { + return nil + } + + var conn *netConn + num := len(conns) + conn, conns[num-1] = conns[num-1], nil + n.connPool[target] = conns[:num-1] + return conn +} + +// getConn is used to get a connection from the pool. +func (n *NetworkTransport) getConn(target ServerAddress) (*netConn, error) { + // Check for a pooled conn + if conn := n.getPooledConn(target); conn != nil { + return conn, nil + } + + // Dial a new connection + conn, err := n.stream.Dial(target, n.timeout) + if err != nil { + return nil, err + } + + // Wrap the conn + netConn := &netConn{ + target: target, + conn: conn, + r: bufio.NewReader(conn), + w: bufio.NewWriter(conn), + } + + // Setup encoder/decoders + netConn.dec = codec.NewDecoder(netConn.r, &codec.MsgpackHandle{}) + netConn.enc = codec.NewEncoder(netConn.w, &codec.MsgpackHandle{}) + + // Done + return netConn, nil +} + +// returnConn returns a connection back to the pool. +func (n *NetworkTransport) returnConn(conn *netConn) { + n.connPoolLock.Lock() + defer n.connPoolLock.Unlock() + + key := conn.target + conns, _ := n.connPool[key] + + if !n.IsShutdown() && len(conns) < n.maxPool { + n.connPool[key] = append(conns, conn) + } else { + conn.Release() + } +} + +// AppendEntriesPipeline returns an interface that can be used to pipeline +// AppendEntries requests. +func (n *NetworkTransport) AppendEntriesPipeline(target ServerAddress) (AppendPipeline, error) { + // Get a connection + conn, err := n.getConn(target) + if err != nil { + return nil, err + } + + // Create the pipeline + return newNetPipeline(n, conn), nil +} + +// AppendEntries implements the Transport interface. 
+func (n *NetworkTransport) AppendEntries(target ServerAddress, args *AppendEntriesRequest, resp *AppendEntriesResponse) error { + return n.genericRPC(target, rpcAppendEntries, args, resp) +} + +// RequestVote implements the Transport interface. +func (n *NetworkTransport) RequestVote(target ServerAddress, args *RequestVoteRequest, resp *RequestVoteResponse) error { + return n.genericRPC(target, rpcRequestVote, args, resp) +} + +// genericRPC handles a simple request/response RPC. +func (n *NetworkTransport) genericRPC(target ServerAddress, rpcType uint8, args interface{}, resp interface{}) error { + // Get a conn + conn, err := n.getConn(target) + if err != nil { + return err + } + + // Set a deadline + if n.timeout > 0 { + conn.conn.SetDeadline(time.Now().Add(n.timeout)) + } + + // Send the RPC + if err = sendRPC(conn, rpcType, args); err != nil { + return err + } + + // Decode the response + canReturn, err := decodeResponse(conn, resp) + if canReturn { + n.returnConn(conn) + } + return err +} + +// InstallSnapshot implements the Transport interface. 
+func (n *NetworkTransport) InstallSnapshot(target ServerAddress, args *InstallSnapshotRequest, resp *InstallSnapshotResponse, data io.Reader) error { + // Get a conn, always close for InstallSnapshot + conn, err := n.getConn(target) + if err != nil { + return err + } + defer conn.Release() + + // Set a deadline, scaled by request size + if n.timeout > 0 { + timeout := n.timeout * time.Duration(args.Size/int64(n.TimeoutScale)) + if timeout < n.timeout { + timeout = n.timeout + } + conn.conn.SetDeadline(time.Now().Add(timeout)) + } + + // Send the RPC + if err = sendRPC(conn, rpcInstallSnapshot, args); err != nil { + return err + } + + // Stream the state + if _, err = io.Copy(conn.w, data); err != nil { + return err + } + + // Flush + if err = conn.w.Flush(); err != nil { + return err + } + + // Decode the response, do not return conn + _, err = decodeResponse(conn, resp) + return err +} + +// EncodePeer implements the Transport interface. +func (n *NetworkTransport) EncodePeer(p ServerAddress) []byte { + return []byte(p) +} + +// DecodePeer implements the Transport interface. +func (n *NetworkTransport) DecodePeer(buf []byte) ServerAddress { + return ServerAddress(buf) +} + +// listen is used to handling incoming connections. +func (n *NetworkTransport) listen() { + for { + // Accept incoming connections + conn, err := n.stream.Accept() + if err != nil { + if n.IsShutdown() { + return + } + n.logger.Printf("[ERR] raft-net: Failed to accept connection: %v", err) + continue + } + n.logger.Printf("[DEBUG] raft-net: %v accepted connection from: %v", n.LocalAddr(), conn.RemoteAddr()) + + // Handle the connection in dedicated routine + go n.handleConn(conn) + } +} + +// handleConn is used to handle an inbound connection for its lifespan. 
+func (n *NetworkTransport) handleConn(conn net.Conn) { + defer conn.Close() + r := bufio.NewReader(conn) + w := bufio.NewWriter(conn) + dec := codec.NewDecoder(r, &codec.MsgpackHandle{}) + enc := codec.NewEncoder(w, &codec.MsgpackHandle{}) + + for { + if err := n.handleCommand(r, dec, enc); err != nil { + if err != io.EOF { + n.logger.Printf("[ERR] raft-net: Failed to decode incoming command: %v", err) + } + return + } + if err := w.Flush(); err != nil { + n.logger.Printf("[ERR] raft-net: Failed to flush response: %v", err) + return + } + } +} + +// handleCommand is used to decode and dispatch a single command. +func (n *NetworkTransport) handleCommand(r *bufio.Reader, dec *codec.Decoder, enc *codec.Encoder) error { + // Get the rpc type + rpcType, err := r.ReadByte() + if err != nil { + return err + } + + // Create the RPC object + respCh := make(chan RPCResponse, 1) + rpc := RPC{ + RespChan: respCh, + } + + // Decode the command + isHeartbeat := false + switch rpcType { + case rpcAppendEntries: + var req AppendEntriesRequest + if err := dec.Decode(&req); err != nil { + return err + } + rpc.Command = &req + + // Check if this is a heartbeat + if req.Term != 0 && req.Leader != nil && + req.PrevLogEntry == 0 && req.PrevLogTerm == 0 && + len(req.Entries) == 0 && req.LeaderCommitIndex == 0 { + isHeartbeat = true + } + + case rpcRequestVote: + var req RequestVoteRequest + if err := dec.Decode(&req); err != nil { + return err + } + rpc.Command = &req + + case rpcInstallSnapshot: + var req InstallSnapshotRequest + if err := dec.Decode(&req); err != nil { + return err + } + rpc.Command = &req + rpc.Reader = io.LimitReader(r, req.Size) + + default: + return fmt.Errorf("unknown rpc type %d", rpcType) + } + + // Check for heartbeat fast-path + if isHeartbeat { + n.heartbeatFnLock.Lock() + fn := n.heartbeatFn + n.heartbeatFnLock.Unlock() + if fn != nil { + fn(rpc) + goto RESP + } + } + + // Dispatch the RPC + select { + case n.consumeCh <- rpc: + case <-n.shutdownCh: + 
return ErrTransportShutdown + } + + // Wait for response +RESP: + select { + case resp := <-respCh: + // Send the error first + respErr := "" + if resp.Error != nil { + respErr = resp.Error.Error() + } + if err := enc.Encode(respErr); err != nil { + return err + } + + // Send the response + if err := enc.Encode(resp.Response); err != nil { + return err + } + case <-n.shutdownCh: + return ErrTransportShutdown + } + return nil +} + +// decodeResponse is used to decode an RPC response and reports whether +// the connection can be reused. +func decodeResponse(conn *netConn, resp interface{}) (bool, error) { + // Decode the error if any + var rpcError string + if err := conn.dec.Decode(&rpcError); err != nil { + conn.Release() + return false, err + } + + // Decode the response + if err := conn.dec.Decode(resp); err != nil { + conn.Release() + return false, err + } + + // Format an error if any + if rpcError != "" { + return true, fmt.Errorf(rpcError) + } + return true, nil +} + +// sendRPC is used to encode and send the RPC. +func sendRPC(conn *netConn, rpcType uint8, args interface{}) error { + // Write the request type + if err := conn.w.WriteByte(rpcType); err != nil { + conn.Release() + return err + } + + // Send the request + if err := conn.enc.Encode(args); err != nil { + conn.Release() + return err + } + + // Flush + if err := conn.w.Flush(); err != nil { + conn.Release() + return err + } + return nil +} + +// newNetPipeline is used to construct a netPipeline from a given +// transport and connection. +func newNetPipeline(trans *NetworkTransport, conn *netConn) *netPipeline { + n := &netPipeline{ + conn: conn, + trans: trans, + doneCh: make(chan AppendFuture, rpcMaxPipeline), + inprogressCh: make(chan *appendFuture, rpcMaxPipeline), + shutdownCh: make(chan struct{}), + } + go n.decodeResponses() + return n +} + +// decodeResponses is a long running routine that decodes the responses +// sent on the connection. 
+func (n *netPipeline) decodeResponses() { + timeout := n.trans.timeout + for { + select { + case future := <-n.inprogressCh: + if timeout > 0 { + n.conn.conn.SetReadDeadline(time.Now().Add(timeout)) + } + + _, err := decodeResponse(n.conn, future.resp) + future.respond(err) + select { + case n.doneCh <- future: + case <-n.shutdownCh: + return + } + case <-n.shutdownCh: + return + } + } +} + +// AppendEntries is used to pipeline a new append entries request. +func (n *netPipeline) AppendEntries(args *AppendEntriesRequest, resp *AppendEntriesResponse) (AppendFuture, error) { + // Create a new future + future := &appendFuture{ + start: time.Now(), + args: args, + resp: resp, + } + future.init() + + // Add a send timeout + if timeout := n.trans.timeout; timeout > 0 { + n.conn.conn.SetWriteDeadline(time.Now().Add(timeout)) + } + + // Send the RPC + if err := sendRPC(n.conn, rpcAppendEntries, future.args); err != nil { + return nil, err + } + + // Hand-off for decoding, this can also cause back-pressure + // to prevent too many inflight requests + select { + case n.inprogressCh <- future: + return future, nil + case <-n.shutdownCh: + return nil, ErrPipelineShutdown + } +} + +// Consumer returns a channel that can be used to consume complete futures. +func (n *netPipeline) Consumer() <-chan AppendFuture { + return n.doneCh +} + +// Closed is used to shutdown the pipeline connection. 
+func (n *netPipeline) Close() error { + n.shutdownLock.Lock() + defer n.shutdownLock.Unlock() + if n.shutdown { + return nil + } + + // Release the connection + n.conn.Release() + + n.shutdown = true + close(n.shutdownCh) + return nil +} diff --git a/vendor/github.com/hashicorp/consul/vendor/github.com/hashicorp/raft/observer.go b/vendor/github.com/hashicorp/consul/vendor/github.com/hashicorp/raft/observer.go new file mode 100644 index 00000000000..22500fa875f --- /dev/null +++ b/vendor/github.com/hashicorp/consul/vendor/github.com/hashicorp/raft/observer.go @@ -0,0 +1,115 @@ +package raft + +import ( + "sync/atomic" +) + +// Observation is sent along the given channel to observers when an event occurs. +type Observation struct { + // Raft holds the Raft instance generating the observation. + Raft *Raft + // Data holds observation-specific data. Possible types are + // *RequestVoteRequest and RaftState. + Data interface{} +} + +// nextObserverId is used to provide a unique ID for each observer to aid in +// deregistration. +var nextObserverID uint64 + +// FilterFn is a function that can be registered in order to filter observations. +// The function reports whether the observation should be included - if +// it returns false, the observation will be filtered out. +type FilterFn func(o *Observation) bool + +// Observer describes what to do with a given observation. +type Observer struct { + // channel receives observations. + channel chan Observation + + // blocking, if true, will cause Raft to block when sending an observation + // to this observer. This should generally be set to false. + blocking bool + + // filter will be called to determine if an observation should be sent to + // the channel. + filter FilterFn + + // id is the ID of this observer in the Raft map. + id uint64 + + // numObserved and numDropped are performance counters for this observer. 
+ numObserved uint64 + numDropped uint64 +} + +// NewObserver creates a new observer that can be registered +// to make observations on a Raft instance. Observations +// will be sent on the given channel if they satisfy the +// given filter. +// +// If blocking is true, the observer will block when it can't +// send on the channel, otherwise it may discard events. +func NewObserver(channel chan Observation, blocking bool, filter FilterFn) *Observer { + return &Observer{ + channel: channel, + blocking: blocking, + filter: filter, + id: atomic.AddUint64(&nextObserverID, 1), + } +} + +// GetNumObserved returns the number of observations. +func (or *Observer) GetNumObserved() uint64 { + return atomic.LoadUint64(&or.numObserved) +} + +// GetNumDropped returns the number of dropped observations due to blocking. +func (or *Observer) GetNumDropped() uint64 { + return atomic.LoadUint64(&or.numDropped) +} + +// RegisterObserver registers a new observer. +func (r *Raft) RegisterObserver(or *Observer) { + r.observersLock.Lock() + defer r.observersLock.Unlock() + r.observers[or.id] = or +} + +// DeregisterObserver deregisters an observer. +func (r *Raft) DeregisterObserver(or *Observer) { + r.observersLock.Lock() + defer r.observersLock.Unlock() + delete(r.observers, or.id) +} + +// observe sends an observation to every observer. +func (r *Raft) observe(o interface{}) { + // In general observers should not block. But in any case this isn't + // disastrous as we only hold a read lock, which merely prevents + // registration / deregistration of observers. + r.observersLock.RLock() + defer r.observersLock.RUnlock() + for _, or := range r.observers { + // It's wasteful to do this in the loop, but for the common case + // where there are no observers we won't create any objects. 
+ ob := Observation{Raft: r, Data: o} + if or.filter != nil && !or.filter(&ob) { + continue + } + if or.channel == nil { + continue + } + if or.blocking { + or.channel <- ob + atomic.AddUint64(&or.numObserved, 1) + } else { + select { + case or.channel <- ob: + atomic.AddUint64(&or.numObserved, 1) + default: + atomic.AddUint64(&or.numDropped, 1) + } + } + } +} diff --git a/vendor/github.com/hashicorp/consul/vendor/github.com/hashicorp/raft/peersjson.go b/vendor/github.com/hashicorp/consul/vendor/github.com/hashicorp/raft/peersjson.go new file mode 100644 index 00000000000..c55fdbb43dd --- /dev/null +++ b/vendor/github.com/hashicorp/consul/vendor/github.com/hashicorp/raft/peersjson.go @@ -0,0 +1,46 @@ +package raft + +import ( + "bytes" + "encoding/json" + "io/ioutil" +) + +// ReadPeersJSON consumes a legacy peers.json file in the format of the old JSON +// peer store and creates a new-style configuration structure. This can be used +// to migrate this data or perform manual recovery when running protocol versions +// that can interoperate with older, unversioned Raft servers. This should not be +// used once server IDs are in use, because the old peers.json file didn't have +// support for these, nor non-voter suffrage types. +func ReadPeersJSON(path string) (Configuration, error) { + // Read in the file. + buf, err := ioutil.ReadFile(path) + if err != nil { + return Configuration{}, err + } + + // Parse it as JSON. + var peers []string + dec := json.NewDecoder(bytes.NewReader(buf)) + if err := dec.Decode(&peers); err != nil { + return Configuration{}, err + } + + // Map it into the new-style configuration structure. We can only specify + // voter roles here, and the ID has to be the same as the address. 
+ var configuration Configuration + for _, peer := range peers { + server := Server{ + Suffrage: Voter, + ID: ServerID(peer), + Address: ServerAddress(peer), + } + configuration.Servers = append(configuration.Servers, server) + } + + // We should only ingest valid configurations. + if err := checkConfiguration(configuration); err != nil { + return Configuration{}, err + } + return configuration, nil +} diff --git a/vendor/github.com/hashicorp/consul/vendor/github.com/hashicorp/raft/raft.go b/vendor/github.com/hashicorp/consul/vendor/github.com/hashicorp/raft/raft.go new file mode 100644 index 00000000000..a6c729413a8 --- /dev/null +++ b/vendor/github.com/hashicorp/consul/vendor/github.com/hashicorp/raft/raft.go @@ -0,0 +1,1348 @@ +package raft + +import ( + "bytes" + "container/list" + "fmt" + "io" + "time" + + "github.com/armon/go-metrics" +) + +const ( + minCheckInterval = 10 * time.Millisecond +) + +var ( + keyCurrentTerm = []byte("CurrentTerm") + keyLastVoteTerm = []byte("LastVoteTerm") + keyLastVoteCand = []byte("LastVoteCand") +) + +// getRPCHeader returns an initialized RPCHeader struct for the given +// Raft instance. This structure is sent along with RPC requests and +// responses. +func (r *Raft) getRPCHeader() RPCHeader { + return RPCHeader{ + ProtocolVersion: r.conf.ProtocolVersion, + } +} + +// checkRPCHeader houses logic about whether this instance of Raft can process +// the given RPC message. +func (r *Raft) checkRPCHeader(rpc RPC) error { + // Get the header off the RPC message. + wh, ok := rpc.Command.(WithRPCHeader) + if !ok { + return fmt.Errorf("RPC does not have a header") + } + header := wh.GetRPCHeader() + + // First check is to just make sure the code can understand the + // protocol at all. 
+ if header.ProtocolVersion < ProtocolVersionMin || + header.ProtocolVersion > ProtocolVersionMax { + return ErrUnsupportedProtocol + } + + // Second check is whether we should support this message, given the + // current protocol we are configured to run. This will drop support + // for protocol version 0 starting at protocol version 2, which is + // currently what we want, and in general support one version back. We + // may need to revisit this policy depending on how future protocol + // changes evolve. + if header.ProtocolVersion < r.conf.ProtocolVersion-1 { + return ErrUnsupportedProtocol + } + + return nil +} + +// getSnapshotVersion returns the snapshot version that should be used when +// creating snapshots, given the protocol version in use. +func getSnapshotVersion(protocolVersion ProtocolVersion) SnapshotVersion { + // Right now we only have two versions and they are backwards compatible + // so we don't need to look at the protocol version. + return 1 +} + +// commitTuple is used to send an index that was committed, +// with an optional associated future that should be invoked. +type commitTuple struct { + log *Log + future *logFuture +} + +// leaderState is state that is used while we are a leader. +type leaderState struct { + commitCh chan struct{} + commitment *commitment + inflight *list.List // list of logFuture in log index order + replState map[ServerID]*followerReplication + notify map[*verifyFuture]struct{} + stepDown chan struct{} +} + +// setLeader is used to modify the current leader of the cluster +func (r *Raft) setLeader(leader ServerAddress) { + r.leaderLock.Lock() + r.leader = leader + r.leaderLock.Unlock() +} + +// requestConfigChange is a helper for the above functions that make +// configuration change requests. 'req' describes the change. For timeout, +// see AddVoter. 
func (r *Raft) requestConfigChange(req configurationChangeRequest, timeout time.Duration) IndexFuture {
	// timer stays nil when no timeout is requested; receiving from a nil
	// channel blocks forever, so the timeout case below never fires then.
	var timer <-chan time.Time
	if timeout > 0 {
		timer = time.After(timeout)
	}
	future := &configurationChangeFuture{
		req: req,
	}
	future.init()
	// Enqueue the request, giving up on timeout or shutdown.
	select {
	case <-timer:
		return errorFuture{ErrEnqueueTimeout}
	case r.configurationChangeCh <- future:
		return future
	case <-r.shutdownCh:
		return errorFuture{ErrRaftShutdown}
	}
}

// run is a long running goroutine that runs the Raft FSM.
func (r *Raft) run() {
	for {
		// Check if we are doing a shutdown
		select {
		case <-r.shutdownCh:
			// Clear the leader to prevent forwarding
			r.setLeader("")
			return
		default:
		}

		// Enter into a sub-FSM. Each of these returns when the state
		// changes (or on shutdown), and the loop re-dispatches.
		switch r.getState() {
		case Follower:
			r.runFollower()
		case Candidate:
			r.runCandidate()
		case Leader:
			r.runLeader()
		}
	}
}

// runFollower runs the FSM for a follower.
func (r *Raft) runFollower() {
	// didWarn makes each "aborting election" warning below log only once
	// per call of runFollower.
	didWarn := false
	r.logger.Printf("[INFO] raft: %v entering Follower state (Leader: %q)", r, r.Leader())
	metrics.IncrCounter([]string{"raft", "state", "follower"}, 1)
	heartbeatTimer := randomTimeout(r.conf.HeartbeatTimeout)
	for {
		select {
		case rpc := <-r.rpcCh:
			r.processRPC(rpc)

		case c := <-r.configurationChangeCh:
			// Reject any operations since we are not the leader
			c.respond(ErrNotLeader)

		case a := <-r.applyCh:
			// Reject any operations since we are not the leader
			a.respond(ErrNotLeader)

		case v := <-r.verifyCh:
			// Reject any operations since we are not the leader
			v.respond(ErrNotLeader)

		case c := <-r.configurationsCh:
			// Answer with a copy of the current configurations.
			c.configurations = r.configurations.Clone()
			c.respond(nil)

		case b := <-r.bootstrapCh:
			b.respond(r.liveBootstrap(b.configuration))

		case <-heartbeatTimer:
			// Restart the heartbeat timer
			heartbeatTimer = randomTimeout(r.conf.HeartbeatTimeout)

			// Check if we have had a successful contact
			lastContact := r.LastContact()
			if time.Now().Sub(lastContact) < r.conf.HeartbeatTimeout {
				continue
			}

			// Heartbeat failed! Transition to the candidate state
			lastLeader := r.Leader()
			r.setLeader("")

			// Only start an election when we know of a configuration and,
			// if that configuration is committed, we are a voter in it.
			if r.configurations.latestIndex == 0 {
				if !didWarn {
					r.logger.Printf("[WARN] raft: no known peers, aborting election")
					didWarn = true
				}
			} else if r.configurations.latestIndex == r.configurations.committedIndex &&
				!hasVote(r.configurations.latest, r.localID) {
				if !didWarn {
					r.logger.Printf("[WARN] raft: not part of stable configuration, aborting election")
					didWarn = true
				}
			} else {
				r.logger.Printf(`[WARN] raft: Heartbeat timeout from %q reached, starting election`, lastLeader)
				metrics.IncrCounter([]string{"raft", "transition", "heartbeat_timeout"}, 1)
				r.setState(Candidate)
				return
			}

		case <-r.shutdownCh:
			return
		}
	}
}

// liveBootstrap attempts to seed an initial configuration for the cluster. See
// the Raft object's member BootstrapCluster for more details. This must only be
// called on the main thread, and only makes sense in the follower state.
func (r *Raft) liveBootstrap(configuration Configuration) error {
	// Use the pre-init API to make the static updates.
	err := BootstrapCluster(&r.conf, r.logs, r.stable, r.snapshots,
		r.trans, configuration)
	if err != nil {
		return err
	}

	// Make the configuration live.
	// NOTE(review): presumably BootstrapCluster wrote the configuration as
	// log entry 1, so failing to read it back is treated as fatal — confirm.
	var entry Log
	if err := r.logs.GetLog(1, &entry); err != nil {
		panic(err)
	}
	r.setCurrentTerm(1)
	r.setLastLog(entry.Index, entry.Term)
	r.processConfigurationLogEntry(&entry)
	return nil
}

// runCandidate runs the FSM for a candidate.
func (r *Raft) runCandidate() {
	r.logger.Printf("[INFO] raft: %v entering Candidate state in term %v",
		r, r.getCurrentTerm()+1)
	metrics.IncrCounter([]string{"raft", "state", "candidate"}, 1)

	// Start vote for us, and set a timeout
	voteCh := r.electSelf()
	electionTimer := randomTimeout(r.conf.ElectionTimeout)

	// Tally the votes, need a simple majority
	grantedVotes := 0
	votesNeeded := r.quorumSize()
	r.logger.Printf("[DEBUG] raft: Votes needed: %d", votesNeeded)

	// processRPC may change our state, so the loop condition is re-checked
	// after every event.
	for r.getState() == Candidate {
		select {
		case rpc := <-r.rpcCh:
			r.processRPC(rpc)

		case vote := <-voteCh:
			// Check if the term is greater than ours, bail
			if vote.Term > r.getCurrentTerm() {
				r.logger.Printf("[DEBUG] raft: Newer term discovered, fallback to follower")
				r.setState(Follower)
				r.setCurrentTerm(vote.Term)
				return
			}

			// Check if the vote is granted
			if vote.Granted {
				grantedVotes++
				r.logger.Printf("[DEBUG] raft: Vote granted from %s in term %v. Tally: %d",
					vote.voterID, vote.Term, grantedVotes)
			}

			// Check if we've become the leader
			if grantedVotes >= votesNeeded {
				r.logger.Printf("[INFO] raft: Election won. Tally: %d", grantedVotes)
				r.setState(Leader)
				r.setLeader(r.localAddr)
				return
			}

		case c := <-r.configurationChangeCh:
			// Reject any operations since we are not the leader
			c.respond(ErrNotLeader)

		case a := <-r.applyCh:
			// Reject any operations since we are not the leader
			a.respond(ErrNotLeader)

		case v := <-r.verifyCh:
			// Reject any operations since we are not the leader
			v.respond(ErrNotLeader)

		case c := <-r.configurationsCh:
			// Answer with a copy of the current configurations.
			c.configurations = r.configurations.Clone()
			c.respond(nil)

		case b := <-r.bootstrapCh:
			// Unlike the follower state, bootstrap is refused here.
			b.respond(ErrCantBootstrap)

		case <-electionTimer:
			// Election failed! Restart the election. We simply return,
			// which will kick us back into runCandidate
			r.logger.Printf("[WARN] raft: Election timeout reached, restarting election")
			return

		case <-r.shutdownCh:
			return
		}
	}
}

// runLeader runs the FSM for a leader. Do the setup here and drop into
// the leaderLoop for the hot loop.
func (r *Raft) runLeader() {
	r.logger.Printf("[INFO] raft: %v entering Leader state", r)
	metrics.IncrCounter([]string{"raft", "state", "leader"}, 1)

	// Notify that we are the leader
	asyncNotifyBool(r.leaderCh, true)

	// Push to the notify channel if given. This blocks until the value is
	// received or shutdown is signalled.
	if notify := r.conf.NotifyCh; notify != nil {
		select {
		case notify <- true:
		case <-r.shutdownCh:
		}
	}

	// Setup leader state
	r.leaderState.commitCh = make(chan struct{}, 1)
	r.leaderState.commitment = newCommitment(r.leaderState.commitCh,
		r.configurations.latest,
		r.getLastIndex()+1 /* first index that may be committed in this term */)
	r.leaderState.inflight = list.New()
	r.leaderState.replState = make(map[ServerID]*followerReplication)
	r.leaderState.notify = make(map[*verifyFuture]struct{})
	r.leaderState.stepDown = make(chan struct{}, 1)

	// Cleanup state on step down. This runs when runLeader returns, i.e.
	// after leaderLoop below has exited.
	defer func() {
		// Since we were the leader previously, we update our
		// last contact time when we step down, so that we are not
		// reporting a last contact time from before we were the
		// leader. Otherwise, to a client it would seem our data
		// is extremely stale.
		r.setLastContact()

		// Stop replication
		for _, p := range r.leaderState.replState {
			close(p.stopCh)
		}

		// Respond to all inflight operations
		for e := r.leaderState.inflight.Front(); e != nil; e = e.Next() {
			e.Value.(*logFuture).respond(ErrLeadershipLost)
		}

		// Respond to any pending verify requests
		for future := range r.leaderState.notify {
			future.respond(ErrLeadershipLost)
		}

		// Clear all the state
		r.leaderState.commitCh = nil
		r.leaderState.commitment = nil
		r.leaderState.inflight = nil
		r.leaderState.replState = nil
		r.leaderState.notify = nil
		r.leaderState.stepDown = nil

		// If we are stepping down for some reason, no known leader.
		// We may have stepped down due to an RPC call, which would
		// provide the leader, so we cannot always blank this out.
		r.leaderLock.Lock()
		if r.leader == r.localAddr {
			r.leader = ""
		}
		r.leaderLock.Unlock()

		// Notify that we are not the leader
		asyncNotifyBool(r.leaderCh, false)

		// Push to the notify channel if given
		if notify := r.conf.NotifyCh; notify != nil {
			select {
			case notify <- false:
			case <-r.shutdownCh:
				// On shutdown, make a best effort but do not block
				select {
				case notify <- false:
				default:
				}
			}
		}
	}()

	// Start a replication routine for each peer
	r.startStopReplication()

	// Dispatch a no-op log entry first. This gets this leader up to the latest
	// possible commit index, even in the absence of client commands. This used
	// to append a configuration entry instead of a noop. However, that permits
	// an unbounded number of uncommitted configurations in the log. We now
	// maintain that there exists at most one uncommitted configuration entry in
	// any log, so we have to do proper no-ops here.
	noop := &logFuture{
		log: Log{
			Type: LogNoop,
		},
	}
	r.dispatchLogs([]*logFuture{noop})

	// Sit in the leader loop until we step down
	r.leaderLoop()
}

// startStopReplication will set up state and start asynchronous replication to
// new peers, and stop replication to removed peers. Before removing a peer,
// it'll instruct the replication routines to try to replicate to the current
// index. This must only be called from the main thread.
func (r *Raft) startStopReplication() {
	// inConfig records which servers (other than ourselves) are in the
	// latest configuration, for the removal pass below.
	inConfig := make(map[ServerID]bool, len(r.configurations.latest.Servers))
	lastIdx := r.getLastIndex()

	// Start replication goroutines that need starting
	for _, server := range r.configurations.latest.Servers {
		// We never replicate to ourselves.
		if server.ID == r.localID {
			continue
		}
		inConfig[server.ID] = true
		if _, ok := r.leaderState.replState[server.ID]; !ok {
			r.logger.Printf("[INFO] raft: Added peer %v, starting replication", server.ID)
			s := &followerReplication{
				peer:        server,
				commitment:  r.leaderState.commitment,
				stopCh:      make(chan uint64, 1),
				triggerCh:   make(chan struct{}, 1),
				currentTerm: r.getCurrentTerm(),
				nextIndex:   lastIdx + 1,
				lastContact: time.Now(),
				notifyCh:    make(chan struct{}, 1),
				stepDown:    r.leaderState.stepDown,
			}
			r.leaderState.replState[server.ID] = s
			r.goFunc(func() { r.replicate(s) })
			asyncNotifyCh(s.triggerCh)
		}
	}

	// Stop replication goroutines that need stopping
	for serverID, repl := range r.leaderState.replState {
		if inConfig[serverID] {
			continue
		}
		// Replicate up to lastIdx and stop. stopCh is created with
		// capacity 1 above, presumably so this send does not block before
		// the channel is closed.
		r.logger.Printf("[INFO] raft: Removed peer %v, stopping replication after %v", serverID, lastIdx)
		repl.stopCh <- lastIdx
		close(repl.stopCh)
		delete(r.leaderState.replState, serverID)
	}
}

// configurationChangeChIfStable returns r.configurationChangeCh if it's safe
// to process requests from it, or nil otherwise. This must only be called
// from the main thread.
+// +// Note that if the conditions here were to change outside of leaderLoop to take +// this from nil to non-nil, we would need leaderLoop to be kicked. +func (r *Raft) configurationChangeChIfStable() chan *configurationChangeFuture { + // Have to wait until: + // 1. The latest configuration is committed, and + // 2. This leader has committed some entry (the noop) in this term + // https://groups.google.com/forum/#!msg/raft-dev/t4xj6dJTP6E/d2D9LrWRza8J + if r.configurations.latestIndex == r.configurations.committedIndex && + r.getCommitIndex() >= r.leaderState.commitment.startIndex { + return r.configurationChangeCh + } + return nil +} + +// leaderLoop is the hot loop for a leader. It is invoked +// after all the various leader setup is done. +func (r *Raft) leaderLoop() { + // stepDown is used to track if there is an inflight log that + // would cause us to lose leadership (specifically a RemovePeer of + // ourselves). If this is the case, we must not allow any logs to + // be processed in parallel, otherwise we are basing commit on + // only a single peer (ourself) and replicating to an undefined set + // of peers. 
+ stepDown := false + + lease := time.After(r.conf.LeaderLeaseTimeout) + for r.getState() == Leader { + select { + case rpc := <-r.rpcCh: + r.processRPC(rpc) + + case <-r.leaderState.stepDown: + r.setState(Follower) + + case <-r.leaderState.commitCh: + // Process the newly committed entries + oldCommitIndex := r.getCommitIndex() + commitIndex := r.leaderState.commitment.getCommitIndex() + r.setCommitIndex(commitIndex) + + if r.configurations.latestIndex > oldCommitIndex && + r.configurations.latestIndex <= commitIndex { + r.configurations.committed = r.configurations.latest + r.configurations.committedIndex = r.configurations.latestIndex + if !hasVote(r.configurations.committed, r.localID) { + stepDown = true + } + } + + for { + e := r.leaderState.inflight.Front() + if e == nil { + break + } + commitLog := e.Value.(*logFuture) + idx := commitLog.log.Index + if idx > commitIndex { + break + } + // Measure the commit time + metrics.MeasureSince([]string{"raft", "commitTime"}, commitLog.dispatch) + r.processLogs(idx, commitLog) + r.leaderState.inflight.Remove(e) + } + + if stepDown { + if r.conf.ShutdownOnRemove { + r.logger.Printf("[INFO] raft: Removed ourself, shutting down") + r.Shutdown() + } else { + r.logger.Printf("[INFO] raft: Removed ourself, transitioning to follower") + r.setState(Follower) + } + } + + case v := <-r.verifyCh: + if v.quorumSize == 0 { + // Just dispatched, start the verification + r.verifyLeader(v) + + } else if v.votes < v.quorumSize { + // Early return, means there must be a new leader + r.logger.Printf("[WARN] raft: New leader elected, stepping down") + r.setState(Follower) + delete(r.leaderState.notify, v) + v.respond(ErrNotLeader) + + } else { + // Quorum of members agree, we are still leader + delete(r.leaderState.notify, v) + v.respond(nil) + } + + case c := <-r.configurationsCh: + c.configurations = r.configurations.Clone() + c.respond(nil) + + case future := <-r.configurationChangeChIfStable(): + r.appendConfigurationEntry(future) + 
+ case b := <-r.bootstrapCh: + b.respond(ErrCantBootstrap) + + case newLog := <-r.applyCh: + // Group commit, gather all the ready commits + ready := []*logFuture{newLog} + for i := 0; i < r.conf.MaxAppendEntries; i++ { + select { + case newLog := <-r.applyCh: + ready = append(ready, newLog) + default: + break + } + } + + // Dispatch the logs + if stepDown { + // we're in the process of stepping down as leader, don't process anything new + for i := range ready { + ready[i].respond(ErrNotLeader) + } + } else { + r.dispatchLogs(ready) + } + + case <-lease: + // Check if we've exceeded the lease, potentially stepping down + maxDiff := r.checkLeaderLease() + + // Next check interval should adjust for the last node we've + // contacted, without going negative + checkInterval := r.conf.LeaderLeaseTimeout - maxDiff + if checkInterval < minCheckInterval { + checkInterval = minCheckInterval + } + + // Renew the lease timer + lease = time.After(checkInterval) + + case <-r.shutdownCh: + return + } + } +} + +// verifyLeader must be called from the main thread for safety. +// Causes the followers to attempt an immediate heartbeat. +func (r *Raft) verifyLeader(v *verifyFuture) { + // Current leader always votes for self + v.votes = 1 + + // Set the quorum size, hot-path for single node + v.quorumSize = r.quorumSize() + if v.quorumSize == 1 { + v.respond(nil) + return + } + + // Track this request + v.notifyCh = r.verifyCh + r.leaderState.notify[v] = struct{}{} + + // Trigger immediate heartbeats + for _, repl := range r.leaderState.replState { + repl.notifyLock.Lock() + repl.notify = append(repl.notify, v) + repl.notifyLock.Unlock() + asyncNotifyCh(repl.notifyCh) + } +} + +// checkLeaderLease is used to check if we can contact a quorum of nodes +// within the last leader lease interval. If not, we need to step down, +// as we may have lost connectivity. Returns the maximum duration without +// contact. This must only be called from the main thread. 
+func (r *Raft) checkLeaderLease() time.Duration { + // Track contacted nodes, we can always contact ourself + contacted := 1 + + // Check each follower + var maxDiff time.Duration + now := time.Now() + for peer, f := range r.leaderState.replState { + diff := now.Sub(f.LastContact()) + if diff <= r.conf.LeaderLeaseTimeout { + contacted++ + if diff > maxDiff { + maxDiff = diff + } + } else { + // Log at least once at high value, then debug. Otherwise it gets very verbose. + if diff <= 3*r.conf.LeaderLeaseTimeout { + r.logger.Printf("[WARN] raft: Failed to contact %v in %v", peer, diff) + } else { + r.logger.Printf("[DEBUG] raft: Failed to contact %v in %v", peer, diff) + } + } + metrics.AddSample([]string{"raft", "leader", "lastContact"}, float32(diff/time.Millisecond)) + } + + // Verify we can contact a quorum + quorum := r.quorumSize() + if contacted < quorum { + r.logger.Printf("[WARN] raft: Failed to contact quorum of nodes, stepping down") + r.setState(Follower) + metrics.IncrCounter([]string{"raft", "transition", "leader_lease_timeout"}, 1) + } + return maxDiff +} + +// quorumSize is used to return the quorum size. This must only be called on +// the main thread. +// TODO: revisit usage +func (r *Raft) quorumSize() int { + voters := 0 + for _, server := range r.configurations.latest.Servers { + if server.Suffrage == Voter { + voters++ + } + } + return voters/2 + 1 +} + +// appendConfigurationEntry changes the configuration and adds a new +// configuration entry to the log. This must only be called from the +// main thread. 
+func (r *Raft) appendConfigurationEntry(future *configurationChangeFuture) { + configuration, err := nextConfiguration(r.configurations.latest, r.configurations.latestIndex, future.req) + if err != nil { + future.respond(err) + return + } + + r.logger.Printf("[INFO] raft: Updating configuration with %s (%v, %v) to %+v", + future.req.command, future.req.serverID, future.req.serverAddress, configuration.Servers) + + // In pre-ID compatibility mode we translate all configuration changes + // in to an old remove peer message, which can handle all supported + // cases for peer changes in the pre-ID world (adding and removing + // voters). Both add peer and remove peer log entries are handled + // similarly on old Raft servers, but remove peer does extra checks to + // see if a leader needs to step down. Since they both assert the full + // configuration, then we can safely call remove peer for everything. + if r.protocolVersion < 2 { + future.log = Log{ + Type: LogRemovePeerDeprecated, + Data: encodePeers(configuration, r.trans), + } + } else { + future.log = Log{ + Type: LogConfiguration, + Data: encodeConfiguration(configuration), + } + } + + r.dispatchLogs([]*logFuture{&future.logFuture}) + index := future.Index() + r.configurations.latest = configuration + r.configurations.latestIndex = index + r.leaderState.commitment.setConfiguration(configuration) + r.startStopReplication() +} + +// dispatchLog is called on the leader to push a log to disk, mark it +// as inflight and begin replication of it. 
+func (r *Raft) dispatchLogs(applyLogs []*logFuture) { + now := time.Now() + defer metrics.MeasureSince([]string{"raft", "leader", "dispatchLog"}, now) + + term := r.getCurrentTerm() + lastIndex := r.getLastIndex() + logs := make([]*Log, len(applyLogs)) + + for idx, applyLog := range applyLogs { + applyLog.dispatch = now + lastIndex++ + applyLog.log.Index = lastIndex + applyLog.log.Term = term + logs[idx] = &applyLog.log + r.leaderState.inflight.PushBack(applyLog) + } + + // Write the log entry locally + if err := r.logs.StoreLogs(logs); err != nil { + r.logger.Printf("[ERR] raft: Failed to commit logs: %v", err) + for _, applyLog := range applyLogs { + applyLog.respond(err) + } + r.setState(Follower) + return + } + r.leaderState.commitment.match(r.localID, lastIndex) + + // Update the last log since it's on disk now + r.setLastLog(lastIndex, term) + + // Notify the replicators of the new log + for _, f := range r.leaderState.replState { + asyncNotifyCh(f.triggerCh) + } +} + +// processLogs is used to apply all the committed entires that haven't been +// applied up to the given index limit. +// This can be called from both leaders and followers. +// Followers call this from AppendEntires, for n entires at a time, and always +// pass future=nil. +// Leaders call this once per inflight when entries are committed. They pass +// the future from inflights. 
+func (r *Raft) processLogs(index uint64, future *logFuture) { + // Reject logs we've applied already + lastApplied := r.getLastApplied() + if index <= lastApplied { + r.logger.Printf("[WARN] raft: Skipping application of old log: %d", index) + return + } + + // Apply all the preceding logs + for idx := r.getLastApplied() + 1; idx <= index; idx++ { + // Get the log, either from the future or from our log store + if future != nil && future.log.Index == idx { + r.processLog(&future.log, future) + + } else { + l := new(Log) + if err := r.logs.GetLog(idx, l); err != nil { + r.logger.Printf("[ERR] raft: Failed to get log at %d: %v", idx, err) + panic(err) + } + r.processLog(l, nil) + } + + // Update the lastApplied index and term + r.setLastApplied(idx) + } +} + +// processLog is invoked to process the application of a single committed log entry. +func (r *Raft) processLog(l *Log, future *logFuture) { + switch l.Type { + case LogBarrier: + // Barrier is handled by the FSM + fallthrough + + case LogCommand: + // Forward to the fsm handler + select { + case r.fsmCommitCh <- commitTuple{l, future}: + case <-r.shutdownCh: + if future != nil { + future.respond(ErrRaftShutdown) + } + } + + // Return so that the future is only responded to + // by the FSM handler when the application is done + return + + case LogConfiguration: + case LogAddPeerDeprecated: + case LogRemovePeerDeprecated: + case LogNoop: + // Ignore the no-op + + default: + panic(fmt.Errorf("unrecognized log type: %#v", l)) + } + + // Invoke the future if given + if future != nil { + future.respond(nil) + } +} + +// processRPC is called to handle an incoming RPC request. This must only be +// called from the main thread. 
+func (r *Raft) processRPC(rpc RPC) { + if err := r.checkRPCHeader(rpc); err != nil { + rpc.Respond(nil, err) + return + } + + switch cmd := rpc.Command.(type) { + case *AppendEntriesRequest: + r.appendEntries(rpc, cmd) + case *RequestVoteRequest: + r.requestVote(rpc, cmd) + case *InstallSnapshotRequest: + r.installSnapshot(rpc, cmd) + default: + r.logger.Printf("[ERR] raft: Got unexpected command: %#v", rpc.Command) + rpc.Respond(nil, fmt.Errorf("unexpected command")) + } +} + +// processHeartbeat is a special handler used just for heartbeat requests +// so that they can be fast-pathed if a transport supports it. This must only +// be called from the main thread. +func (r *Raft) processHeartbeat(rpc RPC) { + defer metrics.MeasureSince([]string{"raft", "rpc", "processHeartbeat"}, time.Now()) + + // Check if we are shutdown, just ignore the RPC + select { + case <-r.shutdownCh: + return + default: + } + + // Ensure we are only handling a heartbeat + switch cmd := rpc.Command.(type) { + case *AppendEntriesRequest: + r.appendEntries(rpc, cmd) + default: + r.logger.Printf("[ERR] raft: Expected heartbeat, got command: %#v", rpc.Command) + rpc.Respond(nil, fmt.Errorf("unexpected command")) + } +} + +// appendEntries is invoked when we get an append entries RPC call. This must +// only be called from the main thread. 
func (r *Raft) appendEntries(rpc RPC, a *AppendEntriesRequest) {
	defer metrics.MeasureSince([]string{"raft", "rpc", "appendEntries"}, time.Now())
	// Setup a response. It starts out as a rejection and is only flipped to
	// success at the very end; the deferred Respond below sends whatever
	// state resp/rpcErr are in when the function returns.
	resp := &AppendEntriesResponse{
		RPCHeader:      r.getRPCHeader(),
		Term:           r.getCurrentTerm(),
		LastLog:        r.getLastIndex(),
		Success:        false,
		NoRetryBackoff: false,
	}
	var rpcErr error
	defer func() {
		rpc.Respond(resp, rpcErr)
	}()

	// Ignore an older term
	if a.Term < r.getCurrentTerm() {
		return
	}

	// Increase the term if we see a newer one, also transition to follower
	// if we ever get an appendEntries call
	if a.Term > r.getCurrentTerm() || r.getState() != Follower {
		// Ensure transition to follower
		r.setState(Follower)
		r.setCurrentTerm(a.Term)
		resp.Term = a.Term
	}

	// Save the current leader. This must come after setState above, because
	// setState clears the known leader.
	r.setLeader(ServerAddress(r.trans.DecodePeer(a.Leader)))

	// Verify the last log entry: the entry preceding the new ones must exist
	// locally with the same term the leader recorded for it.
	if a.PrevLogEntry > 0 {
		lastIdx, lastTerm := r.getLastEntry()

		var prevLogTerm uint64
		if a.PrevLogEntry == lastIdx {
			// Term of our last entry is already known, no store lookup needed.
			prevLogTerm = lastTerm

		} else {
			var prevLog Log
			if err := r.logs.GetLog(a.PrevLogEntry, &prevLog); err != nil {
				r.logger.Printf("[WARN] raft: Failed to get previous log: %d %v (last: %d)",
					a.PrevLogEntry, err, lastIdx)
				// Tell the sender not to count this rejection towards its
				// retry backoff.
				resp.NoRetryBackoff = true
				return
			}
			prevLogTerm = prevLog.Term
		}

		if a.PrevLogTerm != prevLogTerm {
			r.logger.Printf("[WARN] raft: Previous log term mis-match: ours: %d remote: %d",
				prevLogTerm, a.PrevLogTerm)
			resp.NoRetryBackoff = true
			return
		}
	}

	// Process any new entries
	if len(a.Entries) > 0 {
		start := time.Now()

		// Delete any conflicting entries, skip any duplicates
		lastLogIdx, _ := r.getLastLog()
		var newEntries []*Log
		for i, entry := range a.Entries {
			// Everything from here on is beyond our log, so it is all new.
			if entry.Index > lastLogIdx {
				newEntries = a.Entries[i:]
				break
			}
			var storeEntry Log
			if err := r.logs.GetLog(entry.Index, &storeEntry); err != nil {
				r.logger.Printf("[WARN] raft: Failed to get log entry %d: %v",
					entry.Index, err)
				return
			}
			// Same index but a different term: our suffix conflicts with the
			// leader's log and is removed before appending.
			if entry.Term != storeEntry.Term {
				r.logger.Printf("[WARN] raft: Clearing log suffix from %d to %d", entry.Index, lastLogIdx)
				if err := r.logs.DeleteRange(entry.Index, lastLogIdx); err != nil {
					r.logger.Printf("[ERR] raft: Failed to clear log suffix: %v", err)
					return
				}
				// The latest configuration entry was part of the removed
				// suffix, so fall back to the committed configuration.
				if entry.Index <= r.configurations.latestIndex {
					r.configurations.latest = r.configurations.committed
					r.configurations.latestIndex = r.configurations.committedIndex
				}
				newEntries = a.Entries[i:]
				break
			}
		}

		if n := len(newEntries); n > 0 {
			// Append the new entries
			if err := r.logs.StoreLogs(newEntries); err != nil {
				r.logger.Printf("[ERR] raft: Failed to append to logs: %v", err)
				// TODO: leaving r.getLastLog() in the wrong
				// state if there was a truncation above
				return
			}

			// Handle any new configuration changes
			for _, newEntry := range newEntries {
				r.processConfigurationLogEntry(newEntry)
			}

			// Update the lastLog
			last := newEntries[n-1]
			r.setLastLog(last.Index, last.Term)
		}

		metrics.MeasureSince([]string{"raft", "rpc", "appendEntries", "storeLogs"}, start)
	}

	// Update the commit index
	if a.LeaderCommitIndex > 0 && a.LeaderCommitIndex > r.getCommitIndex() {
		start := time.Now()
		// Never commit past what we actually hold locally.
		idx := min(a.LeaderCommitIndex, r.getLastIndex())
		r.setCommitIndex(idx)
		// The latest configuration is covered by the new commit index, so it
		// becomes the committed configuration.
		if r.configurations.latestIndex <= idx {
			r.configurations.committed = r.configurations.latest
			r.configurations.committedIndex = r.configurations.latestIndex
		}
		r.processLogs(idx, nil)
		metrics.MeasureSince([]string{"raft", "rpc", "appendEntries", "processLogs"}, start)
	}

	// Everything went well, set success
	resp.Success = true
	r.setLastContact()
	return
}

// processConfigurationLogEntry takes a log entry and updates the latest
// configuration if the entry results in a new configuration. This must only be
// called from the main thread, or from NewRaft() before any threads have begun.
+func (r *Raft) processConfigurationLogEntry(entry *Log) { + if entry.Type == LogConfiguration { + r.configurations.committed = r.configurations.latest + r.configurations.committedIndex = r.configurations.latestIndex + r.configurations.latest = decodeConfiguration(entry.Data) + r.configurations.latestIndex = entry.Index + } else if entry.Type == LogAddPeerDeprecated || entry.Type == LogRemovePeerDeprecated { + r.configurations.committed = r.configurations.latest + r.configurations.committedIndex = r.configurations.latestIndex + r.configurations.latest = decodePeers(entry.Data, r.trans) + r.configurations.latestIndex = entry.Index + } +} + +// requestVote is invoked when we get an request vote RPC call. +func (r *Raft) requestVote(rpc RPC, req *RequestVoteRequest) { + defer metrics.MeasureSince([]string{"raft", "rpc", "requestVote"}, time.Now()) + r.observe(*req) + + // Setup a response + resp := &RequestVoteResponse{ + RPCHeader: r.getRPCHeader(), + Term: r.getCurrentTerm(), + Granted: false, + } + var rpcErr error + defer func() { + rpc.Respond(resp, rpcErr) + }() + + // Version 0 servers will panic unless the peers is present. It's only + // used on them to produce a warning message. 
+ if r.protocolVersion < 2 { + resp.Peers = encodePeers(r.configurations.latest, r.trans) + } + + // Check if we have an existing leader [who's not the candidate] + candidate := r.trans.DecodePeer(req.Candidate) + if leader := r.Leader(); leader != "" && leader != candidate { + r.logger.Printf("[WARN] raft: Rejecting vote request from %v since we have a leader: %v", + candidate, leader) + return + } + + // Ignore an older term + if req.Term < r.getCurrentTerm() { + return + } + + // Increase the term if we see a newer one + if req.Term > r.getCurrentTerm() { + // Ensure transition to follower + r.setState(Follower) + r.setCurrentTerm(req.Term) + resp.Term = req.Term + } + + // Check if we have voted yet + lastVoteTerm, err := r.stable.GetUint64(keyLastVoteTerm) + if err != nil && err.Error() != "not found" { + r.logger.Printf("[ERR] raft: Failed to get last vote term: %v", err) + return + } + lastVoteCandBytes, err := r.stable.Get(keyLastVoteCand) + if err != nil && err.Error() != "not found" { + r.logger.Printf("[ERR] raft: Failed to get last vote candidate: %v", err) + return + } + + // Check if we've voted in this election before + if lastVoteTerm == req.Term && lastVoteCandBytes != nil { + r.logger.Printf("[INFO] raft: Duplicate RequestVote for same term: %d", req.Term) + if bytes.Compare(lastVoteCandBytes, req.Candidate) == 0 { + r.logger.Printf("[WARN] raft: Duplicate RequestVote from candidate: %s", req.Candidate) + resp.Granted = true + } + return + } + + // Reject if their term is older + lastIdx, lastTerm := r.getLastEntry() + if lastTerm > req.LastLogTerm { + r.logger.Printf("[WARN] raft: Rejecting vote request from %v since our last term is greater (%d, %d)", + candidate, lastTerm, req.LastLogTerm) + return + } + + if lastTerm == req.LastLogTerm && lastIdx > req.LastLogIndex { + r.logger.Printf("[WARN] raft: Rejecting vote request from %v since our last index is greater (%d, %d)", + candidate, lastIdx, req.LastLogIndex) + return + } + + // Persist a 
vote for safety + if err := r.persistVote(req.Term, req.Candidate); err != nil { + r.logger.Printf("[ERR] raft: Failed to persist vote: %v", err) + return + } + + resp.Granted = true + r.setLastContact() + return +} + +// installSnapshot is invoked when we get a InstallSnapshot RPC call. +// We must be in the follower state for this, since it means we are +// too far behind a leader for log replay. This must only be called +// from the main thread. +func (r *Raft) installSnapshot(rpc RPC, req *InstallSnapshotRequest) { + defer metrics.MeasureSince([]string{"raft", "rpc", "installSnapshot"}, time.Now()) + // Setup a response + resp := &InstallSnapshotResponse{ + Term: r.getCurrentTerm(), + Success: false, + } + var rpcErr error + defer func() { + rpc.Respond(resp, rpcErr) + }() + + // Sanity check the version + if req.SnapshotVersion < SnapshotVersionMin || + req.SnapshotVersion > SnapshotVersionMax { + rpcErr = fmt.Errorf("unsupported snapshot version %d", req.SnapshotVersion) + return + } + + // Ignore an older term + if req.Term < r.getCurrentTerm() { + return + } + + // Increase the term if we see a newer one + if req.Term > r.getCurrentTerm() { + // Ensure transition to follower + r.setState(Follower) + r.setCurrentTerm(req.Term) + resp.Term = req.Term + } + + // Save the current leader + r.setLeader(ServerAddress(r.trans.DecodePeer(req.Leader))) + + // Create a new snapshot + var reqConfiguration Configuration + var reqConfigurationIndex uint64 + if req.SnapshotVersion > 0 { + reqConfiguration = decodeConfiguration(req.Configuration) + reqConfigurationIndex = req.ConfigurationIndex + } else { + reqConfiguration = decodePeers(req.Peers, r.trans) + reqConfigurationIndex = req.LastLogIndex + } + version := getSnapshotVersion(r.protocolVersion) + sink, err := r.snapshots.Create(version, req.LastLogIndex, req.LastLogTerm, + reqConfiguration, reqConfigurationIndex, r.trans) + if err != nil { + r.logger.Printf("[ERR] raft: Failed to create snapshot to install: %v", 
err) + rpcErr = fmt.Errorf("failed to create snapshot: %v", err) + return + } + + // Spill the remote snapshot to disk + n, err := io.Copy(sink, rpc.Reader) + if err != nil { + sink.Cancel() + r.logger.Printf("[ERR] raft: Failed to copy snapshot: %v", err) + rpcErr = err + return + } + + // Check that we received it all + if n != req.Size { + sink.Cancel() + r.logger.Printf("[ERR] raft: Failed to receive whole snapshot: %d / %d", n, req.Size) + rpcErr = fmt.Errorf("short read") + return + } + + // Finalize the snapshot + if err := sink.Close(); err != nil { + r.logger.Printf("[ERR] raft: Failed to finalize snapshot: %v", err) + rpcErr = err + return + } + r.logger.Printf("[INFO] raft: Copied %d bytes to local snapshot", n) + + // Restore snapshot + future := &restoreFuture{ID: sink.ID()} + future.init() + select { + case r.fsmRestoreCh <- future: + case <-r.shutdownCh: + future.respond(ErrRaftShutdown) + return + } + + // Wait for the restore to happen + if err := future.Error(); err != nil { + r.logger.Printf("[ERR] raft: Failed to restore snapshot: %v", err) + rpcErr = err + return + } + + // Update the lastApplied so we don't replay old logs + r.setLastApplied(req.LastLogIndex) + + // Update the last stable snapshot info + r.setLastSnapshot(req.LastLogIndex, req.LastLogTerm) + + // Restore the peer set + r.configurations.latest = reqConfiguration + r.configurations.latestIndex = reqConfigurationIndex + r.configurations.committed = reqConfiguration + r.configurations.committedIndex = reqConfigurationIndex + + // Compact logs, continue even if this fails + if err := r.compactLogs(req.LastLogIndex); err != nil { + r.logger.Printf("[ERR] raft: Failed to compact logs: %v", err) + } + + r.logger.Printf("[INFO] raft: Installed remote snapshot") + resp.Success = true + r.setLastContact() + return +} + +// setLastContact is used to set the last contact time to now +func (r *Raft) setLastContact() { + r.lastContactLock.Lock() + r.lastContact = time.Now() + 
r.lastContactLock.Unlock() +} + +type voteResult struct { + RequestVoteResponse + voterID ServerID +} + +// electSelf is used to send a RequestVote RPC to all peers, and vote for +// ourself. This has the side affecting of incrementing the current term. The +// response channel returned is used to wait for all the responses (including a +// vote for ourself). This must only be called from the main thread. +func (r *Raft) electSelf() <-chan *voteResult { + // Create a response channel + respCh := make(chan *voteResult, len(r.configurations.latest.Servers)) + + // Increment the term + r.setCurrentTerm(r.getCurrentTerm() + 1) + + // Construct the request + lastIdx, lastTerm := r.getLastEntry() + req := &RequestVoteRequest{ + RPCHeader: r.getRPCHeader(), + Term: r.getCurrentTerm(), + Candidate: r.trans.EncodePeer(r.localAddr), + LastLogIndex: lastIdx, + LastLogTerm: lastTerm, + } + + // Construct a function to ask for a vote + askPeer := func(peer Server) { + r.goFunc(func() { + defer metrics.MeasureSince([]string{"raft", "candidate", "electSelf"}, time.Now()) + resp := &voteResult{voterID: peer.ID} + err := r.trans.RequestVote(peer.Address, req, &resp.RequestVoteResponse) + if err != nil { + r.logger.Printf("[ERR] raft: Failed to make RequestVote RPC to %v: %v", peer, err) + resp.Term = req.Term + resp.Granted = false + } + respCh <- resp + }) + } + + // For each peer, request a vote + for _, server := range r.configurations.latest.Servers { + if server.Suffrage == Voter { + if server.ID == r.localID { + // Persist a vote for ourselves + if err := r.persistVote(req.Term, req.Candidate); err != nil { + r.logger.Printf("[ERR] raft: Failed to persist vote : %v", err) + return nil + } + // Include our own vote + respCh <- &voteResult{ + RequestVoteResponse: RequestVoteResponse{ + RPCHeader: r.getRPCHeader(), + Term: req.Term, + Granted: true, + }, + voterID: r.localID, + } + } else { + askPeer(server) + } + } + } + + return respCh +} + +// persistVote is used to persist 
our vote for safety. +func (r *Raft) persistVote(term uint64, candidate []byte) error { + if err := r.stable.SetUint64(keyLastVoteTerm, term); err != nil { + return err + } + if err := r.stable.Set(keyLastVoteCand, candidate); err != nil { + return err + } + return nil +} + +// setCurrentTerm is used to set the current term in a durable manner. +func (r *Raft) setCurrentTerm(t uint64) { + // Persist to disk first + if err := r.stable.SetUint64(keyCurrentTerm, t); err != nil { + panic(fmt.Errorf("failed to save current term: %v", err)) + } + r.raftState.setCurrentTerm(t) +} + +// setState is used to update the current state. Any state +// transition causes the known leader to be cleared. This means +// that leader should be set only after updating the state. +func (r *Raft) setState(state RaftState) { + r.setLeader("") + oldState := r.raftState.getState() + r.raftState.setState(state) + if oldState != state { + r.observe(state) + } +} diff --git a/vendor/github.com/hashicorp/consul/vendor/github.com/hashicorp/raft/replication.go b/vendor/github.com/hashicorp/consul/vendor/github.com/hashicorp/raft/replication.go new file mode 100644 index 00000000000..68392734397 --- /dev/null +++ b/vendor/github.com/hashicorp/consul/vendor/github.com/hashicorp/raft/replication.go @@ -0,0 +1,561 @@ +package raft + +import ( + "errors" + "fmt" + "sync" + "time" + + "github.com/armon/go-metrics" +) + +const ( + maxFailureScale = 12 + failureWait = 10 * time.Millisecond +) + +var ( + // ErrLogNotFound indicates a given log entry is not available. + ErrLogNotFound = errors.New("log not found") + + // ErrPipelineReplicationNotSupported can be returned by the transport to + // signal that pipeline replication is not supported in general, and that + // no error message should be produced. 
+ ErrPipelineReplicationNotSupported = errors.New("pipeline replication not supported") +) + +// followerReplication is in charge of sending snapshots and log entries from +// this leader during this particular term to a remote follower. +type followerReplication struct { + // peer contains the network address and ID of the remote follower. + peer Server + + // commitment tracks the entries acknowledged by followers so that the + // leader's commit index can advance. It is updated on successsful + // AppendEntries responses. + commitment *commitment + + // stopCh is notified/closed when this leader steps down or the follower is + // removed from the cluster. In the follower removed case, it carries a log + // index; replication should be attempted with a best effort up through that + // index, before exiting. + stopCh chan uint64 + // triggerCh is notified every time new entries are appended to the log. + triggerCh chan struct{} + + // currentTerm is the term of this leader, to be included in AppendEntries + // requests. + currentTerm uint64 + // nextIndex is the index of the next log entry to send to the follower, + // which may fall past the end of the log. + nextIndex uint64 + + // lastContact is updated to the current time whenever any response is + // received from the follower (successful or not). This is used to check + // whether the leader should step down (Raft.checkLeaderLease()). + lastContact time.Time + // lastContactLock protects 'lastContact'. + lastContactLock sync.RWMutex + + // failures counts the number of failed RPCs since the last success, which is + // used to apply backoff. + failures uint64 + + // notifyCh is notified to send out a heartbeat, which is used to check that + // this server is still leader. + notifyCh chan struct{} + // notify is a list of futures to be resolved upon receipt of an + // acknowledgement, then cleared from this list. + notify []*verifyFuture + // notifyLock protects 'notify'. 
+ notifyLock sync.Mutex + + // stepDown is used to indicate to the leader that we + // should step down based on information from a follower. + stepDown chan struct{} + + // allowPipeline is used to determine when to pipeline the AppendEntries RPCs. + // It is private to this replication goroutine. + allowPipeline bool +} + +// notifyAll is used to notify all the waiting verify futures +// if the follower believes we are still the leader. +func (s *followerReplication) notifyAll(leader bool) { + // Clear the waiting notifies minimizing lock time + s.notifyLock.Lock() + n := s.notify + s.notify = nil + s.notifyLock.Unlock() + + // Submit our votes + for _, v := range n { + v.vote(leader) + } +} + +// LastContact returns the time of last contact. +func (s *followerReplication) LastContact() time.Time { + s.lastContactLock.RLock() + last := s.lastContact + s.lastContactLock.RUnlock() + return last +} + +// setLastContact sets the last contact to the current time. +func (s *followerReplication) setLastContact() { + s.lastContactLock.Lock() + s.lastContact = time.Now() + s.lastContactLock.Unlock() +} + +// replicate is a long running routine that replicates log entries to a single +// follower. +func (r *Raft) replicate(s *followerReplication) { + // Start an async heartbeating routing + stopHeartbeat := make(chan struct{}) + defer close(stopHeartbeat) + r.goFunc(func() { r.heartbeat(s, stopHeartbeat) }) + +RPC: + shouldStop := false + for !shouldStop { + select { + case maxIndex := <-s.stopCh: + // Make a best effort to replicate up to this index + if maxIndex > 0 { + r.replicateTo(s, maxIndex) + } + return + case <-s.triggerCh: + lastLogIdx, _ := r.getLastLog() + shouldStop = r.replicateTo(s, lastLogIdx) + case <-randomTimeout(r.conf.CommitTimeout): // TODO: what is this? 
+ lastLogIdx, _ := r.getLastLog() + shouldStop = r.replicateTo(s, lastLogIdx) + } + + // If things looks healthy, switch to pipeline mode + if !shouldStop && s.allowPipeline { + goto PIPELINE + } + } + return + +PIPELINE: + // Disable until re-enabled + s.allowPipeline = false + + // Replicates using a pipeline for high performance. This method + // is not able to gracefully recover from errors, and so we fall back + // to standard mode on failure. + if err := r.pipelineReplicate(s); err != nil { + if err != ErrPipelineReplicationNotSupported { + r.logger.Printf("[ERR] raft: Failed to start pipeline replication to %s: %s", s.peer, err) + } + } + goto RPC +} + +// replicateTo is a hepler to replicate(), used to replicate the logs up to a +// given last index. +// If the follower log is behind, we take care to bring them up to date. +func (r *Raft) replicateTo(s *followerReplication, lastIndex uint64) (shouldStop bool) { + // Create the base request + var req AppendEntriesRequest + var resp AppendEntriesResponse + var start time.Time +START: + // Prevent an excessive retry rate on errors + if s.failures > 0 { + select { + case <-time.After(backoff(failureWait, s.failures, maxFailureScale)): + case <-r.shutdownCh: + } + } + + // Setup the request + if err := r.setupAppendEntries(s, &req, s.nextIndex, lastIndex); err == ErrLogNotFound { + goto SEND_SNAP + } else if err != nil { + return + } + + // Make the RPC call + start = time.Now() + if err := r.trans.AppendEntries(s.peer.Address, &req, &resp); err != nil { + r.logger.Printf("[ERR] raft: Failed to AppendEntries to %v: %v", s.peer, err) + s.failures++ + return + } + appendStats(string(s.peer.ID), start, float32(len(req.Entries))) + + // Check for a newer term, stop running + if resp.Term > req.Term { + r.handleStaleTerm(s) + return true + } + + // Update the last contact + s.setLastContact() + + // Update s based on success + if resp.Success { + // Update our replication state + updateLastAppended(s, &req) + + // 
Clear any failures, allow pipelining + s.failures = 0 + s.allowPipeline = true + } else { + s.nextIndex = max(min(s.nextIndex-1, resp.LastLog+1), 1) + if resp.NoRetryBackoff { + s.failures = 0 + } else { + s.failures++ + } + r.logger.Printf("[WARN] raft: AppendEntries to %v rejected, sending older logs (next: %d)", s.peer, s.nextIndex) + } + +CHECK_MORE: + // Poll the stop channel here in case we are looping and have been asked + // to stop, or have stepped down as leader. Even for the best effort case + // where we are asked to replicate to a given index and then shutdown, + // it's better to not loop in here to send lots of entries to a straggler + // that's leaving the cluster anyways. + select { + case <-s.stopCh: + return true + default: + } + + // Check if there are more logs to replicate + if s.nextIndex <= lastIndex { + goto START + } + return + + // SEND_SNAP is used when we fail to get a log, usually because the follower + // is too far behind, and we must ship a snapshot down instead +SEND_SNAP: + if stop, err := r.sendLatestSnapshot(s); stop { + return true + } else if err != nil { + r.logger.Printf("[ERR] raft: Failed to send snapshot to %v: %v", s.peer, err) + return + } + + // Check if there is more to replicate + goto CHECK_MORE +} + +// sendLatestSnapshot is used to send the latest snapshot we have +// down to our follower. 
+func (r *Raft) sendLatestSnapshot(s *followerReplication) (bool, error) { + // Get the snapshots + snapshots, err := r.snapshots.List() + if err != nil { + r.logger.Printf("[ERR] raft: Failed to list snapshots: %v", err) + return false, err + } + + // Check we have at least a single snapshot + if len(snapshots) == 0 { + return false, fmt.Errorf("no snapshots found") + } + + // Open the most recent snapshot + snapID := snapshots[0].ID + meta, snapshot, err := r.snapshots.Open(snapID) + if err != nil { + r.logger.Printf("[ERR] raft: Failed to open snapshot %v: %v", snapID, err) + return false, err + } + defer snapshot.Close() + + // Setup the request + req := InstallSnapshotRequest{ + RPCHeader: r.getRPCHeader(), + SnapshotVersion: meta.Version, + Term: s.currentTerm, + Leader: r.trans.EncodePeer(r.localAddr), + LastLogIndex: meta.Index, + LastLogTerm: meta.Term, + Peers: meta.Peers, + Size: meta.Size, + Configuration: encodeConfiguration(meta.Configuration), + ConfigurationIndex: meta.ConfigurationIndex, + } + + // Make the call + start := time.Now() + var resp InstallSnapshotResponse + if err := r.trans.InstallSnapshot(s.peer.Address, &req, &resp, snapshot); err != nil { + r.logger.Printf("[ERR] raft: Failed to install snapshot %v: %v", snapID, err) + s.failures++ + return false, err + } + metrics.MeasureSince([]string{"raft", "replication", "installSnapshot", string(s.peer.ID)}, start) + + // Check for a newer term, stop running + if resp.Term > req.Term { + r.handleStaleTerm(s) + return true, nil + } + + // Update the last contact + s.setLastContact() + + // Check for success + if resp.Success { + // Update the indexes + s.nextIndex = meta.Index + 1 + s.commitment.match(s.peer.ID, meta.Index) + + // Clear any failures + s.failures = 0 + + // Notify we are still leader + s.notifyAll(true) + } else { + s.failures++ + r.logger.Printf("[WARN] raft: InstallSnapshot to %v rejected", s.peer) + } + return false, nil +} + +// heartbeat is used to periodically invoke 
AppendEntries on a peer +// to ensure they don't time out. This is done async of replicate(), +// since that routine could potentially be blocked on disk IO. +func (r *Raft) heartbeat(s *followerReplication, stopCh chan struct{}) { + var failures uint64 + req := AppendEntriesRequest{ + RPCHeader: r.getRPCHeader(), + Term: s.currentTerm, + Leader: r.trans.EncodePeer(r.localAddr), + } + var resp AppendEntriesResponse + for { + // Wait for the next heartbeat interval or forced notify + select { + case <-s.notifyCh: + case <-randomTimeout(r.conf.HeartbeatTimeout / 10): + case <-stopCh: + return + } + + start := time.Now() + if err := r.trans.AppendEntries(s.peer.Address, &req, &resp); err != nil { + r.logger.Printf("[ERR] raft: Failed to heartbeat to %v: %v", s.peer.Address, err) + failures++ + select { + case <-time.After(backoff(failureWait, failures, maxFailureScale)): + case <-stopCh: + } + } else { + s.setLastContact() + failures = 0 + metrics.MeasureSince([]string{"raft", "replication", "heartbeat", string(s.peer.ID)}, start) + s.notifyAll(resp.Success) + } + } +} + +// pipelineReplicate is used when we have synchronized our state with the follower, +// and want to switch to a higher performance pipeline mode of replication. +// We only pipeline AppendEntries commands, and if we ever hit an error, we fall +// back to the standard replication which can handle more complex situations. 
+func (r *Raft) pipelineReplicate(s *followerReplication) error { + // Create a new pipeline + pipeline, err := r.trans.AppendEntriesPipeline(s.peer.Address) + if err != nil { + return err + } + defer pipeline.Close() + + // Log start and stop of pipeline + r.logger.Printf("[INFO] raft: pipelining replication to peer %v", s.peer) + defer r.logger.Printf("[INFO] raft: aborting pipeline replication to peer %v", s.peer) + + // Create a shutdown and finish channel + stopCh := make(chan struct{}) + finishCh := make(chan struct{}) + + // Start a dedicated decoder + r.goFunc(func() { r.pipelineDecode(s, pipeline, stopCh, finishCh) }) + + // Start pipeline sends at the last good nextIndex + nextIndex := s.nextIndex + + shouldStop := false +SEND: + for !shouldStop { + select { + case <-finishCh: + break SEND + case maxIndex := <-s.stopCh: + // Make a best effort to replicate up to this index + if maxIndex > 0 { + r.pipelineSend(s, pipeline, &nextIndex, maxIndex) + } + break SEND + case <-s.triggerCh: + lastLogIdx, _ := r.getLastLog() + shouldStop = r.pipelineSend(s, pipeline, &nextIndex, lastLogIdx) + case <-randomTimeout(r.conf.CommitTimeout): + lastLogIdx, _ := r.getLastLog() + shouldStop = r.pipelineSend(s, pipeline, &nextIndex, lastLogIdx) + } + } + + // Stop our decoder, and wait for it to finish + close(stopCh) + select { + case <-finishCh: + case <-r.shutdownCh: + } + return nil +} + +// pipelineSend is used to send data over a pipeline. It is a helper to +// pipelineReplicate. 
+func (r *Raft) pipelineSend(s *followerReplication, p AppendPipeline, nextIdx *uint64, lastIndex uint64) (shouldStop bool) { + // Create a new append request + req := new(AppendEntriesRequest) + if err := r.setupAppendEntries(s, req, *nextIdx, lastIndex); err != nil { + return true + } + + // Pipeline the append entries + if _, err := p.AppendEntries(req, new(AppendEntriesResponse)); err != nil { + r.logger.Printf("[ERR] raft: Failed to pipeline AppendEntries to %v: %v", s.peer, err) + return true + } + + // Increase the next send log to avoid re-sending old logs + if n := len(req.Entries); n > 0 { + last := req.Entries[n-1] + *nextIdx = last.Index + 1 + } + return false +} + +// pipelineDecode is used to decode the responses of pipelined requests. +func (r *Raft) pipelineDecode(s *followerReplication, p AppendPipeline, stopCh, finishCh chan struct{}) { + defer close(finishCh) + respCh := p.Consumer() + for { + select { + case ready := <-respCh: + req, resp := ready.Request(), ready.Response() + appendStats(string(s.peer.ID), ready.Start(), float32(len(req.Entries))) + + // Check for a newer term, stop running + if resp.Term > req.Term { + r.handleStaleTerm(s) + return + } + + // Update the last contact + s.setLastContact() + + // Abort pipeline if not successful + if !resp.Success { + return + } + + // Update our replication state + updateLastAppended(s, req) + case <-stopCh: + return + } + } +} + +// setupAppendEntries is used to setup an append entries request. 
+func (r *Raft) setupAppendEntries(s *followerReplication, req *AppendEntriesRequest, nextIndex, lastIndex uint64) error { + req.RPCHeader = r.getRPCHeader() + req.Term = s.currentTerm + req.Leader = r.trans.EncodePeer(r.localAddr) + req.LeaderCommitIndex = r.getCommitIndex() + if err := r.setPreviousLog(req, nextIndex); err != nil { + return err + } + if err := r.setNewLogs(req, nextIndex, lastIndex); err != nil { + return err + } + return nil +} + +// setPreviousLog is used to setup the PrevLogEntry and PrevLogTerm for an +// AppendEntriesRequest given the next index to replicate. +func (r *Raft) setPreviousLog(req *AppendEntriesRequest, nextIndex uint64) error { + // Guard for the first index, since there is no 0 log entry + // Guard against the previous index being a snapshot as well + lastSnapIdx, lastSnapTerm := r.getLastSnapshot() + if nextIndex == 1 { + req.PrevLogEntry = 0 + req.PrevLogTerm = 0 + + } else if (nextIndex - 1) == lastSnapIdx { + req.PrevLogEntry = lastSnapIdx + req.PrevLogTerm = lastSnapTerm + + } else { + var l Log + if err := r.logs.GetLog(nextIndex-1, &l); err != nil { + r.logger.Printf("[ERR] raft: Failed to get log at index %d: %v", + nextIndex-1, err) + return err + } + + // Set the previous index and term (0 if nextIndex is 1) + req.PrevLogEntry = l.Index + req.PrevLogTerm = l.Term + } + return nil +} + +// setNewLogs is used to setup the logs which should be appended for a request. 
+func (r *Raft) setNewLogs(req *AppendEntriesRequest, nextIndex, lastIndex uint64) error { + // Append up to MaxAppendEntries or up to the lastIndex + req.Entries = make([]*Log, 0, r.conf.MaxAppendEntries) + maxIndex := min(nextIndex+uint64(r.conf.MaxAppendEntries)-1, lastIndex) + for i := nextIndex; i <= maxIndex; i++ { + oldLog := new(Log) + if err := r.logs.GetLog(i, oldLog); err != nil { + r.logger.Printf("[ERR] raft: Failed to get log at index %d: %v", i, err) + return err + } + req.Entries = append(req.Entries, oldLog) + } + return nil +} + +// appendStats is used to emit stats about an AppendEntries invocation. +func appendStats(peer string, start time.Time, logs float32) { + metrics.MeasureSince([]string{"raft", "replication", "appendEntries", "rpc", peer}, start) + metrics.IncrCounter([]string{"raft", "replication", "appendEntries", "logs", peer}, logs) +} + +// handleStaleTerm is used when a follower indicates that we have a stale term. +func (r *Raft) handleStaleTerm(s *followerReplication) { + r.logger.Printf("[ERR] raft: peer %v has newer term, stopping replication", s.peer) + s.notifyAll(false) // No longer leader + asyncNotifyCh(s.stepDown) +} + +// updateLastAppended is used to update follower replication state after a +// successful AppendEntries RPC. +// TODO: This isn't used during InstallSnapshot, but the code there is similar. 
+func updateLastAppended(s *followerReplication, req *AppendEntriesRequest) { + // Mark any inflight logs as committed + if logs := req.Entries; len(logs) > 0 { + last := logs[len(logs)-1] + s.nextIndex = last.Index + 1 + s.commitment.match(s.peer.ID, last.Index) + } + + // Notify still leader + s.notifyAll(true) +} diff --git a/vendor/github.com/hashicorp/consul/vendor/github.com/hashicorp/raft/snapshot.go b/vendor/github.com/hashicorp/consul/vendor/github.com/hashicorp/raft/snapshot.go new file mode 100644 index 00000000000..8402e093807 --- /dev/null +++ b/vendor/github.com/hashicorp/consul/vendor/github.com/hashicorp/raft/snapshot.go @@ -0,0 +1,234 @@ +package raft + +import ( + "fmt" + "io" + "time" + + "github.com/armon/go-metrics" +) + +// SnapshotMeta is for metadata of a snapshot. +type SnapshotMeta struct { + // Version is the version number of the snapshot metadata. This does not cover + // the application's data in the snapshot, that should be versioned + // separately. + Version SnapshotVersion + + // ID is opaque to the store, and is used for opening. + ID string + + // Index and Term store when the snapshot was taken. + Index uint64 + Term uint64 + + // Peers is deprecated and used to support version 0 snapshots, but will + // be populated in version 1 snapshots as well to help with upgrades. + Peers []byte + + // Configuration and ConfigurationIndex are present in version 1 + // snapshots and later. + Configuration Configuration + ConfigurationIndex uint64 + + // Size is the size of the snapshot in bytes. + Size int64 +} + +// SnapshotStore interface is used to allow for flexible implementations +// of snapshot storage and retrieval. For example, a client could implement +// a shared state store such as S3, allowing new nodes to restore snapshots +// without streaming from the leader. +type SnapshotStore interface { + // Create is used to begin a snapshot at a given index and term, and with + // the given committed configuration. 
The version parameter controls + // which snapshot version to create. + Create(version SnapshotVersion, index, term uint64, configuration Configuration, + configurationIndex uint64, trans Transport) (SnapshotSink, error) + + // List is used to list the available snapshots in the store. + // It should return then in descending order, with the highest index first. + List() ([]*SnapshotMeta, error) + + // Open takes a snapshot ID and provides a ReadCloser. Once close is + // called it is assumed the snapshot is no longer needed. + Open(id string) (*SnapshotMeta, io.ReadCloser, error) +} + +// SnapshotSink is returned by StartSnapshot. The FSM will Write state +// to the sink and call Close on completion. On error, Cancel will be invoked. +type SnapshotSink interface { + io.WriteCloser + ID() string + Cancel() error +} + +// runSnapshots is a long running goroutine used to manage taking +// new snapshots of the FSM. It runs in parallel to the FSM and +// main goroutines, so that snapshots do not block normal operation. +func (r *Raft) runSnapshots() { + for { + select { + case <-randomTimeout(r.conf.SnapshotInterval): + // Check if we should snapshot + if !r.shouldSnapshot() { + continue + } + + // Trigger a snapshot + if err := r.takeSnapshot(); err != nil { + r.logger.Printf("[ERR] raft: Failed to take snapshot: %v", err) + } + + case future := <-r.snapshotCh: + // User-triggered, run immediately + err := r.takeSnapshot() + if err != nil { + r.logger.Printf("[ERR] raft: Failed to take snapshot: %v", err) + } + future.respond(err) + + case <-r.shutdownCh: + return + } + } +} + +// shouldSnapshot checks if we meet the conditions to take +// a new snapshot. 
+func (r *Raft) shouldSnapshot() bool { + // Check the last snapshot index + lastSnap, _ := r.getLastSnapshot() + + // Check the last log index + lastIdx, err := r.logs.LastIndex() + if err != nil { + r.logger.Printf("[ERR] raft: Failed to get last log index: %v", err) + return false + } + + // Compare the delta to the threshold + delta := lastIdx - lastSnap + return delta >= r.conf.SnapshotThreshold +} + +// takeSnapshot is used to take a new snapshot. This must only be called from +// the snapshot thread, never the main thread. +func (r *Raft) takeSnapshot() error { + defer metrics.MeasureSince([]string{"raft", "snapshot", "takeSnapshot"}, time.Now()) + + // Create a request for the FSM to perform a snapshot. + snapReq := &reqSnapshotFuture{} + snapReq.init() + + // Wait for dispatch or shutdown. + select { + case r.fsmSnapshotCh <- snapReq: + case <-r.shutdownCh: + return ErrRaftShutdown + } + + // Wait until we get a response + if err := snapReq.Error(); err != nil { + if err != ErrNothingNewToSnapshot { + err = fmt.Errorf("failed to start snapshot: %v", err) + } + return err + } + defer snapReq.snapshot.Release() + + // Make a request for the configurations and extract the committed info. + // We have to use the future here to safely get this information since + // it is owned by the main thread. + configReq := &configurationsFuture{} + configReq.init() + select { + case r.configurationsCh <- configReq: + case <-r.shutdownCh: + return ErrRaftShutdown + } + if err := configReq.Error(); err != nil { + return err + } + committed := configReq.configurations.committed + committedIndex := configReq.configurations.committedIndex + + // We don't support snapshots while there's a config change outstanding + // since the snapshot doesn't have a means to represent this state. This + // is a little weird because we need the FSM to apply an index that's + // past the configuration change, even though the FSM itself doesn't see + // the configuration changes. 
It should be ok in practice with normal + // application traffic flowing through the FSM. If there's none of that + // then it's not crucial that we snapshot, since there's not much going + // on Raft-wise. + if snapReq.index < committedIndex { + return fmt.Errorf("cannot take snapshot now, wait until the configuration entry at %v has been applied (have applied %v)", + committedIndex, snapReq.index) + } + + // Create a new snapshot. + r.logger.Printf("[INFO] raft: Starting snapshot up to %d", snapReq.index) + start := time.Now() + version := getSnapshotVersion(r.protocolVersion) + sink, err := r.snapshots.Create(version, snapReq.index, snapReq.term, committed, committedIndex, r.trans) + if err != nil { + return fmt.Errorf("failed to create snapshot: %v", err) + } + metrics.MeasureSince([]string{"raft", "snapshot", "create"}, start) + + // Try to persist the snapshot. + start = time.Now() + if err := snapReq.snapshot.Persist(sink); err != nil { + sink.Cancel() + return fmt.Errorf("failed to persist snapshot: %v", err) + } + metrics.MeasureSince([]string{"raft", "snapshot", "persist"}, start) + + // Close and check for error. + if err := sink.Close(); err != nil { + return fmt.Errorf("failed to close snapshot: %v", err) + } + + // Update the last stable snapshot info. + r.setLastSnapshot(snapReq.index, snapReq.term) + + // Compact the logs. + if err := r.compactLogs(snapReq.index); err != nil { + return err + } + + r.logger.Printf("[INFO] raft: Snapshot to %d complete", snapReq.index) + return nil +} + +// compactLogs takes the last inclusive index of a snapshot +// and trims the logs that are no longer needed. 
+func (r *Raft) compactLogs(snapIdx uint64) error { + defer metrics.MeasureSince([]string{"raft", "compactLogs"}, time.Now()) + // Determine log ranges to compact + minLog, err := r.logs.FirstIndex() + if err != nil { + return fmt.Errorf("failed to get first log index: %v", err) + } + + // Check if we have enough logs to truncate + lastLogIdx, _ := r.getLastLog() + if lastLogIdx <= r.conf.TrailingLogs { + return nil + } + + // Truncate up to the end of the snapshot, or `TrailingLogs` + // back from the head, which ever is further back. This ensures + // at least `TrailingLogs` entries, but does not allow logs + // after the snapshot to be removed. + maxLog := min(snapIdx, lastLogIdx-r.conf.TrailingLogs) + + // Log this + r.logger.Printf("[INFO] raft: Compacting logs from %d to %d", minLog, maxLog) + + // Compact the logs + if err := r.logs.DeleteRange(minLog, maxLog); err != nil { + return fmt.Errorf("log compaction failed: %v", err) + } + return nil +} diff --git a/vendor/github.com/hashicorp/consul/vendor/github.com/hashicorp/raft/stable.go b/vendor/github.com/hashicorp/consul/vendor/github.com/hashicorp/raft/stable.go new file mode 100644 index 00000000000..ff59a8c570a --- /dev/null +++ b/vendor/github.com/hashicorp/consul/vendor/github.com/hashicorp/raft/stable.go @@ -0,0 +1,15 @@ +package raft + +// StableStore is used to provide stable storage +// of key configurations to ensure safety. +type StableStore interface { + Set(key []byte, val []byte) error + + // Get returns the value for key, or an empty byte slice if key was not found. + Get(key []byte) ([]byte, error) + + SetUint64(key []byte, val uint64) error + + // GetUint64 returns the uint64 value for key, or 0 if key was not found. 
+ GetUint64(key []byte) (uint64, error) +} diff --git a/vendor/github.com/hashicorp/consul/vendor/github.com/hashicorp/raft/state.go b/vendor/github.com/hashicorp/consul/vendor/github.com/hashicorp/raft/state.go new file mode 100644 index 00000000000..f6d658b8bb4 --- /dev/null +++ b/vendor/github.com/hashicorp/consul/vendor/github.com/hashicorp/raft/state.go @@ -0,0 +1,167 @@ +package raft + +import ( + "sync" + "sync/atomic" +) + +// RaftState captures the state of a Raft node: Follower, Candidate, Leader, +// or Shutdown. +type RaftState uint32 + +const ( + // Follower is the initial state of a Raft node. + Follower RaftState = iota + + // Candidate is one of the valid states of a Raft node. + Candidate + + // Leader is one of the valid states of a Raft node. + Leader + + // Shutdown is the terminal state of a Raft node. + Shutdown +) + +func (s RaftState) String() string { + switch s { + case Follower: + return "Follower" + case Candidate: + return "Candidate" + case Leader: + return "Leader" + case Shutdown: + return "Shutdown" + default: + return "Unknown" + } +} + +// raftState is used to maintain various state variables +// and provides an interface to set/get the variables in a +// thread safe manner. 
+type raftState struct { + // The current term, cache of StableStore + currentTerm uint64 + + // Highest committed log entry + commitIndex uint64 + + // Last applied log to the FSM + lastApplied uint64 + + // protects 4 next fields + lastLock sync.Mutex + + // Cache the latest snapshot index/term + lastSnapshotIndex uint64 + lastSnapshotTerm uint64 + + // Cache the latest log from LogStore + lastLogIndex uint64 + lastLogTerm uint64 + + // Tracks running goroutines + routinesGroup sync.WaitGroup + + // The current state + state RaftState +} + +func (r *raftState) getState() RaftState { + stateAddr := (*uint32)(&r.state) + return RaftState(atomic.LoadUint32(stateAddr)) +} + +func (r *raftState) setState(s RaftState) { + stateAddr := (*uint32)(&r.state) + atomic.StoreUint32(stateAddr, uint32(s)) +} + +func (r *raftState) getCurrentTerm() uint64 { + return atomic.LoadUint64(&r.currentTerm) +} + +func (r *raftState) setCurrentTerm(term uint64) { + atomic.StoreUint64(&r.currentTerm, term) +} + +func (r *raftState) getLastLog() (index, term uint64) { + r.lastLock.Lock() + index = r.lastLogIndex + term = r.lastLogTerm + r.lastLock.Unlock() + return +} + +func (r *raftState) setLastLog(index, term uint64) { + r.lastLock.Lock() + r.lastLogIndex = index + r.lastLogTerm = term + r.lastLock.Unlock() +} + +func (r *raftState) getLastSnapshot() (index, term uint64) { + r.lastLock.Lock() + index = r.lastSnapshotIndex + term = r.lastSnapshotTerm + r.lastLock.Unlock() + return +} + +func (r *raftState) setLastSnapshot(index, term uint64) { + r.lastLock.Lock() + r.lastSnapshotIndex = index + r.lastSnapshotTerm = term + r.lastLock.Unlock() +} + +func (r *raftState) getCommitIndex() uint64 { + return atomic.LoadUint64(&r.commitIndex) +} + +func (r *raftState) setCommitIndex(index uint64) { + atomic.StoreUint64(&r.commitIndex, index) +} + +func (r *raftState) getLastApplied() uint64 { + return atomic.LoadUint64(&r.lastApplied) +} + +func (r *raftState) setLastApplied(index uint64) { + 
atomic.StoreUint64(&r.lastApplied, index) +} + +// Start a goroutine and properly handle the race between a routine +// starting and incrementing, and exiting and decrementing. +func (r *raftState) goFunc(f func()) { + r.routinesGroup.Add(1) + go func() { + defer r.routinesGroup.Done() + f() + }() +} + +func (r *raftState) waitShutdown() { + r.routinesGroup.Wait() +} + +// getLastIndex returns the last index in stable storage. +// Either from the last log or from the last snapshot. +func (r *raftState) getLastIndex() uint64 { + r.lastLock.Lock() + defer r.lastLock.Unlock() + return max(r.lastLogIndex, r.lastSnapshotIndex) +} + +// getLastEntry returns the last index and term in stable storage. +// Either from the last log or from the last snapshot. +func (r *raftState) getLastEntry() (uint64, uint64) { + r.lastLock.Lock() + defer r.lastLock.Unlock() + if r.lastLogIndex >= r.lastSnapshotIndex { + return r.lastLogIndex, r.lastLogTerm + } + return r.lastSnapshotIndex, r.lastSnapshotTerm +} diff --git a/vendor/github.com/hashicorp/consul/vendor/github.com/hashicorp/raft/tcp_transport.go b/vendor/github.com/hashicorp/consul/vendor/github.com/hashicorp/raft/tcp_transport.go new file mode 100644 index 00000000000..9281508a050 --- /dev/null +++ b/vendor/github.com/hashicorp/consul/vendor/github.com/hashicorp/raft/tcp_transport.go @@ -0,0 +1,105 @@ +package raft + +import ( + "errors" + "io" + "log" + "net" + "time" +) + +var ( + errNotAdvertisable = errors.New("local bind address is not advertisable") + errNotTCP = errors.New("local address is not a TCP address") +) + +// TCPStreamLayer implements StreamLayer interface for plain TCP. +type TCPStreamLayer struct { + advertise net.Addr + listener *net.TCPListener +} + +// NewTCPTransport returns a NetworkTransport that is built on top of +// a TCP streaming transport layer. 
+func NewTCPTransport( + bindAddr string, + advertise net.Addr, + maxPool int, + timeout time.Duration, + logOutput io.Writer, +) (*NetworkTransport, error) { + return newTCPTransport(bindAddr, advertise, maxPool, timeout, func(stream StreamLayer) *NetworkTransport { + return NewNetworkTransport(stream, maxPool, timeout, logOutput) + }) +} + +// NewTCPTransportWithLogger returns a NetworkTransport that is built on top of +// a TCP streaming transport layer, with log output going to the supplied Logger +func NewTCPTransportWithLogger( + bindAddr string, + advertise net.Addr, + maxPool int, + timeout time.Duration, + logger *log.Logger, +) (*NetworkTransport, error) { + return newTCPTransport(bindAddr, advertise, maxPool, timeout, func(stream StreamLayer) *NetworkTransport { + return NewNetworkTransportWithLogger(stream, maxPool, timeout, logger) + }) +} + +func newTCPTransport(bindAddr string, + advertise net.Addr, + maxPool int, + timeout time.Duration, + transportCreator func(stream StreamLayer) *NetworkTransport) (*NetworkTransport, error) { + // Try to bind + list, err := net.Listen("tcp", bindAddr) + if err != nil { + return nil, err + } + + // Create stream + stream := &TCPStreamLayer{ + advertise: advertise, + listener: list.(*net.TCPListener), + } + + // Verify that we have a usable advertise address + addr, ok := stream.Addr().(*net.TCPAddr) + if !ok { + list.Close() + return nil, errNotTCP + } + if addr.IP.IsUnspecified() { + list.Close() + return nil, errNotAdvertisable + } + + // Create the network transport + trans := transportCreator(stream) + return trans, nil +} + +// Dial implements the StreamLayer interface. +func (t *TCPStreamLayer) Dial(address ServerAddress, timeout time.Duration) (net.Conn, error) { + return net.DialTimeout("tcp", string(address), timeout) +} + +// Accept implements the net.Listener interface. 
+func (t *TCPStreamLayer) Accept() (c net.Conn, err error) { + return t.listener.Accept() +} + +// Close implements the net.Listener interface. +func (t *TCPStreamLayer) Close() (err error) { + return t.listener.Close() +} + +// Addr implements the net.Listener interface. +func (t *TCPStreamLayer) Addr() net.Addr { + // Use an advertise addr if provided + if t.advertise != nil { + return t.advertise + } + return t.listener.Addr() +} diff --git a/vendor/github.com/hashicorp/consul/vendor/github.com/hashicorp/raft/transport.go b/vendor/github.com/hashicorp/consul/vendor/github.com/hashicorp/raft/transport.go new file mode 100644 index 00000000000..633f97a8c5c --- /dev/null +++ b/vendor/github.com/hashicorp/consul/vendor/github.com/hashicorp/raft/transport.go @@ -0,0 +1,124 @@ +package raft + +import ( + "io" + "time" +) + +// RPCResponse captures both a response and a potential error. +type RPCResponse struct { + Response interface{} + Error error +} + +// RPC has a command, and provides a response mechanism. +type RPC struct { + Command interface{} + Reader io.Reader // Set only for InstallSnapshot + RespChan chan<- RPCResponse +} + +// Respond is used to respond with a response, error or both +func (r *RPC) Respond(resp interface{}, err error) { + r.RespChan <- RPCResponse{resp, err} +} + +// Transport provides an interface for network transports +// to allow Raft to communicate with other nodes. +type Transport interface { + // Consumer returns a channel that can be used to + // consume and respond to RPC requests. + Consumer() <-chan RPC + + // LocalAddr is used to return our local address to distinguish from our peers. + LocalAddr() ServerAddress + + // AppendEntriesPipeline returns an interface that can be used to pipeline + // AppendEntries requests. + AppendEntriesPipeline(target ServerAddress) (AppendPipeline, error) + + // AppendEntries sends the appropriate RPC to the target node. 
+ AppendEntries(target ServerAddress, args *AppendEntriesRequest, resp *AppendEntriesResponse) error + + // RequestVote sends the appropriate RPC to the target node. + RequestVote(target ServerAddress, args *RequestVoteRequest, resp *RequestVoteResponse) error + + // InstallSnapshot is used to push a snapshot down to a follower. The data is read from + // the ReadCloser and streamed to the client. + InstallSnapshot(target ServerAddress, args *InstallSnapshotRequest, resp *InstallSnapshotResponse, data io.Reader) error + + // EncodePeer is used to serialize a peer's address. + EncodePeer(ServerAddress) []byte + + // DecodePeer is used to deserialize a peer's address. + DecodePeer([]byte) ServerAddress + + // SetHeartbeatHandler is used to setup a heartbeat handler + // as a fast-pass. This is to avoid head-of-line blocking from + // disk IO. If a Transport does not support this, it can simply + // ignore the call, and push the heartbeat onto the Consumer channel. + SetHeartbeatHandler(cb func(rpc RPC)) +} + +// WithClose is an interface that a transport may provide which +// allows a transport to be shut down cleanly when a Raft instance +// shuts down. +// +// It is defined separately from Transport as unfortunately it wasn't in the +// original interface specification. +type WithClose interface { + // Close permanently closes a transport, stopping + // any associated goroutines and freeing other resources. + Close() error +} + +// LoopbackTransport is an interface that provides a loopback transport suitable for testing +// e.g. InmemTransport. It's there so we don't have to rewrite tests. +type LoopbackTransport interface { + Transport // Embedded transport reference + WithPeers // Embedded peer management + WithClose // with a close routine +} + +// WithPeers is an interface that a transport may provide which allows for connection and +// disconnection. Unless the transport is a loopback transport, the transport specified to +// "Connect" is likely to be nil. 
+type WithPeers interface { + Connect(peer ServerAddress, t Transport) // Connect a peer + Disconnect(peer ServerAddress) // Disconnect a given peer + DisconnectAll() // Disconnect all peers, possibly to reconnect them later +} + +// AppendPipeline is used for pipelining AppendEntries requests. It is used +// to increase the replication throughput by masking latency and better +// utilizing bandwidth. +type AppendPipeline interface { + // AppendEntries is used to add another request to the pipeline. + // The send may block which is an effective form of back-pressure. + AppendEntries(args *AppendEntriesRequest, resp *AppendEntriesResponse) (AppendFuture, error) + + // Consumer returns a channel that can be used to consume + // response futures when they are ready. + Consumer() <-chan AppendFuture + + // Close closes the pipeline and cancels all inflight RPCs + Close() error +} + +// AppendFuture is used to return information about a pipelined AppendEntries request. +type AppendFuture interface { + Future + + // Start returns the time that the append request was started. + // It is always OK to call this method. + Start() time.Time + + // Request holds the parameters of the AppendEntries call. + // It is always OK to call this method. + Request() *AppendEntriesRequest + + // Response holds the results of the AppendEntries call. + // This method must only be called after the Error + // method returns, and will only be valid on success. 
+ Response() *AppendEntriesResponse +} diff --git a/vendor/github.com/hashicorp/consul/vendor/github.com/hashicorp/raft/util.go b/vendor/github.com/hashicorp/consul/vendor/github.com/hashicorp/raft/util.go new file mode 100644 index 00000000000..90428d7437e --- /dev/null +++ b/vendor/github.com/hashicorp/consul/vendor/github.com/hashicorp/raft/util.go @@ -0,0 +1,133 @@ +package raft + +import ( + "bytes" + crand "crypto/rand" + "fmt" + "math" + "math/big" + "math/rand" + "time" + + "github.com/hashicorp/go-msgpack/codec" +) + +func init() { + // Ensure we use a high-entropy seed for the psuedo-random generator + rand.Seed(newSeed()) +} + +// returns an int64 from a crypto random source +// can be used to seed a source for a math/rand. +func newSeed() int64 { + r, err := crand.Int(crand.Reader, big.NewInt(math.MaxInt64)) + if err != nil { + panic(fmt.Errorf("failed to read random bytes: %v", err)) + } + return r.Int64() +} + +// randomTimeout returns a value that is between the minVal and 2x minVal. +func randomTimeout(minVal time.Duration) <-chan time.Time { + if minVal == 0 { + return nil + } + extra := (time.Duration(rand.Int63()) % minVal) + return time.After(minVal + extra) +} + +// min returns the minimum. +func min(a, b uint64) uint64 { + if a <= b { + return a + } + return b +} + +// max returns the maximum. +func max(a, b uint64) uint64 { + if a >= b { + return a + } + return b +} + +// generateUUID is used to generate a random UUID. +func generateUUID() string { + buf := make([]byte, 16) + if _, err := crand.Read(buf); err != nil { + panic(fmt.Errorf("failed to read random bytes: %v", err)) + } + + return fmt.Sprintf("%08x-%04x-%04x-%04x-%12x", + buf[0:4], + buf[4:6], + buf[6:8], + buf[8:10], + buf[10:16]) +} + +// asyncNotifyCh is used to do an async channel send +// to a single channel without blocking. 
+func asyncNotifyCh(ch chan struct{}) { + select { + case ch <- struct{}{}: + default: + } +} + +// drainNotifyCh empties out a single-item notification channel without +// blocking, and returns whether it received anything. +func drainNotifyCh(ch chan struct{}) bool { + select { + case <-ch: + return true + default: + return false + } +} + +// asyncNotifyBool is used to do an async notification +// on a bool channel; v is dropped if the send would block. +func asyncNotifyBool(ch chan bool, v bool) { + select { + case ch <- v: + default: + } +} + +// decodeMsgPack reverses the encode operation on a byte slice input. +func decodeMsgPack(buf []byte, out interface{}) error { + r := bytes.NewBuffer(buf) + hd := codec.MsgpackHandle{} + dec := codec.NewDecoder(r, &hd) + return dec.Decode(out) +} + +// encodeMsgPack writes an encoded object to a new bytes buffer. +func encodeMsgPack(in interface{}) (*bytes.Buffer, error) { + buf := bytes.NewBuffer(nil) + hd := codec.MsgpackHandle{} + enc := codec.NewEncoder(buf, &hd) + err := enc.Encode(in) + return buf, err +} + +// backoff is used to compute an exponential backoff +// duration. Base time is scaled by the current round, +// up to some maximum scale factor. 
+func backoff(base time.Duration, round, limit uint64) time.Duration { + power := min(round, limit) + for power > 2 { + base *= 2 + power-- + } + return base +} + +// Needed for sorting []uint64, used to determine commitment +type uint64Slice []uint64 + +func (p uint64Slice) Len() int { return len(p) } +func (p uint64Slice) Less(i, j int) bool { return p[i] < p[j] } +func (p uint64Slice) Swap(i, j int) { p[i], p[j] = p[j], p[i] } diff --git a/vendor/github.com/hashicorp/consul/vendor/vendor.json b/vendor/github.com/hashicorp/consul/vendor/vendor.json new file mode 100644 index 00000000000..1d0e61fefa5 --- /dev/null +++ b/vendor/github.com/hashicorp/consul/vendor/vendor.json @@ -0,0 +1,471 @@ +{ + "comment": "", + "ignore": "test", + "package": [ + { + "path": "context", + "revision": "" + }, + { + "checksumSHA1": "JhyS/zIicgtrSasHSZ6WtXGWJVk=", + "path": "github.com/DataDog/datadog-go/statsd", + "revision": "cc2f4770f4d61871e19bfee967bc767fe730b0d9", + "revisionTime": "2016-03-29T13:52:53Z" + }, + { + "checksumSHA1": "yXrS4e4yxaRBu/MX+39X3cS4kbg=", + "path": "github.com/Sirupsen/logrus", + "revision": "a283a10442df8dc09befd873fab202bf8a253d6a", + "revisionTime": "2016-07-16T02:56:31Z" + }, + { + "checksumSHA1": "l0iFqayYAaEip6Olaq3/LCOa/Sg=", + "path": "github.com/armon/circbuf", + "revision": "bbbad097214e2918d8543d5201d12bfd7bca254d", + "revisionTime": "2015-08-27T00:49:46Z" + }, + { + "checksumSHA1": "d6798KSc0jDg2MHNxKdgyNfMK7A=", + "path": "github.com/armon/go-metrics", + "revision": "3df31a1ada83e310c2e24b267c8e8b68836547b4", + "revisionTime": "2016-07-17T04:34:58Z" + }, + { + "checksumSHA1": "OmqT9Y1mAHvlAKeJh0jBHC9SH78=", + "path": "github.com/armon/go-metrics/circonus", + "revision": "3df31a1ada83e310c2e24b267c8e8b68836547b4", + "revisionTime": "2016-07-17T04:34:58Z" + }, + { + "checksumSHA1": "mAzNU3zeZGEwqjDT4ZkspFvx3TI=", + "path": "github.com/armon/go-metrics/datadog", + "revision": "3df31a1ada83e310c2e24b267c8e8b68836547b4", + "revisionTime": 
"2016-07-17T04:34:58Z" + }, + { + "checksumSHA1": "gNO0JNpLzYOdInGeq7HqMZUzx9M=", + "path": "github.com/armon/go-radix", + "revision": "4239b77079c7b5d1243b7b4736304ce8ddb6f0f2", + "revisionTime": "2016-01-15T23:47:25Z" + }, + { + "checksumSHA1": "dvd7Su+WNmHRP1+w1HezrPUCDsc=", + "path": "github.com/bgentry/speakeasy", + "revision": "e1439544d8ecd0f3e9373a636d447668096a8f81", + "revisionTime": "2016-05-20T23:26:10Z" + }, + { + "checksumSHA1": "twtRfb6484vfr2qqjiFkLThTjcQ=", + "path": "github.com/bgentry/speakeasy/example", + "revision": "e1439544d8ecd0f3e9373a636d447668096a8f81", + "revisionTime": "2016-05-20T23:26:10Z" + }, + { + "checksumSHA1": "C1BwfmTAUmldO4V3++tvYWzT49Y=", + "comment": "v1.2.1", + "path": "github.com/boltdb/bolt", + "revision": "dfb21201d9270c1082d5fb0f07f500311ff72f18", + "revisionTime": "2016-05-16T15:40:46Z", + "version": "v1.2.1", + "versionExact": "v1.2.1" + }, + { + "checksumSHA1": "b5zgHT9TxBAVh/KP9kQi7QVoz9w=", + "path": "github.com/circonus-labs/circonus-gometrics", + "revision": "a7c30e0dcc6e2341053132470dcedc12bc7705ef", + "revisionTime": "2016-07-22T17:27:10Z" + }, + { + "checksumSHA1": "IFiYTxu8jshL4A8BCttUaDhp1m4=", + "path": "github.com/circonus-labs/circonus-gometrics/api", + "revision": "a7c30e0dcc6e2341053132470dcedc12bc7705ef", + "revisionTime": "2016-07-22T17:27:10Z" + }, + { + "checksumSHA1": "+9vcRzlTdvEjH/Uf8fKC5MXdjNw=", + "path": "github.com/circonus-labs/circonus-gometrics/checkmgr", + "revision": "a7c30e0dcc6e2341053132470dcedc12bc7705ef", + "revisionTime": "2016-07-22T17:27:10Z" + }, + { + "checksumSHA1": "C4Z7+l5GOpOCW5DcvNYzheGvQRE=", + "path": "github.com/circonus-labs/circonusllhist", + "revision": "d724266ae5270ae8b87a5d2e8081f04e307c3c18", + "revisionTime": "2016-05-26T04:38:13Z" + }, + { + "checksumSHA1": "fXAinpJ5bOcborK7AiO1rnW60BI=", + "path": "github.com/docker/docker/opts", + "revision": "eb28dde01f165849bf372e18200e83042c76f26c", + "revisionTime": "2016-08-09T18:56:09Z" + }, + { + "checksumSHA1": 
"Kw89nXWMfcNR0KcRw3T8bmuaQlw=", + "path": "github.com/docker/docker/pkg/archive", + "revision": "eb28dde01f165849bf372e18200e83042c76f26c", + "revisionTime": "2016-08-09T18:56:09Z" + }, + { + "checksumSHA1": "OFqhPMitAwvJXnD68U2ZVwKhsTA=", + "path": "github.com/docker/docker/pkg/fileutils", + "revision": "eb28dde01f165849bf372e18200e83042c76f26c", + "revisionTime": "2016-08-09T18:56:09Z" + }, + { + "checksumSHA1": "p6Ud4Yf1ywWy20YxXF1RU4yhTio=", + "path": "github.com/docker/docker/pkg/homedir", + "revision": "eb28dde01f165849bf372e18200e83042c76f26c", + "revisionTime": "2016-08-09T18:56:09Z" + }, + { + "checksumSHA1": "iP5slJJPRZUm0rfdII8OiATAACA=", + "path": "github.com/docker/docker/pkg/idtools", + "revision": "eb28dde01f165849bf372e18200e83042c76f26c", + "revisionTime": "2016-08-09T18:56:09Z" + }, + { + "checksumSHA1": "QDpwKJCKQV6+0o0kkPN4SC5Aldc=", + "path": "github.com/docker/docker/pkg/ioutils", + "revision": "eb28dde01f165849bf372e18200e83042c76f26c", + "revisionTime": "2016-08-09T18:56:09Z" + }, + { + "checksumSHA1": "ndnAFCfsGC3upNQ6jAEwzxcurww=", + "path": "github.com/docker/docker/pkg/longpath", + "revision": "eb28dde01f165849bf372e18200e83042c76f26c", + "revisionTime": "2016-08-09T18:56:09Z" + }, + { + "checksumSHA1": "l4lJIW9JIddsKRqsoqRt9zXu/7M=", + "path": "github.com/docker/docker/pkg/pools", + "revision": "eb28dde01f165849bf372e18200e83042c76f26c", + "revisionTime": "2016-08-09T18:56:09Z" + }, + { + "checksumSHA1": "txf3EORYff4hO6PEvwBm2lyh1MU=", + "path": "github.com/docker/docker/pkg/promise", + "revision": "eb28dde01f165849bf372e18200e83042c76f26c", + "revisionTime": "2016-08-09T18:56:09Z" + }, + { + "checksumSHA1": "YDYbS5U2mDwfcOUJ6M09cP6Bubg=", + "path": "github.com/docker/docker/pkg/stdcopy", + "revision": "eb28dde01f165849bf372e18200e83042c76f26c", + "revisionTime": "2016-08-09T18:56:09Z" + }, + { + "checksumSHA1": "Eh3iu/9RzHzNY4vHHPKaZISAgBo=", + "path": "github.com/docker/docker/pkg/system", + "revision": 
"eb28dde01f165849bf372e18200e83042c76f26c", + "revisionTime": "2016-08-09T18:56:09Z" + }, + { + "checksumSHA1": "hZV62Xzt/i0e/WBKWww3rpkRAR4=", + "path": "github.com/docker/engine-api/types/filters", + "revision": "fc564829f64e3a820d7b896611ede92c8744d752", + "revisionTime": "2016-08-09T18:19:22Z" + }, + { + "checksumSHA1": "AE3TTpPWvv9ic71FiF0HnRr46mE=", + "path": "github.com/docker/engine-api/types/versions", + "revision": "fc564829f64e3a820d7b896611ede92c8744d752", + "revisionTime": "2016-08-09T18:19:22Z" + }, + { + "checksumSHA1": "KbWP8VsU9gVoZm9pCqe79AkfDmk=", + "path": "github.com/docker/go-units", + "revision": "eb879ae3e2b84e2a142af415b679ddeda47ec71c", + "revisionTime": "2016-08-02T14:55:05Z" + }, + { + "checksumSHA1": "5ftkjfUwI9A6xCQ1PwIAd5+qlo0=", + "path": "github.com/elazarl/go-bindata-assetfs", + "revision": "e1a2a7ec64b07d04ac9ebb072404fe8b7b60de1b", + "revisionTime": "2016-08-03T19:23:04Z" + }, + { + "checksumSHA1": "CaThXbumVxZtNlItiXma5B78PwQ=", + "path": "github.com/elazarl/go-bindata-assetfs/go-bindata-assetfs", + "revision": "e1a2a7ec64b07d04ac9ebb072404fe8b7b60de1b", + "revisionTime": "2016-08-03T19:23:04Z" + }, + { + "checksumSHA1": "U+z+QQ323fAeBtBS17ztWmyVI3Q=", + "path": "github.com/fsouza/go-dockerclient", + "revision": "a53ba79627e888ef775bdcf15813f07d7a232867", + "revisionTime": "2016-08-09T01:24:47Z" + }, + { + "checksumSHA1": "cdOCt0Yb+hdErz8NAQqayxPmRsY=", + "path": "github.com/hashicorp/errwrap", + "revision": "7554cd9344cec97297fa6649b055a8c98c2a1e55", + "revisionTime": "2014-10-28T05:47:10Z" + }, + { + "checksumSHA1": "nd3S1qkFv7zZxA9be0bw4nT0pe0=", + "path": "github.com/hashicorp/go-checkpoint", + "revision": "e4b2dc34c0f698ee04750bf2035d8b9384233e1b", + "revisionTime": "2015-10-22T18:15:14Z" + }, + { + "checksumSHA1": "Uzyon2091lmwacNsl1hCytjhHtg=", + "path": "github.com/hashicorp/go-cleanhttp", + "revision": "ad28ea4487f05916463e2423a55166280e8254b5", + "revisionTime": "2016-04-07T17:41:26Z" + }, + { + "checksumSHA1": 
"qmE9mO0WW6ALLpUU81rXDyspP5M=", + "path": "github.com/hashicorp/go-immutable-radix", + "revision": "afc5a0dbb18abdf82c277a7bc01533e81fa1d6b8", + "revisionTime": "2016-06-09T02:05:29Z" + }, + { + "checksumSHA1": "/V57CyN7x2NUlHoOzVL5GgGXX84=", + "path": "github.com/hashicorp/go-memdb", + "revision": "98f52f52d7a476958fa9da671354d270c50661a7", + "revisionTime": "2016-03-01T23:01:42Z" + }, + { + "path": "github.com/hashicorp/go-msgpack/codec", + "revision": "fa3f63826f7c23912c15263591e65d54d080b458" + }, + { + "checksumSHA1": "lrSl49G23l6NhfilxPM0XFs5rZo=", + "path": "github.com/hashicorp/go-multierror", + "revision": "d30f09973e19c1dfcd120b2d9c4f168e68d6b5d5", + "revisionTime": "2015-09-16T20:57:42Z" + }, + { + "checksumSHA1": "7qEpaAJA78EvQDs16x9mYOWfgqo=", + "path": "github.com/hashicorp/go-reap", + "revision": "2d85522212dcf5a84c6b357094f5c44710441912", + "revisionTime": "2016-01-13T17:25:55Z" + }, + { + "checksumSHA1": "bfVGm7xZ2VFpddJp3KEPUZ8Y9Po=", + "path": "github.com/hashicorp/go-retryablehttp", + "revision": "886ce0458bc81ccca0fb7044c1be0e9ab590bed7", + "revisionTime": "2016-07-18T23:34:41Z" + }, + { + "checksumSHA1": "xZ7Ban1x//6uUIU1xtrTbCYNHBc=", + "path": "github.com/hashicorp/go-syslog", + "revision": "42a2b573b664dbf281bd48c3cc12c086b17a39ba", + "revisionTime": "2015-02-18T18:19:46Z" + }, + { + "checksumSHA1": "mAkPa/RLuIwN53GbwIEMATexams=", + "path": "github.com/hashicorp/go-uuid", + "revision": "64130c7a86d732268a38cb04cfbaf0cc987fda98", + "revisionTime": "2016-07-17T02:21:40Z" + }, + { + "checksumSHA1": "d9PxF1XQGLMJZRct2R8qVM/eYlE=", + "path": "github.com/hashicorp/golang-lru", + "revision": "a0d98a5f288019575c6d1f4bb1573fef2d1fcdc4", + "revisionTime": "2016-02-07T21:47:19Z" + }, + { + "checksumSHA1": "2nOpYjx8Sn57bqlZq17yM4YJuM4=", + "path": "github.com/hashicorp/golang-lru/simplelru", + "revision": "a0d98a5f288019575c6d1f4bb1573fef2d1fcdc4", + "revisionTime": "2016-02-07T21:47:19Z" + }, + { + "checksumSHA1": "ydHBPi04mEh+Tir+2JkpSIMckcw=", + 
"path": "github.com/hashicorp/hcl", + "revision": "d8c773c4cba11b11539e3d45f93daeaa5dcf1fa1", + "revisionTime": "2016-07-11T23:17:52Z" + }, + { + "checksumSHA1": "IxyvRpCFeoJBGl2obLKJV7RCGjg=", + "path": "github.com/hashicorp/hcl/hcl/ast", + "revision": "d8c773c4cba11b11539e3d45f93daeaa5dcf1fa1", + "revisionTime": "2016-07-11T23:17:52Z" + }, + { + "checksumSHA1": "l2oQxBsZRwn6eZjf+whXr8c9+8c=", + "path": "github.com/hashicorp/hcl/hcl/parser", + "revision": "d8c773c4cba11b11539e3d45f93daeaa5dcf1fa1", + "revisionTime": "2016-07-11T23:17:52Z" + }, + { + "checksumSHA1": "vjhDQVlgHhdxml1V8/cj0vOe+j8=", + "path": "github.com/hashicorp/hcl/hcl/scanner", + "revision": "d8c773c4cba11b11539e3d45f93daeaa5dcf1fa1", + "revisionTime": "2016-07-11T23:17:52Z" + }, + { + "checksumSHA1": "JlZmnzqdmFFyb1+2afLyR3BOE/8=", + "path": "github.com/hashicorp/hcl/hcl/strconv", + "revision": "d8c773c4cba11b11539e3d45f93daeaa5dcf1fa1", + "revisionTime": "2016-07-11T23:17:52Z" + }, + { + "checksumSHA1": "c6yprzj06ASwCo18TtbbNNBHljA=", + "path": "github.com/hashicorp/hcl/hcl/token", + "revision": "d8c773c4cba11b11539e3d45f93daeaa5dcf1fa1", + "revisionTime": "2016-07-11T23:17:52Z" + }, + { + "checksumSHA1": "jQ45CCc1ed/nlV7bbSnx6z72q1M=", + "path": "github.com/hashicorp/hcl/json/parser", + "revision": "d8c773c4cba11b11539e3d45f93daeaa5dcf1fa1", + "revisionTime": "2016-07-11T23:17:52Z" + }, + { + "checksumSHA1": "S1e0F9ZKSnqgOLfjDTYazRL28tA=", + "path": "github.com/hashicorp/hcl/json/scanner", + "revision": "d8c773c4cba11b11539e3d45f93daeaa5dcf1fa1", + "revisionTime": "2016-07-11T23:17:52Z" + }, + { + "checksumSHA1": "fNlXQCQEnb+B3k5UDL/r15xtSJY=", + "path": "github.com/hashicorp/hcl/json/token", + "revision": "d8c773c4cba11b11539e3d45f93daeaa5dcf1fa1", + "revisionTime": "2016-07-11T23:17:52Z" + }, + { + "checksumSHA1": "kqCMCHy2b+RBMKC+ER+OPqp8C3E=", + "path": "github.com/hashicorp/hil", + "revision": "1e86c6b523c55d1fa6c6e930ce80b548664c95c2", + "revisionTime": "2016-07-11T23:18:37Z" + }, + { + 
"checksumSHA1": "UICubs001+Q4MsUf9zl2vcMzWQQ=", + "path": "github.com/hashicorp/hil/ast", + "revision": "1e86c6b523c55d1fa6c6e930ce80b548664c95c2", + "revisionTime": "2016-07-11T23:18:37Z" + }, + { + "checksumSHA1": "vt+P9D2yWDO3gdvdgCzwqunlhxU=", + "path": "github.com/hashicorp/logutils", + "revision": "0dc08b1671f34c4250ce212759ebd880f743d883", + "revisionTime": "2015-06-09T07:04:31Z" + }, + { + "checksumSHA1": "AY1/cRsuWpoJMG0J821TqFo9nDE=", + "path": "github.com/hashicorp/memberlist", + "revision": "0c5ba075f8520c65572f001331a1a43b756e01d7", + "revisionTime": "2016-08-12T18:27:57Z" + }, + { + "checksumSHA1": "qnlqWJYV81ENr61SZk9c65R1mDo=", + "path": "github.com/hashicorp/net-rpc-msgpackrpc", + "revision": "a14192a58a694c123d8fe5481d4a4727d6ae82f3", + "revisionTime": "2015-11-16T02:03:38Z" + }, + { + "checksumSHA1": "zMgiTV0dfJIQNRCJDF50bLomDvg=", + "path": "github.com/hashicorp/raft", + "revision": "c69c15dd73b6695ba75b3502ce6b332cc0042c83", + "revisionTime": "2016-08-01T21:27:18Z" + }, + { + "checksumSHA1": "QAxukkv54/iIvLfsUP6IK4R0m/A=", + "path": "github.com/hashicorp/raft-boltdb", + "revision": "d1e82c1ec3f15ee991f7cc7ffd5b67ff6f5bbaee", + "revisionTime": "2015-02-01T20:08:39Z" + }, + { + "checksumSHA1": "u9qHbpIgMZ7/fjO0gFfds2m/1ck=", + "path": "github.com/hashicorp/scada-client", + "revision": "6e896784f66f82cdc6f17e00052db91699dc277d", + "revisionTime": "2016-06-01T22:40:23Z" + }, + { + "checksumSHA1": "fv3nX1vDZViW0tA7Aa5Va2lBUtM=", + "path": "github.com/hashicorp/scada-client/scada", + "revision": "6e896784f66f82cdc6f17e00052db91699dc277d", + "revisionTime": "2016-06-01T22:40:23Z" + }, + { + "checksumSHA1": "E3Xcanc9ouQwL+CZGOUyA/+giLg=", + "comment": "v0.7.0-66-g6c4672d", + "path": "github.com/hashicorp/serf/coordinate", + "revision": "114430d8210835d66defdc31cdc176c58e060005", + "revisionTime": "2016-08-09T01:42:04Z" + }, + { + "checksumSHA1": "vLyudzMEdik8IpRY1H2vRa2PeLU=", + "comment": "v0.7.0-66-g6c4672d", + "path": 
"github.com/hashicorp/serf/serf", + "revision": "114430d8210835d66defdc31cdc176c58e060005", + "revisionTime": "2016-08-09T01:42:04Z" + }, + { + "checksumSHA1": "ZhK6IO2XN81Y+3RAjTcVm1Ic7oU=", + "path": "github.com/hashicorp/yamux", + "revision": "d1caa6c97c9fc1cc9e83bbe34d0603f9ff0ce8bd", + "revisionTime": "2016-07-20T23:31:40Z" + }, + { + "checksumSHA1": "xZuhljnmBysJPta/lMyYmJdujCg=", + "path": "github.com/mattn/go-isatty", + "revision": "66b8e73f3f5cda9f96b69efd03dd3d7fc4a5cdb8", + "revisionTime": "2016-08-06T12:27:52Z" + }, + { + "checksumSHA1": "OUZ1FFXyKs+Cfg9M9rmXqqweQck=", + "path": "github.com/miekg/dns", + "revision": "db96a2b759cdef4f11a34506a42eb8d1290c598e", + "revisionTime": "2016-07-26T03:20:27Z" + }, + { + "checksumSHA1": "yF39M9MGatDbq2d2oqlLy44jsRc=", + "path": "github.com/mitchellh/cli", + "revision": "168daae10d6ff81b8b1201b0a4c9607d7e9b82e3", + "revisionTime": "2016-03-23T17:07:00Z" + }, + { + "checksumSHA1": "86nE93o1VIND0Doe8PuhCXnhUx0=", + "path": "github.com/mitchellh/copystructure", + "revision": "cdac8253d00f2ecf0a0b19fbff173a9a72de4f82", + "revisionTime": "2016-08-04T03:23:30Z" + }, + { + "checksumSHA1": "LUrnGREfnifW4WDMaavmc9MlLI0=", + "path": "github.com/mitchellh/mapstructure", + "revision": "ca63d7c062ee3c9f34db231e352b60012b4fd0c1", + "revisionTime": "2016-08-08T18:12:53Z" + }, + { + "checksumSHA1": "mrqMlK6gqe//WsJSrJ1HgkPM0lM=", + "path": "github.com/mitchellh/reflectwalk", + "revision": "eecf4c70c626c7cfbb95c90195bc34d386c74ac6", + "revisionTime": "2015-05-27T15:31:53Z" + }, + { + "checksumSHA1": "3AoPMXlmVq2+iWMpsdJZkcUKHB8=", + "path": "github.com/opencontainers/runc/libcontainer/user", + "revision": "0f764571384a3ff16c6fed25ace5b7c83f0f0379", + "revisionTime": "2016-08-09T12:22:04Z" + }, + { + "checksumSHA1": "ExnVEVNT8APpFTm26cUb5T09yR4=", + "comment": "v2.0.1-8-g983d3a5", + "path": "github.com/ryanuber/columnize", + "revision": "9b3edd62028f107d7cabb19353292afd29311a4e", + "revisionTime": "2016-07-12T16:32:29Z" + }, + { + 
"checksumSHA1": "9jjO5GjLa0XF/nfWihF02RoH4qc=", + "path": "golang.org/x/net/context", + "revision": "075e191f18186a8ff2becaf64478e30f4545cdad", + "revisionTime": "2016-08-05T06:12:51Z" + }, + { + "checksumSHA1": "WHc3uByvGaMcnSoI21fhzYgbOgg=", + "path": "golang.org/x/net/context/ctxhttp", + "revision": "075e191f18186a8ff2becaf64478e30f4545cdad", + "revisionTime": "2016-08-05T06:12:51Z" + }, + { + "checksumSHA1": "15doxxBfOxOhWExkxjPNo6Y7fEw=", + "path": "golang.org/x/sys/unix", + "revision": "20457ee8ea8546920d3f4e19e405da45250dc5a5", + "revisionTime": "2016-01-20T16:03:39Z" + } + ], + "rootPath": "github.com/hashicorp/consul" +} diff --git a/vendor/github.com/hashicorp/golang-lru/2q.go b/vendor/github.com/hashicorp/golang-lru/2q.go new file mode 100644 index 00000000000..337d963296c --- /dev/null +++ b/vendor/github.com/hashicorp/golang-lru/2q.go @@ -0,0 +1,212 @@ +package lru + +import ( + "fmt" + "sync" + + "github.com/hashicorp/golang-lru/simplelru" +) + +const ( + // Default2QRecentRatio is the ratio of the 2Q cache dedicated + // to recently added entries that have only been accessed once. + Default2QRecentRatio = 0.25 + + // Default2QGhostEntries is the default ratio of ghost + // entries kept to track entries recently evicted + Default2QGhostEntries = 0.50 +) + +// TwoQueueCache is a thread-safe fixed size 2Q cache. +// 2Q is an enhancement over the standard LRU cache +// in that it tracks both frequently and recently used +// entries separately. This avoids a burst in access to new +// entries from evicting frequently used entries. It adds some +// additional tracking overhead to the standard LRU cache, and is +// computationally about 2x the cost, and adds some metadata over +// head. The ARCCache is similar, but does not require setting any +// parameters. 
+type TwoQueueCache struct { + size int + recentSize int + + recent *simplelru.LRU + frequent *simplelru.LRU + recentEvict *simplelru.LRU + lock sync.RWMutex +} + +// New2Q creates a new TwoQueueCache using the default +// values for the parameters. +func New2Q(size int) (*TwoQueueCache, error) { + return New2QParams(size, Default2QRecentRatio, Default2QGhostEntries) +} + +// New2QParams creates a new TwoQueueCache using the provided +// parameter values. +func New2QParams(size int, recentRatio float64, ghostRatio float64) (*TwoQueueCache, error) { + if size <= 0 { + return nil, fmt.Errorf("invalid size") + } + if recentRatio < 0.0 || recentRatio > 1.0 { + return nil, fmt.Errorf("invalid recent ratio") + } + if ghostRatio < 0.0 || ghostRatio > 1.0 { + return nil, fmt.Errorf("invalid ghost ratio") + } + + // Determine the sub-sizes + recentSize := int(float64(size) * recentRatio) + evictSize := int(float64(size) * ghostRatio) + + // Allocate the LRUs + recent, err := simplelru.NewLRU(size, nil) + if err != nil { + return nil, err + } + frequent, err := simplelru.NewLRU(size, nil) + if err != nil { + return nil, err + } + recentEvict, err := simplelru.NewLRU(evictSize, nil) + if err != nil { + return nil, err + } + + // Initialize the cache + c := &TwoQueueCache{ + size: size, + recentSize: recentSize, + recent: recent, + frequent: frequent, + recentEvict: recentEvict, + } + return c, nil +} + +func (c *TwoQueueCache) Get(key interface{}) (interface{}, bool) { + c.lock.Lock() + defer c.lock.Unlock() + + // Check if this is a frequent value + if val, ok := c.frequent.Get(key); ok { + return val, ok + } + + // If the value is contained in recent, then we + // promote it to frequent + if val, ok := c.recent.Peek(key); ok { + c.recent.Remove(key) + c.frequent.Add(key, val) + return val, ok + } + + // No hit + return nil, false +} + +func (c *TwoQueueCache) Add(key, value interface{}) { + c.lock.Lock() + defer c.lock.Unlock() + + // Check if the value is frequently used 
already, + // and just update the value + if c.frequent.Contains(key) { + c.frequent.Add(key, value) + return + } + + // Check if the value is recently used, and promote + // the value into the frequent list + if c.recent.Contains(key) { + c.recent.Remove(key) + c.frequent.Add(key, value) + return + } + + // If the value was recently evicted, add it to the + // frequently used list + if c.recentEvict.Contains(key) { + c.ensureSpace(true) + c.recentEvict.Remove(key) + c.frequent.Add(key, value) + return + } + + // Add to the recently seen list + c.ensureSpace(false) + c.recent.Add(key, value) + return +} + +// ensureSpace is used to ensure we have space in the cache +func (c *TwoQueueCache) ensureSpace(recentEvict bool) { + // If we have space, nothing to do + recentLen := c.recent.Len() + freqLen := c.frequent.Len() + if recentLen+freqLen < c.size { + return + } + + // If the recent buffer is larger than + // the target, evict from there + if recentLen > 0 && (recentLen > c.recentSize || (recentLen == c.recentSize && !recentEvict)) { + k, _, _ := c.recent.RemoveOldest() + c.recentEvict.Add(k, nil) + return + } + + // Remove from the frequent list otherwise + c.frequent.RemoveOldest() +} + +func (c *TwoQueueCache) Len() int { + c.lock.RLock() + defer c.lock.RUnlock() + return c.recent.Len() + c.frequent.Len() +} + +func (c *TwoQueueCache) Keys() []interface{} { + c.lock.RLock() + defer c.lock.RUnlock() + k1 := c.frequent.Keys() + k2 := c.recent.Keys() + return append(k1, k2...) 
+} + +func (c *TwoQueueCache) Remove(key interface{}) { + c.lock.Lock() + defer c.lock.Unlock() + if c.frequent.Remove(key) { + return + } + if c.recent.Remove(key) { + return + } + if c.recentEvict.Remove(key) { + return + } +} + +func (c *TwoQueueCache) Purge() { + c.lock.Lock() + defer c.lock.Unlock() + c.recent.Purge() + c.frequent.Purge() + c.recentEvict.Purge() +} + +func (c *TwoQueueCache) Contains(key interface{}) bool { + c.lock.RLock() + defer c.lock.RUnlock() + return c.frequent.Contains(key) || c.recent.Contains(key) +} + +func (c *TwoQueueCache) Peek(key interface{}) (interface{}, bool) { + c.lock.RLock() + defer c.lock.RUnlock() + if val, ok := c.frequent.Peek(key); ok { + return val, ok + } + return c.recent.Peek(key) +} diff --git a/vendor/github.com/hashicorp/golang-lru/README.md b/vendor/github.com/hashicorp/golang-lru/README.md new file mode 100644 index 00000000000..33e58cfaf97 --- /dev/null +++ b/vendor/github.com/hashicorp/golang-lru/README.md @@ -0,0 +1,25 @@ +golang-lru +========== + +This provides the `lru` package which implements a fixed-size +thread safe LRU cache. It is based on the cache in Groupcache. + +Documentation +============= + +Full docs are available on [Godoc](http://godoc.org/github.com/hashicorp/golang-lru) + +Example +======= + +Using the LRU is very simple: + +```go +l, _ := New(128) +for i := 0; i < 256; i++ { + l.Add(i, nil) +} +if l.Len() != 128 { + panic(fmt.Sprintf("bad len: %v", l.Len())) +} +``` diff --git a/vendor/github.com/hashicorp/golang-lru/arc.go b/vendor/github.com/hashicorp/golang-lru/arc.go new file mode 100644 index 00000000000..a2a25281733 --- /dev/null +++ b/vendor/github.com/hashicorp/golang-lru/arc.go @@ -0,0 +1,257 @@ +package lru + +import ( + "sync" + + "github.com/hashicorp/golang-lru/simplelru" +) + +// ARCCache is a thread-safe fixed size Adaptive Replacement Cache (ARC). +// ARC is an enhancement over the standard LRU cache in that tracks both +// frequency and recency of use. 
This avoids a burst in access to new +// entries from evicting the frequently used older entries. It adds some +// additional tracking overhead to a standard LRU cache, computationally +// it is roughly 2x the cost, and the extra memory overhead is linear +// with the size of the cache. ARC has been patented by IBM, but is +// similar to the TwoQueueCache (2Q) which requires setting parameters. +type ARCCache struct { + size int // Size is the total capacity of the cache + p int // P is the dynamic preference towards T1 or T2 + + t1 *simplelru.LRU // T1 is the LRU for recently accessed items + b1 *simplelru.LRU // B1 is the LRU for evictions from t1 + + t2 *simplelru.LRU // T2 is the LRU for frequently accessed items + b2 *simplelru.LRU // B2 is the LRU for evictions from t2 + + lock sync.RWMutex +} + +// NewARC creates an ARC of the given size +func NewARC(size int) (*ARCCache, error) { + // Create the sub LRUs + b1, err := simplelru.NewLRU(size, nil) + if err != nil { + return nil, err + } + b2, err := simplelru.NewLRU(size, nil) + if err != nil { + return nil, err + } + t1, err := simplelru.NewLRU(size, nil) + if err != nil { + return nil, err + } + t2, err := simplelru.NewLRU(size, nil) + if err != nil { + return nil, err + } + + // Initialize the ARC + c := &ARCCache{ + size: size, + p: 0, + t1: t1, + b1: b1, + t2: t2, + b2: b2, + } + return c, nil +} + +// Get looks up a key's value from the cache. +func (c *ARCCache) Get(key interface{}) (interface{}, bool) { + c.lock.Lock() + defer c.lock.Unlock() + + // If the value is contained in T1 (recent), then + // promote it to T2 (frequent) + if val, ok := c.t1.Peek(key); ok { + c.t1.Remove(key) + c.t2.Add(key, val) + return val, ok + } + + // Check if the value is contained in T2 (frequent) + if val, ok := c.t2.Get(key); ok { + return val, ok + } + + // No hit + return nil, false +} + +// Add adds a value to the cache. 
+func (c *ARCCache) Add(key, value interface{}) { + c.lock.Lock() + defer c.lock.Unlock() + + // Check if the value is contained in T1 (recent), and potentially + // promote it to frequent T2 + if c.t1.Contains(key) { + c.t1.Remove(key) + c.t2.Add(key, value) + return + } + + // Check if the value is already in T2 (frequent) and update it + if c.t2.Contains(key) { + c.t2.Add(key, value) + return + } + + // Check if this value was recently evicted as part of the + // recently used list + if c.b1.Contains(key) { + // T1 set is too small, increase P appropriately + delta := 1 + b1Len := c.b1.Len() + b2Len := c.b2.Len() + if b2Len > b1Len { + delta = b2Len / b1Len + } + if c.p+delta >= c.size { + c.p = c.size + } else { + c.p += delta + } + + // Potentially need to make room in the cache + if c.t1.Len()+c.t2.Len() >= c.size { + c.replace(false) + } + + // Remove from B1 + c.b1.Remove(key) + + // Add the key to the frequently used list + c.t2.Add(key, value) + return + } + + // Check if this value was recently evicted as part of the + // frequently used list + if c.b2.Contains(key) { + // T2 set is too small, decrease P appropriately + delta := 1 + b1Len := c.b1.Len() + b2Len := c.b2.Len() + if b1Len > b2Len { + delta = b1Len / b2Len + } + if delta >= c.p { + c.p = 0 + } else { + c.p -= delta + } + + // Potentially need to make room in the cache + if c.t1.Len()+c.t2.Len() >= c.size { + c.replace(true) + } + + // Remove from B2 + c.b2.Remove(key) + + // Add the key to the frequently used list + c.t2.Add(key, value) + return + } + + // Potentially need to make room in the cache + if c.t1.Len()+c.t2.Len() >= c.size { + c.replace(false) + } + + // Keep the size of the ghost buffers trim + if c.b1.Len() > c.size-c.p { + c.b1.RemoveOldest() + } + if c.b2.Len() > c.p { + c.b2.RemoveOldest() + } + + // Add to the recently seen list + c.t1.Add(key, value) + return +} + +// replace is used to adaptively evict from either T1 or T2 +// based on the current learned value of P +func 
(c *ARCCache) replace(b2ContainsKey bool) { + t1Len := c.t1.Len() + if t1Len > 0 && (t1Len > c.p || (t1Len == c.p && b2ContainsKey)) { + k, _, ok := c.t1.RemoveOldest() + if ok { + c.b1.Add(k, nil) + } + } else { + k, _, ok := c.t2.RemoveOldest() + if ok { + c.b2.Add(k, nil) + } + } +} + +// Len returns the number of cached entries +func (c *ARCCache) Len() int { + c.lock.RLock() + defer c.lock.RUnlock() + return c.t1.Len() + c.t2.Len() +} + +// Keys returns all the cached keys +func (c *ARCCache) Keys() []interface{} { + c.lock.RLock() + defer c.lock.RUnlock() + k1 := c.t1.Keys() + k2 := c.t2.Keys() + return append(k1, k2...) +} + +// Remove is used to purge a key from the cache +func (c *ARCCache) Remove(key interface{}) { + c.lock.Lock() + defer c.lock.Unlock() + if c.t1.Remove(key) { + return + } + if c.t2.Remove(key) { + return + } + if c.b1.Remove(key) { + return + } + if c.b2.Remove(key) { + return + } +} + +// Purge is used to clear the cache +func (c *ARCCache) Purge() { + c.lock.Lock() + defer c.lock.Unlock() + c.t1.Purge() + c.t2.Purge() + c.b1.Purge() + c.b2.Purge() +} + +// Contains is used to check if the cache contains a key +// without updating recency or frequency. +func (c *ARCCache) Contains(key interface{}) bool { + c.lock.RLock() + defer c.lock.RUnlock() + return c.t1.Contains(key) || c.t2.Contains(key) +} + +// Peek is used to inspect the cache value of a key +// without updating recency or frequency. +func (c *ARCCache) Peek(key interface{}) (interface{}, bool) { + c.lock.RLock() + defer c.lock.RUnlock() + if val, ok := c.t1.Peek(key); ok { + return val, ok + } + return c.t2.Peek(key) +} diff --git a/vendor/github.com/hashicorp/golang-lru/lru.go b/vendor/github.com/hashicorp/golang-lru/lru.go new file mode 100644 index 00000000000..a6285f989e0 --- /dev/null +++ b/vendor/github.com/hashicorp/golang-lru/lru.go @@ -0,0 +1,114 @@ +// This package provides a simple LRU cache. 
It is based on the +// LRU implementation in groupcache: +// https://github.com/golang/groupcache/tree/master/lru +package lru + +import ( + "sync" + + "github.com/hashicorp/golang-lru/simplelru" +) + +// Cache is a thread-safe fixed size LRU cache. +type Cache struct { + lru *simplelru.LRU + lock sync.RWMutex +} + +// New creates an LRU of the given size +func New(size int) (*Cache, error) { + return NewWithEvict(size, nil) +} + +// NewWithEvict constructs a fixed size cache with the given eviction +// callback. +func NewWithEvict(size int, onEvicted func(key interface{}, value interface{})) (*Cache, error) { + lru, err := simplelru.NewLRU(size, simplelru.EvictCallback(onEvicted)) + if err != nil { + return nil, err + } + c := &Cache{ + lru: lru, + } + return c, nil +} + +// Purge is used to completely clear the cache +func (c *Cache) Purge() { + c.lock.Lock() + c.lru.Purge() + c.lock.Unlock() +} + +// Add adds a value to the cache. Returns true if an eviction occurred. +func (c *Cache) Add(key, value interface{}) bool { + c.lock.Lock() + defer c.lock.Unlock() + return c.lru.Add(key, value) +} + +// Get looks up a key's value from the cache. +func (c *Cache) Get(key interface{}) (interface{}, bool) { + c.lock.Lock() + defer c.lock.Unlock() + return c.lru.Get(key) +} + +// Check if a key is in the cache, without updating the recent-ness +// or deleting it for being stale. +func (c *Cache) Contains(key interface{}) bool { + c.lock.RLock() + defer c.lock.RUnlock() + return c.lru.Contains(key) +} + +// Returns the key value (or undefined if not found) without updating +// the "recently used"-ness of the key. +func (c *Cache) Peek(key interface{}) (interface{}, bool) { + c.lock.RLock() + defer c.lock.RUnlock() + return c.lru.Peek(key) +} + +// ContainsOrAdd checks if a key is in the cache without updating the +// recent-ness or deleting it for being stale, and if not, adds the value. +// Returns whether found and whether an eviction occurred. 
+func (c *Cache) ContainsOrAdd(key, value interface{}) (ok, evict bool) { + c.lock.Lock() + defer c.lock.Unlock() + + if c.lru.Contains(key) { + return true, false + } else { + evict := c.lru.Add(key, value) + return false, evict + } +} + +// Remove removes the provided key from the cache. The write lock is released explicitly rather than via defer. +func (c *Cache) Remove(key interface{}) { + c.lock.Lock() + c.lru.Remove(key) + c.lock.Unlock() +} + +// RemoveOldest removes the oldest item from the cache. Nothing about the removed item is returned to the caller. +func (c *Cache) RemoveOldest() { + c.lock.Lock() + c.lru.RemoveOldest() + c.lock.Unlock() +} + +// Keys returns a slice of the keys in the cache, from oldest to newest. Only the read lock is taken. +func (c *Cache) Keys() []interface{} { + c.lock.RLock() + defer c.lock.RUnlock() + return c.lru.Keys() +} + +// Len returns the number of items in the cache. Only the read lock is taken. +func (c *Cache) Len() int { + c.lock.RLock() + defer c.lock.RUnlock() + return c.lru.Len() +} diff --git a/vendor/vendor.json b/vendor/vendor.json index adb47570d99..c75baaaddb3 100644 --- a/vendor/vendor.json +++ b/vendor/vendor.json @@ -525,14 +525,47 @@ "revisionTime": "2016-10-03T19:46:06Z" }, { + "checksumSHA1": "kWbL0V4o8vJL75mzeQzhF6p5jiQ=", + "path": "github.com/hashicorp/consul/acl", + "revision": "a189091a3530051285c12c726ca28ea55e015336", + "revisionTime": "2016-09-14T16:11:34Z" + }, + { + "checksumSHA1": "BMEJLBjl91k5k3vMMzzT7G2SO1U=", "comment": "v0.6.3-363-gae32a3c", "path": "github.com/hashicorp/consul/api", - "revision": "ae32a3ceae9fddb431b933ed7b2a82110e41e1bf" + "revision": "a189091a3530051285c12c726ca28ea55e015336", + "revisionTime": "2016-09-14T16:11:34Z" + }, + { + "checksumSHA1": "NrK9uDGSZ2WKMNLYicxDYmpRS3I=", + "path": "github.com/hashicorp/consul/consul/structs", + "revision": "a189091a3530051285c12c726ca28ea55e015336", + "revisionTime": "2016-09-14T16:11:34Z" + }, + { + "checksumSHA1": 
"0DPAA2cTBjrCGgXaxXil0vILcFs=", + "path": "github.com/hashicorp/consul/lib", + "revision": "a189091a3530051285c12c726ca28ea55e015336", + "revisionTime": "2016-09-14T16:11:34Z" + }, + { +
"checksumSHA1": "BuOGVqQNO+iuqoGKAGGxCxC51ro=", + "path": "github.com/hashicorp/consul/testutil", + "revision": "a189091a3530051285c12c726ca28ea55e015336", + "revisionTime": "2016-09-14T16:11:34Z" }, { "comment": "v0.6.3-363-gae32a3c", "path": "github.com/hashicorp/consul/tlsutil", - "revision": "ae32a3ceae9fddb431b933ed7b2a82110e41e1bf" + "revision": "a189091a3530051285c12c726ca28ea55e015336", + "revisionTime": "2016-09-14T16:11:34Z" + }, + { + "checksumSHA1": "oQBVnohHMtF/4HuDCYKkhy+ijZ4=", + "path": "github.com/hashicorp/consul/types", + "revision": "a189091a3530051285c12c726ca28ea55e015336", + "revisionTime": "2016-09-14T16:11:34Z" }, { "path": "github.com/hashicorp/errwrap", @@ -600,6 +633,12 @@ "path": "github.com/hashicorp/go-version", "revision": "2e7f5ea8e27bb3fdf9baa0881d16757ac4637332" }, + { + "checksumSHA1": "d9PxF1XQGLMJZRct2R8qVM/eYlE=", + "path": "github.com/hashicorp/golang-lru", + "revision": "a0d98a5f288019575c6d1f4bb1573fef2d1fcdc4", + "revisionTime": "2016-02-07T21:47:19Z" + }, { "path": "github.com/hashicorp/golang-lru/simplelru", "revision": "a0d98a5f288019575c6d1f4bb1573fef2d1fcdc4"