Skip to content

Commit

Permalink
Merge pull request #14746 from hashicorp/jbardin/s3-consistency
Browse files Browse the repository at this point in the history
store and verify s3 remote state checksum to avoid consistency issues.
  • Loading branch information
jbardin authored May 24, 2017
2 parents cf1ef16 + 91be40a commit ef1d539
Show file tree
Hide file tree
Showing 2 changed files with 314 additions and 5 deletions.
168 changes: 163 additions & 5 deletions backend/remote-state/s3/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,14 @@ package s3

import (
"bytes"
"crypto/md5"
"encoding/hex"
"encoding/json"
"errors"
"fmt"
"io"
"log"
"time"

"github.com/aws/aws-sdk-go/aws"
"github.com/aws/aws-sdk-go/aws/awserr"
Expand All @@ -17,6 +21,9 @@ import (
"github.com/hashicorp/terraform/state/remote"
)

// Store the last saved serial in dynamo with this suffix for consistency checks.
const stateIDSuffix = "-md5"

type RemoteClient struct {
s3Client *s3.S3
dynClient *dynamodb.DynamoDB
Expand All @@ -28,7 +35,55 @@ type RemoteClient struct {
lockTable string
}

func (c *RemoteClient) Get() (*remote.Payload, error) {
var (
// The amount of time we will retry a state waiting for it to match the
// expected checksum.
consistencyRetryTimeout = 10 * time.Second

// delay when polling the state
consistencyRetryPollInterval = 2 * time.Second
)

// test hook called when checksums don't match
var testChecksumHook func()

func (c *RemoteClient) Get() (payload *remote.Payload, err error) {
deadline := time.Now().Add(consistencyRetryTimeout)

// If we have a checksum, and the returned payload doesn't match, we retry
// up until deadline.
for {
payload, err = c.get()
if err != nil {
return nil, err
}

// verify that this state is what we expect
if expected, err := c.getMD5(); err != nil {
log.Printf("[WARNING] failed to fetch state md5: %s", err)
} else if len(expected) > 0 && !bytes.Equal(expected, payload.MD5) {
log.Printf("[WARNING] state md5 mismatch: expected '%x', got '%x'", expected, payload.MD5)

if testChecksumHook != nil {
testChecksumHook()
}

if time.Now().Before(deadline) {
time.Sleep(consistencyRetryPollInterval)
log.Println("[INFO] retrying S3 RemoteClient.Get...")
continue
}

return nil, fmt.Errorf(errBadChecksumFmt, payload.MD5)
}

break
}

return payload, err
}

func (c *RemoteClient) get() (*remote.Payload, error) {
output, err := c.s3Client.GetObject(&s3.GetObjectInput{
Bucket: &c.bucketName,
Key: &c.path,
Expand All @@ -53,8 +108,10 @@ func (c *RemoteClient) Get() (*remote.Payload, error) {
return nil, fmt.Errorf("Failed to read remote state: %s", err)
}

sum := md5.Sum(buf.Bytes())
payload := &remote.Payload{
Data: buf.Bytes(),
MD5: sum[:],
}

// If there was no data, then return nil
Expand Down Expand Up @@ -92,11 +149,20 @@ func (c *RemoteClient) Put(data []byte) error {

log.Printf("[DEBUG] Uploading remote state to S3: %#v", i)

if _, err := c.s3Client.PutObject(i); err == nil {
return nil
} else {
_, err := c.s3Client.PutObject(i)
if err != nil {
return fmt.Errorf("Failed to upload state: %v", err)
}

sum := md5.Sum(data)
if err := c.putMD5(sum[:]); err != nil {
// if this errors out, we unfortunately have to error out altogether,
// since the next Get will inevitably fail.
return fmt.Errorf("failed to store state MD5: %s", err)

}

return nil
}

func (c *RemoteClient) Delete() error {
Expand All @@ -105,7 +171,15 @@ func (c *RemoteClient) Delete() error {
Key: &c.path,
})

return err
if err != nil {
return err
}

if err := c.deleteMD5(); err != nil {
log.Printf("error deleting state md5: %s", err)
}

return nil
}

func (c *RemoteClient) Lock(info *state.LockInfo) (string, error) {
Expand Down Expand Up @@ -146,9 +220,84 @@ func (c *RemoteClient) Lock(info *state.LockInfo) (string, error) {
}
return "", lockErr
}

return info.ID, nil
}

func (c *RemoteClient) getMD5() ([]byte, error) {
if c.lockTable == "" {
return nil, nil
}

getParams := &dynamodb.GetItemInput{
Key: map[string]*dynamodb.AttributeValue{
"LockID": {S: aws.String(c.lockPath() + stateIDSuffix)},
},
ProjectionExpression: aws.String("LockID, Digest"),
TableName: aws.String(c.lockTable),
}

resp, err := c.dynClient.GetItem(getParams)
if err != nil {
return nil, err
}

var val string
if v, ok := resp.Item["Digest"]; ok && v.S != nil {
val = *v.S
}

sum, err := hex.DecodeString(val)
if err != nil || len(sum) != md5.Size {
return nil, errors.New("invalid md5")
}

return sum, nil
}

// store the hash of the state to that clients can check for stale state files.
func (c *RemoteClient) putMD5(sum []byte) error {
if c.lockTable == "" {
return nil
}

if len(sum) != md5.Size {
return errors.New("invalid payload md5")
}

putParams := &dynamodb.PutItemInput{
Item: map[string]*dynamodb.AttributeValue{
"LockID": {S: aws.String(c.lockPath() + stateIDSuffix)},
"Digest": {S: aws.String(hex.EncodeToString(sum))},
},
TableName: aws.String(c.lockTable),
}
_, err := c.dynClient.PutItem(putParams)
if err != nil {
log.Printf("[WARNING] failed to record state serial in dynamodb: %s", err)
}

return nil
}

// remove the hash value for a deleted state
func (c *RemoteClient) deleteMD5() error {
if c.lockTable == "" {
return nil
}

params := &dynamodb.DeleteItemInput{
Key: map[string]*dynamodb.AttributeValue{
"LockID": {S: aws.String(c.lockPath() + stateIDSuffix)},
},
TableName: aws.String(c.lockTable),
}
if _, err := c.dynClient.DeleteItem(params); err != nil {
return err
}
return nil
}

func (c *RemoteClient) getLockInfo() (*state.LockInfo, error) {
getParams := &dynamodb.GetItemInput{
Key: map[string]*dynamodb.AttributeValue{
Expand Down Expand Up @@ -217,3 +366,12 @@ func (c *RemoteClient) Unlock(id string) error {
func (c *RemoteClient) lockPath() string {
return fmt.Sprintf("%s/%s", c.bucketName, c.path)
}

const errBadChecksumFmt = `state data in S3 does not have the expected content.
This may be caused by unusually long delays in S3 processing a previous state
update. Please wait for a minute or two and try again. If this problem
persists, and neither S3 nor DynamoDB are experiencing an outage, you may need
to manually verify the remote state and update the Digest value stored in the
DynamoDB table to the following value: %x
`
Loading

0 comments on commit ef1d539

Please sign in to comment.