Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix flakes with rpc integration test #1860

Merged
merged 5 commits into from
Apr 1, 2019
Merged
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
98 changes: 72 additions & 26 deletions integration/rpc_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,9 +37,10 @@ import (
)

var (
retries = 20
numLogEntries = 5
waitTime = 1 * time.Second
connectionRetries = 20
stateRetries = 5
numLogEntries = 5
waitTime = 1 * time.Second
)

func TestEventLogRPC(t *testing.T) {
Expand All @@ -61,7 +62,7 @@ func TestEventLogRPC(t *testing.T) {
if err != nil {
t.Logf("unable to establish skaffold grpc connection: retrying...")
attempts++
if attempts == retries {
if attempts == connectionRetries {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why do we need to re-try here? Is it because, the --rpc-port which the we ended up picking is not free any because some other test or CI job grabbed it?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this would happen if skaffold hadn't started up the RPC server yet. it's pretty unlikely that we would ever actually need to retry here, I had this in the original client I wrote so I figured I would throw it in here for robustness.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The number of connection re-tries is 20 which scares me. If there is a real issue, it would be masked. I am ok with a smaller number of re-tries here like 2.
Another idea: we should log and collect metrics when retries happen. Can help us determine which skaffold code should we stabilize.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I created one here #1880

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

HAHA yeah that does not need to be 20....I think I meant to set it to 2 originally. I'll change it here, and thanks for opening that issue

t.Fatalf("error establishing skaffold grpc connection")
}

Expand All @@ -84,7 +85,7 @@ func TestEventLogRPC(t *testing.T) {
if err == nil {
break
}
if attempts < retries {
if attempts < connectionRetries {
attempts++
t.Logf("waiting for connection...")
time.Sleep(3 * time.Second)
Expand Down Expand Up @@ -213,7 +214,7 @@ func TestGetStateRPC(t *testing.T) {
if err != nil {
t.Logf("unable to establish skaffold grpc connection: retrying...")
attempts++
if attempts == retries {
if attempts == connectionRetries {
t.Fatalf("error establishing skaffold grpc connection")
}

Expand All @@ -229,27 +230,63 @@ func TestGetStateRPC(t *testing.T) {
ctx, ctxCancel := context.WithCancel(context.Background())
defer ctxCancel()

// retrieve the state and make sure everything looks correct
var grpcState *proto.State
// try a few times and wait around until we see the build is complete, or fail.
attempts = 0
success := false
var grpcState *proto.State
for i := 0; i < stateRetries; i++ {
grpcState = retrieveRPCState(ctx, t, client)
if checkState(*grpcState) {
success = true
break
}
time.Sleep(1 * time.Second)
}
if !success {
// max attempts exceeded, log errors
for _, v := range grpcState.BuildState.Artifacts {
testutil.CheckDeepEqual(t, event.Complete, v)
}
testutil.CheckDeepEqual(t, event.Complete, grpcState.DeployState.Status)
}
}

func retrieveRPCState(ctx context.Context, t *testing.T, client proto.SkaffoldServiceClient) *proto.State {
var grpcState *proto.State
var err error
attempts := 0
for {
nkubala marked this conversation as resolved.
Show resolved Hide resolved
grpcState, err = client.GetState(ctx, &empty.Empty{})
if err == nil {
break
}
if attempts < retries {
if attempts < connectionRetries {
attempts++
t.Logf("waiting for connection...")
time.Sleep(3 * time.Second)
continue
}
t.Fatalf("error retrieving state: %v\n", err)
}
return grpcState
}

func retrieveHTTPState(t *testing.T, httpAddr string) proto.State {
var httpState proto.State

for _, v := range grpcState.BuildState.Artifacts {
testutil.CheckDeepEqual(t, event.Complete, v)
// retrieve the state via HTTP as well, and verify the result is the same
httpResponse, err := http.Get(fmt.Sprintf("http://localhost:%s/v1/state", httpAddr))
if err != nil {
t.Fatalf("error connecting to gRPC REST API: %s", err.Error())
}
b, err := ioutil.ReadAll(httpResponse.Body)
if err != nil {
t.Errorf("error reading body from http response: %s", err.Error())
}
if err := json.Unmarshal(b, &httpState); err != nil {
t.Errorf("error converting http response to proto: %s", err.Error())
}
testutil.CheckDeepEqual(t, event.Complete, grpcState.DeployState.Status)
return httpState
}

func TestGetStateHTTP(t *testing.T) {
Expand All @@ -262,23 +299,23 @@ func TestGetStateHTTP(t *testing.T) {
defer teardown()
time.Sleep(3 * time.Second) // give skaffold time to process all events

// retrieve the state via HTTP as well, and verify the result is the same
httpResponse, err := http.Get(fmt.Sprintf("http://localhost:%s/v1/state", httpAddr))
if err != nil {
t.Fatalf("error connecting to gRPC REST API: %s", err.Error())
}
b, err := ioutil.ReadAll(httpResponse.Body)
if err != nil {
t.Errorf("error reading body from http response: %s", err.Error())
}
success := false
var httpState proto.State
if err := json.Unmarshal(b, &httpState); err != nil {
t.Errorf("error converting http response to proto: %s", err.Error())
for i := 0; i < stateRetries; i++ {
httpState = retrieveHTTPState(t, httpAddr)
if checkState(httpState) {
success = true
break
}
time.Sleep(1 * time.Second)
}
for _, v := range httpState.BuildState.Artifacts {
testutil.CheckDeepEqual(t, event.Complete, v)
if !success {
// max attempts exceeded, log errors
for _, v := range httpState.BuildState.Artifacts {
nkubala marked this conversation as resolved.
Show resolved Hide resolved
testutil.CheckDeepEqual(t, event.Complete, v)
}
testutil.CheckDeepEqual(t, event.Complete, httpState.DeployState.Status)
}
testutil.CheckDeepEqual(t, event.Complete, httpState.DeployState.Status)
}

func setupSkaffoldWithArgs(t *testing.T, args ...string) func() {
Expand All @@ -302,3 +339,12 @@ func setupSkaffoldWithArgs(t *testing.T, args ...string) func() {
func randomPort() string {
return fmt.Sprintf("%d", rand.Intn(65535))
}

func checkState(state proto.State) bool {
nkubala marked this conversation as resolved.
Show resolved Hide resolved
for _, a := range state.BuildState.Artifacts {
if a != event.Complete {
return false
}
}
return state.DeployState.Status == event.Complete
}