Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix flakes with rpc integration test #1860

Merged
merged 5 commits into from
Apr 1, 2019
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
108 changes: 81 additions & 27 deletions integration/rpc_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,9 +37,10 @@ import (
)

var (
retries = 20
numLogEntries = 5
waitTime = 1 * time.Second
connectionRetries = 20
stateRetries = 5
numLogEntries = 5
waitTime = 1 * time.Second
)

func TestEventLogRPC(t *testing.T) {
Expand All @@ -61,7 +62,7 @@ func TestEventLogRPC(t *testing.T) {
if err != nil {
t.Logf("unable to establish skaffold grpc connection: retrying...")
attempts++
if attempts == retries {
if attempts == connectionRetries {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why do we need to re-try here? Is it because, the --rpc-port which the we ended up picking is not free any because some other test or CI job grabbed it?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this would happen if skaffold hadn't started up the RPC server yet. it's pretty unlikely that we would ever actually need to retry here, I had this in the original client I wrote so I figured I would throw it in here for robustness.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The number of connection re-tries is 20 which scares me. If there is a real issue, it would be masked. I am ok with a smaller number of re-tries here like 2.
Another idea: we should log and collect metrics when retries happen. Can help us determine which skaffold code should we stabilize.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I created one here #1880

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

HAHA yeah that does not need to be 20....I think I meant to set it to 2 originally. I'll change it here, and thanks for opening that issue

t.Fatalf("error establishing skaffold grpc connection")
}

Expand All @@ -84,7 +85,7 @@ func TestEventLogRPC(t *testing.T) {
if err == nil {
break
}
if attempts < retries {
if attempts < connectionRetries {
attempts++
t.Logf("waiting for connection...")
time.Sleep(3 * time.Second)
Expand Down Expand Up @@ -213,7 +214,7 @@ func TestGetStateRPC(t *testing.T) {
if err != nil {
t.Logf("unable to establish skaffold grpc connection: retrying...")
attempts++
if attempts == retries {
if attempts == connectionRetries {
t.Fatalf("error establishing skaffold grpc connection")
}

Expand All @@ -229,38 +230,58 @@ func TestGetStateRPC(t *testing.T) {
ctx, ctxCancel := context.WithCancel(context.Background())
defer ctxCancel()

// retrieve the state and make sure everything looks correct
var grpcState *proto.State
// try a few times and wait around until we see the build is complete, or fail.
attempts = 0
outerLoop:
nkubala marked this conversation as resolved.
Show resolved Hide resolved
for {
grpcState := retrieveRPCState(ctx, t, client)
if attempts == stateRetries {
// fail condition has been reached: abbreviated way of
// only logging the artifacts that didn't reach completion.
for _, v := range grpcState.BuildState.Artifacts {
testutil.CheckDeepEqual(t, event.Complete, v)
}
testutil.CheckDeepEqual(t, event.Complete, grpcState.DeployState.Status)
break
}
for _, v := range grpcState.BuildState.Artifacts {
if v != event.Complete {
time.Sleep(500 * time.Millisecond)
attempts++
continue outerLoop
}
}
if grpcState.DeployState.Status != event.Complete {
time.Sleep(500 * time.Millisecond)
attempts++
continue outerLoop
}
break
}
}

func retrieveRPCState(ctx context.Context, t *testing.T, client proto.SkaffoldServiceClient) *proto.State {
var grpcState *proto.State
var err error
attempts := 0
for {
nkubala marked this conversation as resolved.
Show resolved Hide resolved
grpcState, err = client.GetState(ctx, &empty.Empty{})
if err == nil {
break
}
if attempts < retries {
if attempts < connectionRetries {
attempts++
t.Logf("waiting for connection...")
time.Sleep(3 * time.Second)
continue
}
t.Fatalf("error retrieving state: %v\n", err)
}

for _, v := range grpcState.BuildState.Artifacts {
testutil.CheckDeepEqual(t, event.Complete, v)
}
testutil.CheckDeepEqual(t, event.Complete, grpcState.DeployState.Status)
return grpcState
}

func TestGetStateHTTP(t *testing.T) {
if testing.Short() {
t.Skip("skipping integration test")
}

httpAddr := randomPort()
teardown := setupSkaffoldWithArgs(t, "--rpc-http-port", httpAddr)
defer teardown()
time.Sleep(3 * time.Second) // give skaffold time to process all events
func retrieveHTTPState(t *testing.T, httpAddr string) proto.State {
var httpState proto.State

// retrieve the state via HTTP as well, and verify the result is the same
httpResponse, err := http.Get(fmt.Sprintf("http://localhost:%s/v1/state", httpAddr))
Expand All @@ -271,14 +292,47 @@ func TestGetStateHTTP(t *testing.T) {
if err != nil {
t.Errorf("error reading body from http response: %s", err.Error())
}
var httpState proto.State
if err := json.Unmarshal(b, &httpState); err != nil {
t.Errorf("error converting http response to proto: %s", err.Error())
}
for _, v := range httpState.BuildState.Artifacts {
testutil.CheckDeepEqual(t, event.Complete, v)
return httpState
}

func TestGetStateHTTP(t *testing.T) {
if testing.Short() {
t.Skip("skipping integration test")
}

httpAddr := randomPort()
teardown := setupSkaffoldWithArgs(t, "--rpc-http-port", httpAddr)
defer teardown()
time.Sleep(3 * time.Second) // give skaffold time to process all events

attempts := 0
outerLoop:
for {
httpState := retrieveHTTPState(t, httpAddr)
if attempts == stateRetries {
for _, v := range httpState.BuildState.Artifacts {
testutil.CheckDeepEqual(t, event.Complete, v)
}
testutil.CheckDeepEqual(t, event.Complete, httpState.DeployState.Status)
break
}
for _, v := range httpState.BuildState.Artifacts {
nkubala marked this conversation as resolved.
Show resolved Hide resolved
if v != event.Complete {
time.Sleep(1 * time.Second)
attempts++
continue outerLoop
}
}
if httpState.DeployState.Status != event.Complete {
time.Sleep(1 * time.Second)
attempts++
continue outerLoop
}
break
}
testutil.CheckDeepEqual(t, event.Complete, httpState.DeployState.Status)
}

func setupSkaffoldWithArgs(t *testing.T, args ...string) func() {
Expand Down