Skip to content

Commit

Permalink
Allow GKE node pool to resume create after interruption (#4501) (#8492)
Browse files Browse the repository at this point in the history
Signed-off-by: Modular Magician <[email protected]>
  • Loading branch information
modular-magician authored Feb 17, 2021
1 parent 6a17c84 commit 479f25f
Show file tree
Hide file tree
Showing 2 changed files with 36 additions and 0 deletions.
3 changes: 3 additions & 0 deletions .changelog/4501.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
```release-note:enhancement
container: Added graceful termination to `google_container_node_pool` create calls so that partially created node pools will resume the original operation if the Terraform process is killed mid-create.
```
33 changes: 33 additions & 0 deletions google/resource_container_node_pool.go
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,10 @@ func resourceContainerNodePool() *schema.Resource {
ForceNew: true,
Description: `The location (region or zone) of the cluster.`,
},
"operation": {
Type: schema.TypeString,
Computed: true,
},
}),
}
}
Expand Down Expand Up @@ -315,6 +319,20 @@ func resourceContainerNodePoolCreate(d *schema.ResourceData, meta interface{}) e
nodePoolInfo.location, "creating GKE NodePool", userAgent, timeout)

if waitErr != nil {
// Check whether the create operation failed because Terraform was terminated prematurely. If it was, persist the
// operation ID to state so that a subsequent refresh of this resource waits until the operation has finished
// before attempting to Read the state of the node pool. This allows graceful resumption of a Create that was
// interrupted by the upstream Terraform process exiting early, e.g. on SIGTERM.
select {
case <-config.context.Done():
log.Printf("[DEBUG] Persisting %s so this operation can be resumed \n", operation.Name)
if err := d.Set("operation", operation.Name); err != nil {
return fmt.Errorf("Error setting operation: %s", err)
}
return nil
default:
// The empty default case keeps this select non-blocking when the context has not been cancelled.
}
// The resource didn't actually create
d.SetId("")
return waitErr
Expand Down Expand Up @@ -356,6 +374,21 @@ func resourceContainerNodePoolRead(d *schema.ResourceData, meta interface{}) err
return err
}

operation := d.Get("operation").(string)
if operation != "" {
log.Printf("[DEBUG] in progress operation detected at %v, attempting to resume", operation)
op := &containerBeta.Operation{
Name: operation,
}
if err := d.Set("operation", ""); err != nil {
return fmt.Errorf("Error setting operation: %s", err)
}
waitErr := containerOperationWait(config, op, nodePoolInfo.project, nodePoolInfo.location, "resuming GKE node pool", userAgent, d.Timeout(schema.TimeoutRead))
if waitErr != nil {
return waitErr
}
}

name := getNodePoolName(d.Id())

clusterNodePoolsGetCall := config.NewContainerBetaClient(userAgent).Projects.Locations.Clusters.NodePools.Get(nodePoolInfo.fullyQualifiedName(name))
Expand Down

0 comments on commit 479f25f

Please sign in to comment.