Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

RayService: zero downtime update and healthcheck HA recovery #307

Merged
merged 49 commits into master from brucez/improveHA
Jun 25, 2022
Merged
Changes from 1 commit
Commits
Show all changes
49 commits
Select commit Hold shift + click to select a range
ec97cbd
draft for ha
brucez-anyscale Jun 13, 2022
f0c7cc1
import fmt
brucez-anyscale Jun 13, 2022
c086d62
debug ingress
brucez-anyscale Jun 14, 2022
ee38f49
Draft service
brucez-anyscale Jun 14, 2022
d618910
update
brucez-anyscale Jun 14, 2022
22bd454
fix
brucez-anyscale Jun 14, 2022
9a2a08e
Update service logic
brucez-anyscale Jun 14, 2022
96631d1
update
brucez-anyscale Jun 14, 2022
30b5a52
update
brucez-anyscale Jun 14, 2022
035235b
Logs
brucez-anyscale Jun 14, 2022
82ba89b
update
brucez-anyscale Jun 14, 2022
bb89d09
debug
brucez-anyscale Jun 14, 2022
7bb1ee5
Update
brucez-anyscale Jun 14, 2022
9b75096
Update
brucez-anyscale Jun 14, 2022
8e069f1
Update
brucez-anyscale Jun 14, 2022
bb99bfc
update
brucez-anyscale Jun 14, 2022
d7591ad
Fix cluster start flaky issue
brucez-anyscale Jun 14, 2022
76b6e71
update
brucez-anyscale Jun 14, 2022
76fc2d6
Update service and ingress
brucez-anyscale Jun 14, 2022
837c362
update rbac
brucez-anyscale Jun 14, 2022
c7338a7
Draft v1
brucez-anyscale Jun 14, 2022
0f34a8f
Update
brucez-anyscale Jun 15, 2022
e10d923
address comments
brucez-anyscale Jun 15, 2022
db9c27a
Address comments and refactor codes
brucez-anyscale Jun 15, 2022
16a7786
update
brucez-anyscale Jun 15, 2022
505de3d
Fix lint issue
brucez-anyscale Jun 15, 2022
d10cef9
update
brucez-anyscale Jun 15, 2022
821207c
Fix unit tests
brucez-anyscale Jun 15, 2022
4a83e7e
goImport
brucez-anyscale Jun 16, 2022
7f8b91e
Update unit tests
brucez-anyscale Jun 16, 2022
41a6536
Implement unit tests
brucez-anyscale Jun 16, 2022
d78752f
Change preparing to pending
brucez-anyscale Jun 16, 2022
7ef90e2
goimports
brucez-anyscale Jun 16, 2022
647e505
update
brucez-anyscale Jun 16, 2022
bf20dd1
Improve the pr to show both statuses
brucez-anyscale Jun 17, 2022
b13c707
Improve the pr to show both statuses
brucez-anyscale Jun 17, 2022
3131765
update to align with latest serve status
brucez-anyscale Jun 17, 2022
9c09c34
update
brucez-anyscale Jun 18, 2022
07fde78
Fix ut and imports
brucez-anyscale Jun 18, 2022
beb10b9
update
brucez-anyscale Jun 22, 2022
bbd07fa
update
brucez-anyscale Jun 22, 2022
c5c0b2f
address comments
brucez-anyscale Jun 22, 2022
bd8de73
update
brucez-anyscale Jun 22, 2022
6bc7ab8
update delete ray cluster logic
brucez-anyscale Jun 22, 2022
2c63167
update delete ray cluster logic
brucez-anyscale Jun 22, 2022
38c1ba0
update
brucez-anyscale Jun 22, 2022
96120fb
Merge branch 'master' into brucez/improveHA
brucez-anyscale Jun 24, 2022
b8b1f76
address comments
brucez-anyscale Jun 24, 2022
0cb5879
update
brucez-anyscale Jun 25, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
update
brucez-anyscale committed Jun 14, 2022

Verified

This commit was created on GitHub.com and signed with GitHub’s verified signature. The key has expired.
commit 30b5a52e26a2048f3b5894c3fadbf14a250e65aa
25 changes: 15 additions & 10 deletions ray-operator/controllers/ray/rayservice_controller.go
Original file line number Diff line number Diff line change
@@ -188,7 +188,7 @@ func (r *RayServiceReconciler) Reconcile(ctx context.Context, request ctrl.Reque
return ctrl.Result{}, err
}
if err := r.reconcileServices(ctx, rayServiceInstance, servingRayClusterInstance); err != nil {
err = r.updateState(ctx, rayServiceInstance, rayv1alpha1.FailUpdateIngress, err)
err = r.updateState(ctx, rayServiceInstance, rayv1alpha1.FailUpdateService, err)
return ctrl.Result{}, err
}

@@ -199,6 +199,7 @@ func (r *RayServiceReconciler) Reconcile(ctx context.Context, request ctrl.Reque
// SetupWithManager registers this reconciler with mgr as the controller for
// RayService objects. RayCluster is additionally declared through Owns, so
// (per controller-runtime's builder semantics) changes to a RayCluster owned
// by a RayService also enqueue a reconcile of that RayService.
// It returns whatever error Complete reports while building the controller.
func (r *RayServiceReconciler) SetupWithManager(mgr ctrl.Manager) error {
return ctrl.NewControllerManagedBy(mgr).
For(&rayv1alpha1.RayService{}).
Owns(&rayv1alpha1.RayCluster{}).
Complete(r)
}

@@ -238,6 +239,7 @@ func (r *RayServiceReconciler) reconcileRayCluster(ctx context.Context, rayServi
} else if rayClusterInstance.Name == rayServiceInstance.Status.PreparingRayClusterName {
preparingRayCluster = &rayClusterInstance
} else {
r.Log.Info("reconcileRayCluster", "delete ray cluster", rayClusterInstance)
if err := r.Delete(ctx, &rayClusterInstance); err != nil {
return nil, nil, err
}
@@ -470,7 +472,6 @@ func (r *RayServiceReconciler) allServeDeploymentsHealthy(rayServiceInstance *ra
}

func (r *RayServiceReconciler) reconcileIngress(ctx context.Context, rayServiceInstance *rayv1alpha1.RayService, rayClusterInstance *rayv1alpha1.RayCluster) error {
// Enable ingress for RayService by default.
if rayClusterInstance.Spec.HeadGroupSpec.EnableIngress == nil || !*rayClusterInstance.Spec.HeadGroupSpec.EnableIngress {
return nil
}
@@ -481,12 +482,10 @@ func (r *RayServiceReconciler) reconcileIngress(ctx context.Context, rayServiceI
return err
}
ingress.Name = utils.CheckName(ingress.Name)
if err := ctrl.SetControllerReference(rayServiceInstance, ingress, r.Scheme); err != nil {
return err
}

// Get Ingress instance.
headIngress := &networkingv1.Ingress{}
err = r.Get(ctx, client.ObjectKey{Name: utils.GenerateIngressName(rayServiceInstance.Name), Namespace: rayServiceInstance.Namespace}, headIngress)
err = r.Get(ctx, client.ObjectKey{Name: ingress.Name, Namespace: rayServiceInstance.Namespace}, headIngress)

if err == nil {
// Update Ingress
@@ -497,6 +496,9 @@ func (r *RayServiceReconciler) reconcileIngress(ctx context.Context, rayServiceI
}
} else if errors.IsNotFound(err) {
// Create Ingress
if err := ctrl.SetControllerReference(rayServiceInstance, ingress, r.Scheme); err != nil {
return err
}
if createErr := r.Create(ctx, ingress); createErr != nil {
if errors.IsAlreadyExists(createErr) {
log.Info("Ingress already exists,no need to create")
@@ -549,22 +551,25 @@ func (r *RayServiceReconciler) reconcileServices(ctx context.Context, rayService
return err
}
rayHeadSvc.Name = utils.CheckName(rayHeadSvc.Name)
if err := ctrl.SetControllerReference(rayServiceInstance, rayHeadSvc, r.Scheme); err != nil {
return err
}

// Get Service instance.
headService := &corev1.Service{}
err = r.Get(ctx, client.ObjectKey{Name: utils.GenerateServiceName(rayServiceInstance.Name), Namespace: rayServiceInstance.Namespace}, headService)
err = r.Get(ctx, client.ObjectKey{Name: rayHeadSvc.Name, Namespace: rayServiceInstance.Namespace}, headService)

if err == nil {
// Update Service
headService.Spec = rayHeadSvc.Spec
r.Log.Info("reconcileServices", "update service", headService)
if updateErr := r.Update(ctx, headService); updateErr != nil {
r.Log.Error(updateErr, "rayHeadSvc Update error!", "rayHeadSvc.Error", updateErr)
return updateErr
}
} else if errors.IsNotFound(err) {
// Create Service
r.Log.Info("reconcileServices", "create service", rayHeadSvc)
if err := ctrl.SetControllerReference(rayServiceInstance, rayHeadSvc, r.Scheme); err != nil {
return err
}
if createErr := r.Create(ctx, rayHeadSvc); createErr != nil {
if errors.IsAlreadyExists(createErr) {
log.Info("rayHeadSvc already exists,no need to create")