diff --git a/api/v1beta1/solrcloud_types.go b/api/v1beta1/solrcloud_types.go
index c0283370..995a9acd 100644
--- a/api/v1beta1/solrcloud_types.go
+++ b/api/v1beta1/solrcloud_types.go
@@ -1227,6 +1227,13 @@ func (sc *SolrCloud) GetAllSolrPodNames() []string {
 	if sc.Spec.Replicas != nil {
 		replicas = int(*sc.Spec.Replicas)
 	}
+	if int(sc.Status.Replicas) > replicas {
+		replicas = int(sc.Status.Replicas)
+	}
+	return sc.GetSolrPodNames(replicas)
+}
+
+func (sc *SolrCloud) GetSolrPodNames(replicas int) []string {
 	podNames := make([]string, replicas)
 	statefulSetName := sc.StatefulSetName()
 	for i := range podNames {
diff --git a/controllers/solr_cluster_ops_util.go b/controllers/solr_cluster_ops_util.go
index c9f352aa..07452ebe 100644
--- a/controllers/solr_cluster_ops_util.go
+++ b/controllers/solr_cluster_ops_util.go
@@ -151,7 +151,7 @@ func retryNextQueuedClusterOpWithQueue(statefulSet *appsv1.StatefulSet, clusterO
 	return hasOp, err
 }
 
-func determineScaleClusterOpLockIfNecessary(ctx context.Context, r *SolrCloudReconciler, instance *solrv1beta1.SolrCloud, statefulSet *appsv1.StatefulSet, scaleDownOpIsQueued bool, podList []corev1.Pod, logger logr.Logger) (clusterOp *SolrClusterOp, retryLaterDuration time.Duration, err error) {
+func determineScaleClusterOpLockIfNecessary(ctx context.Context, r *SolrCloudReconciler, instance *solrv1beta1.SolrCloud, statefulSet *appsv1.StatefulSet, scaleDownOpIsQueued bool, podList []corev1.Pod, blockReconciliationOfStatefulSet bool, logger logr.Logger) (clusterOp *SolrClusterOp, retryLaterDuration time.Duration, err error) {
 	desiredPods := int(*instance.Spec.Replicas)
 	configuredPods := int(*statefulSet.Spec.Replicas)
 	if desiredPods != configuredPods {
@@ -170,11 +170,26 @@ func determineScaleClusterOpLockIfNecessary(ctx context.Context, r *SolrCloudRec
 				Metadata:  strconv.Itoa(configuredPods - 1),
 			}
 		} else if desiredPods > configuredPods && (instance.Spec.Scaling.PopulatePodsOnScaleUp == nil || *instance.Spec.Scaling.PopulatePodsOnScaleUp) {
+			// We need to wait for all pods to become healthy to scale up in a managed fashion, otherwise
+			// the balancing will skip some pods
 			if len(podList) < configuredPods {
 				// There are not enough pods, the statefulSet controller has yet to create the previously desired pods.
 				// Do not start the scale up until these missing pods are created.
 				return nil, time.Second * 5, nil
 			}
+			// If Solr nodes are advertised by their individual node services (through an ingress)
+			// then make sure that the host aliases are set for all desired pods before starting a scale-up.
+			// If the host aliases do not already include the soon-to-be created pods, then Solr might not be able to balance
+			// replicas onto the new hosts.
+			// We need to make sure that the StatefulSet is updated with these new hostAliases before the scale up occurs.
+			if instance.UsesIndividualNodeServices() && instance.Spec.SolrAddressability.External.UseExternalAddress {
+				for _, pod := range podList {
+					if len(pod.Spec.HostAliases) < desiredPods {
+						return nil, time.Second * 5, nil
+					}
+				}
+			}
+
 			clusterOp = &SolrClusterOp{
 				Operation: ScaleUpLock,
 				Metadata:  strconv.Itoa(desiredPods),
@@ -349,7 +364,8 @@ func handleManagedCloudRollingUpdate(ctx context.Context, r *SolrCloudReconciler
 			}
 			operationComplete = true
 			// Only do a re-balancing for rolling restarts that migrated replicas
-			if updateMetadata.RequiresReplicaMigration {
+			// If a scale-up will occur afterwards, skip the re-balancing, because it will occur after the scale-up anyway
+			if updateMetadata.RequiresReplicaMigration && *instance.Spec.Replicas <= *statefulSet.Spec.Replicas {
 				nextClusterOp = &SolrClusterOp{
 					Operation: BalanceReplicasLock,
 					Metadata:  "RollingUpdateComplete",
@@ -371,7 +387,7 @@ func handleManagedCloudRollingUpdate(ctx context.Context, r *SolrCloudReconciler
 		// We won't kill pods that we need the cluster state for, but we can kill the pods that are already not running.
 		// This is important for scenarios where there is a bad pod config and nothing is running, but we need to do
 		// a restart to get a working pod config.
-		state, retryLater, apiError := util.GetNodeReplicaState(ctx, instance, hasReadyPod, logger)
+		state, retryLater, apiError := util.GetNodeReplicaState(ctx, instance, statefulSet, hasReadyPod, logger)
 		if apiError != nil {
 			return false, true, 0, nil, apiError
 		} else if !retryLater {
diff --git a/controllers/solrcloud_controller.go b/controllers/solrcloud_controller.go
index 5ad2213c..4d832cf8 100644
--- a/controllers/solrcloud_controller.go
+++ b/controllers/solrcloud_controller.go
@@ -152,6 +152,11 @@ func (r *SolrCloudReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
 	hostNameIpMap := make(map[string]string)
 	// Generate a service for every Node
 	if instance.UsesIndividualNodeServices() {
+		// When updating the statefulSet below, the hostNameIpMap is just used to add new IPs or modify existing ones.
+		// When scaling down, the hostAliases that are no longer found here will not be removed from the hostAliases in the statefulSet pod spec.
+		// Therefore, it should be ok that we are not reconciling the node services that will be scaled down in the future.
+		// This is unfortunately the reality since we don't have the statefulSet yet to determine how many Solr pods are still running,
+		// we just have Spec.replicas which is the requested pod count.
 		for _, nodeName := range solrNodeNames {
 			err, ip := r.reconcileNodeService(ctx, logger, instance, nodeName)
 			if err != nil {
@@ -161,6 +166,7 @@ func (r *SolrCloudReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
 			if instance.Spec.SolrAddressability.External.UseExternalAddress {
 				if ip == "" {
 					// If we are using this IP in the hostAliases of the statefulSet, it needs to be set for every service before trying to update the statefulSet
+					// TODO: Make an event here
 					blockReconciliationOfStatefulSet = true
 				} else {
 					hostNameIpMap[instance.AdvertisedNodeHost(nodeName)] = ip
@@ -319,36 +325,6 @@ func (r *SolrCloudReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
 		}
 	}
 
-	extAddressabilityOpts := instance.Spec.SolrAddressability.External
-	if extAddressabilityOpts != nil && extAddressabilityOpts.Method == solrv1beta1.Ingress {
-		// Generate Ingress
-		ingress := util.GenerateIngress(instance, solrNodeNames)
-
-		// Check if the Ingress already exists
-		ingressLogger := logger.WithValues("ingress", ingress.Name)
-		foundIngress := &netv1.Ingress{}
-		err = r.Get(ctx, types.NamespacedName{Name: ingress.Name, Namespace: ingress.Namespace}, foundIngress)
-		if err != nil && errors.IsNotFound(err) {
-			ingressLogger.Info("Creating Ingress")
-			if err = controllerutil.SetControllerReference(instance, ingress, r.Scheme); err == nil {
-				err = r.Create(ctx, ingress)
-			}
-		} else if err == nil {
-			var needsUpdate bool
-			needsUpdate, err = util.OvertakeControllerRef(instance, foundIngress, r.Scheme)
-			needsUpdate = util.CopyIngressFields(ingress, foundIngress, ingressLogger) || needsUpdate
-
-			// Update the found Ingress and write the result back if there are any changes
-			if needsUpdate && err == nil {
-				ingressLogger.Info("Updating Ingress")
-				err = r.Update(ctx, foundIngress)
-			}
-		}
-		if err != nil {
-			return requeueOrNot, err
-		}
-	}
-
 	var statefulSet *appsv1.StatefulSet
 
 	if !blockReconciliationOfStatefulSet {
@@ -416,6 +392,39 @@ func (r *SolrCloudReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
 	if err != nil {
 		return requeueOrNot, err
 	}
+	if statefulSet != nil && statefulSet.Spec.Replicas != nil {
+		solrNodeNames = instance.GetSolrPodNames(int(*statefulSet.Spec.Replicas))
+	}
+
+	extAddressabilityOpts := instance.Spec.SolrAddressability.External
+	if extAddressabilityOpts != nil && extAddressabilityOpts.Method == solrv1beta1.Ingress {
+		// Generate Ingress
+		ingress := util.GenerateIngress(instance, solrNodeNames)
+
+		// Check if the Ingress already exists
+		ingressLogger := logger.WithValues("ingress", ingress.Name)
+		foundIngress := &netv1.Ingress{}
+		err = r.Get(ctx, types.NamespacedName{Name: ingress.Name, Namespace: ingress.Namespace}, foundIngress)
+		if err != nil && errors.IsNotFound(err) {
+			ingressLogger.Info("Creating Ingress")
+			if err = controllerutil.SetControllerReference(instance, ingress, r.Scheme); err == nil {
+				err = r.Create(ctx, ingress)
+			}
+		} else if err == nil {
+			var needsUpdate bool
+			needsUpdate, err = util.OvertakeControllerRef(instance, foundIngress, r.Scheme)
+			needsUpdate = util.CopyIngressFields(ingress, foundIngress, ingressLogger) || needsUpdate
+
+			// Update the found Ingress and write the result back if there are any changes
+			if needsUpdate && err == nil {
+				ingressLogger.Info("Updating Ingress")
+				err = r.Update(ctx, foundIngress)
+			}
+		}
+		if err != nil {
+			return requeueOrNot, err
+		}
+	}
 
 	// *********************************************************
 	// The operations after this require a statefulSet to exist,
@@ -546,7 +555,7 @@ func (r *SolrCloudReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
 		// a "locked" cluster operation
 		if clusterOp == nil {
 			_, scaleDownOpIsQueued := queuedRetryOps[ScaleDownLock]
-			clusterOp, retryLaterDuration, err = determineScaleClusterOpLockIfNecessary(ctx, r, instance, statefulSet, scaleDownOpIsQueued, podList, logger)
+			clusterOp, retryLaterDuration, err = determineScaleClusterOpLockIfNecessary(ctx, r, instance, statefulSet, scaleDownOpIsQueued, podList, blockReconciliationOfStatefulSet, logger)
 
 			// If the new clusterOperation is an update to a queued clusterOp, just change the operation that is already queued
 			if clusterOp != nil {
diff --git a/controllers/solrcloud_controller_basic_auth_test.go b/controllers/solrcloud_controller_basic_auth_test.go
index a91846f2..cd3fbf5f 100644
--- a/controllers/solrcloud_controller_basic_auth_test.go
+++ b/controllers/solrcloud_controller_basic_auth_test.go
@@ -20,6 +20,7 @@ package controllers
 import (
 	"context"
 	"fmt"
+	solrv1beta1 "github.com/apache/solr-operator/api/v1beta1"
 	"github.com/apache/solr-operator/controllers/util"
 	. "github.com/onsi/ginkgo/v2"
@@ -289,10 +290,10 @@ func expectBasicAuthConfigOnPodTemplateWithGomega(g Gomega, solrCloud *solrv1bet
 	g.Expect(basicAuthSecretVolMount.MountPath).To(Equal("/etc/secrets/"+secretName), "Wrong path used to mount Basic Auth volume")
 
 	expLivenessProbeCmd := fmt.Sprintf("JAVA_TOOL_OPTIONS=\"-Dbasicauth=$(cat /etc/secrets/%s-solrcloud-basic-auth/username):$(cat /etc/secrets/%s-solrcloud-basic-auth/password) -Dsolr.httpclient.builder.factory=org.apache.solr.client.solrj.impl.PreemptiveBasicAuthClientBuilderFactory\" "+
-		"solr api -get \"http://${SOLR_HOST}:8983%s\"",
+		"solr api -get \"http://${SOLR_HOST}:8983%s\" 2>&1 | grep -v JAVA_TOOL_OPTIONS",
 		solrCloud.Name, solrCloud.Name, expLivenessProbePath)
 	expReadinessProbeCmd := fmt.Sprintf("JAVA_TOOL_OPTIONS=\"-Dbasicauth=$(cat /etc/secrets/%s-solrcloud-basic-auth/username):$(cat /etc/secrets/%s-solrcloud-basic-auth/password) -Dsolr.httpclient.builder.factory=org.apache.solr.client.solrj.impl.PreemptiveBasicAuthClientBuilderFactory\" "+
-		"solr api -get \"http://${SOLR_HOST}:8983%s\"",
+		"solr api -get \"http://${SOLR_HOST}:8983%s\" 2>&1 | grep -v JAVA_TOOL_OPTIONS",
 		solrCloud.Name, solrCloud.Name, expReadinessProbePath)
 
 	g.Expect(mainContainer.LivenessProbe).To(Not(BeNil()), "main container should have a liveness probe defined")
@@ -350,7 +351,7 @@ func expectBasicAuthConfigOnPodTemplateWithGomega(g Gomega, solrCloud *solrv1bet
 
 func expectPutSecurityJsonInZkCmd(g Gomega, expInitContainer *corev1.Container) {
 	g.Expect(expInitContainer).To(Not(BeNil()), "Didn't find the setup-zk InitContainer in the sts!")
-	expCmd := "ZK_SECURITY_JSON=$(/opt/solr/server/scripts/cloud-scripts/zkcli.sh -zkhost ${ZK_HOST} -cmd get /security.json); " +
+	expCmd := "ZK_SECURITY_JSON=$(/opt/solr/server/scripts/cloud-scripts/zkcli.sh -zkhost ${ZK_HOST} -cmd get /security.json || echo 'failed-to-get-security.json'); " +
 		"if [ ${#ZK_SECURITY_JSON} -lt 3 ]; then " +
 		"echo $SECURITY_JSON > /tmp/security.json; " +
 		"/opt/solr/server/scripts/cloud-scripts/zkcli.sh -zkhost ${ZK_HOST} -cmd putfile /security.json /tmp/security.json; echo \"put security.json in ZK\"; fi"
diff --git a/controllers/util/common.go b/controllers/util/common.go
index b887f6df..3eca00b3 100644
--- a/controllers/util/common.go
+++ b/controllers/util/common.go
@@ -432,7 +432,29 @@ func CopyPodTemplates(from, to *corev1.PodTemplateSpec, basePath string, logger
 	if !DeepEqualWithNils(to.Spec.HostAliases, from.Spec.HostAliases) {
 		requireUpdate = true
-		to.Spec.HostAliases = from.Spec.HostAliases
+		if to.Spec.HostAliases == nil {
+			to.Spec.HostAliases = from.Spec.HostAliases
+		} else {
+			// Do not remove aliases that are no longer used.
+			// This is in case Solr is scaling down and we want to keep the old addresses for future use.
+			for _, fromAlias := range from.Spec.HostAliases {
+				found := false
+				for i, toAlias := range to.Spec.HostAliases {
+					if fromAlias.Hostnames[0] == toAlias.Hostnames[0] {
+						found = true
+						if !DeepEqualWithNils(toAlias, fromAlias) {
+							requireUpdate = true
+							to.Spec.HostAliases[i] = fromAlias
+							break
+						}
+					}
+				}
+				if !found {
+					requireUpdate = true
+					to.Spec.HostAliases = append(to.Spec.HostAliases, fromAlias)
+				}
+			}
+		}
 		logger.Info("Update required because field changed", "field", basePath+"Spec.HostAliases", "from", to.Spec.HostAliases, "to", from.Spec.HostAliases)
 	}
diff --git a/controllers/util/solr_security_util.go b/controllers/util/solr_security_util.go
index 51caa313..97f7e602 100644
--- a/controllers/util/solr_security_util.go
+++ b/controllers/util/solr_security_util.go
@@ -238,7 +238,7 @@ func addHostHeaderToProbe(httpGet *corev1.HTTPGetAction, host string) {
 
 func cmdToPutSecurityJsonInZk() string {
 	scriptsDir := "/opt/solr/server/scripts/cloud-scripts"
-	cmd := " ZK_SECURITY_JSON=$(%s/zkcli.sh -zkhost ${ZK_HOST} -cmd get /security.json); "
+	cmd := " ZK_SECURITY_JSON=$(%s/zkcli.sh -zkhost ${ZK_HOST} -cmd get /security.json || echo 'failed-to-get-security.json'); "
 	cmd += "if [ ${#ZK_SECURITY_JSON} -lt 3 ]; then echo $SECURITY_JSON > /tmp/security.json; %s/zkcli.sh -zkhost ${ZK_HOST} -cmd putfile /security.json /tmp/security.json; echo \"put security.json in ZK\"; fi"
 	return fmt.Sprintf(cmd, scriptsDir, scriptsDir)
 }
@@ -496,13 +496,16 @@ func useSecureProbe(solrCloud *solr.SolrCloud, probe *corev1.Probe, mountPath st
 	// Future work - SOLR_TOOL_OPTIONS is only in 9.4.0, use JAVA_TOOL_OPTIONS until that is the minimum supported version
 	var javaToolOptionsStr string
+	var javaToolOptionsOutputFilter string
 	if len(javaToolOptions) > 0 {
 		javaToolOptionsStr = fmt.Sprintf("JAVA_TOOL_OPTIONS=%q ", strings.Join(javaToolOptions, " "))
+		javaToolOptionsOutputFilter = " 2>&1 | grep -v JAVA_TOOL_OPTIONS"
 	} else {
 		javaToolOptionsStr = ""
+		javaToolOptionsOutputFilter = ""
 	}
 
-	probeCommand := fmt.Sprintf("%ssolr api -get \"%s://${SOLR_HOST}:%d%s\"", javaToolOptionsStr, solrCloud.UrlScheme(false), probe.HTTPGet.Port.IntVal, probe.HTTPGet.Path)
+	probeCommand := fmt.Sprintf("%ssolr api -get \"%s://${SOLR_HOST}:%d%s\"%s", javaToolOptionsStr, solrCloud.UrlScheme(false), probe.HTTPGet.Port.IntVal, probe.HTTPGet.Path, javaToolOptionsOutputFilter)
 	probeCommand = regexp.MustCompile(`\s+`).ReplaceAllString(strings.TrimSpace(probeCommand), " ")
 
 	// use an Exec instead of an HTTP GET
diff --git a/controllers/util/solr_update_util.go b/controllers/util/solr_update_util.go
index 4f9eebe3..31235441 100644
--- a/controllers/util/solr_update_util.go
+++ b/controllers/util/solr_update_util.go
@@ -24,6 +24,7 @@ import (
 	"github.com/apache/solr-operator/controllers/util/solr_api"
 	"github.com/go-logr/logr"
 	"github.com/robfig/cron/v3"
+	appsv1 "k8s.io/api/apps/v1"
 	corev1 "k8s.io/api/core/v1"
 	"k8s.io/apimachinery/pkg/util/intstr"
 	"net/url"
@@ -119,7 +120,7 @@ func (state NodeReplicaState) PodHasReplicas(cloud *solr.SolrCloud, podName stri
 	return isInClusterState && contents.replicas > 0
 }
 
-func GetNodeReplicaState(ctx context.Context, cloud *solr.SolrCloud, hasReadyPod bool, logger logr.Logger) (state NodeReplicaState, retryLater bool, err error) {
+func GetNodeReplicaState(ctx context.Context, cloud *solr.SolrCloud, statefulSet *appsv1.StatefulSet, hasReadyPod bool, logger logr.Logger) (state NodeReplicaState, retryLater bool, err error) {
 	clusterResp := &solr_api.SolrClusterStatusResponse{}
 	overseerResp := &solr_api.SolrOverseerStatusResponse{}
 
@@ -138,7 +139,7 @@ func GetNodeReplicaState(ctx context.Context, cloud *solr.SolrCloud, hasReadyPod
 		}
 	}
 	if err == nil {
-		state = findSolrNodeContents(clusterResp.ClusterStatus, overseerResp.Leader, GetAllManagedSolrNodeNames(cloud))
+		state = findSolrNodeContents(clusterResp.ClusterStatus, overseerResp.Leader, GetManagedSolrNodeNames(cloud, int(*statefulSet.Spec.Replicas)))
 	} else {
 		logger.Error(err, "Could not fetch cluster state information for cloud")
 	}
@@ -535,6 +536,15 @@ func GetAllManagedSolrNodeNames(solrCloud *solr.SolrCloud) map[string]bool {
 	return allNodeNames
 }
 
+func GetManagedSolrNodeNames(solrCloud *solr.SolrCloud, currentlyConfiguredPodCount int) map[string]bool {
+	podNames := solrCloud.GetSolrPodNames(currentlyConfiguredPodCount)
+	allNodeNames := make(map[string]bool, len(podNames))
+	for _, podName := range podNames {
+		allNodeNames[SolrNodeName(solrCloud, podName)] = true
+	}
+	return allNodeNames
+}
+
 // EvictReplicasForPodIfNecessary takes a solr Pod and migrates all replicas off of that Pod.
 // For updates this will only be called for pods using ephemeral data.
 // For scale-down operations, this can be called for pods using ephemeral or persistent data.
diff --git a/controllers/util/solr_util.go b/controllers/util/solr_util.go
index 47f8c070..de44d7c1 100644
--- a/controllers/util/solr_util.go
+++ b/controllers/util/solr_util.go
@@ -556,6 +556,9 @@ func GenerateStatefulSet(solrCloud *solr.SolrCloud, solrCloudStatus *solr.SolrCl
 			VolumeClaimTemplates: pvcs,
 		},
 	}
+	if solrCloud.UsesHeadlessService() {
+		stateful.Spec.Template.Spec.Subdomain = solrCloud.HeadlessServiceName()
+	}
 
 	var imagePullSecrets []corev1.LocalObjectReference
diff --git a/helm/solr-operator/Chart.yaml b/helm/solr-operator/Chart.yaml
index 83adf370..5a954c28 100644
--- a/helm/solr-operator/Chart.yaml
+++ b/helm/solr-operator/Chart.yaml
@@ -61,6 +61,12 @@ annotations:
           url: https://github.com/apache/solr-operator/issues/624
         - name: Github PR
           url: https://github.com/apache/solr-operator/pull/648
+      description: Avoid reset of security.json if get request fails
+      links:
+        - name: Github Issue
+          url: https://github.com/apache/solr-operator/issues/659
+        - name: Github PR
+          url: https://github.com/apache/solr-operator/pull/660
     - kind: fixed
       description: SolrCloud scaling is now safe when using persistent storage with a 'Delete' reclaim policy
       links:
@@ -75,6 +81,20 @@ annotations:
           url: https://github.com/apache/solr-operator/issues/693
         - name: Github PR
           url: https://github.com/apache/solr-operator/pull/694
+    - kind: security
+      description: Command-based Solr probes no longer echo 'JAVA_TOOL_OPTIONS' values in Kubernetes events.
+      links:
+        - name: JIRA Issue
+          url: https://issues.apache.org/jira/browse/SOLR-17216
+        - name: Github PR
+          url: https://github.com/apache/solr-operator/pull/698
+    - kind: fixed
+      description: SolrClouds addressed via an Ingress now scale up and down safely.
+      links:
+        - name: Github Issue
+          url: https://github.com/apache/solr-operator/issues/682
+        - name: Github PR
+          url: https://github.com/apache/solr-operator/pull/692
   artifacthub.io/images: |
     - name: solr-operator
       image: apache/solr-operator:v0.9.0-prerelease
diff --git a/tests/e2e/solrcloud_rolling_upgrade_test.go b/tests/e2e/solrcloud_rolling_upgrade_test.go
index 6143342a..faf2175e 100644
--- a/tests/e2e/solrcloud_rolling_upgrade_test.go
+++ b/tests/e2e/solrcloud_rolling_upgrade_test.go
@@ -163,7 +163,7 @@ var _ = FDescribe("E2E - SolrCloud - Rolling Upgrades", func() {
 			}
 
 			By("waiting for the balanceReplicas to finish")
-			expectStatefulSetWithChecks(ctx, solrCloud, solrCloud.StatefulSetName(), func(g Gomega, found *appsv1.StatefulSet) {
+			expectStatefulSetWithChecksAndTimeout(ctx, solrCloud, solrCloud.StatefulSetName(), time.Second*30, time.Second, func(g Gomega, found *appsv1.StatefulSet) {
 				clusterOp, err := controllers.GetCurrentClusterOp(found)
 				g.Expect(err).ToNot(HaveOccurred(), "Error occurred while finding clusterLock for SolrCloud")
 				g.Expect(clusterOp).To(BeNil(), "StatefulSet should not have a balanceReplicas lock after balancing is complete.")
diff --git a/tests/e2e/solrcloud_scaling_test.go b/tests/e2e/solrcloud_scaling_test.go
index 53d36308..106dd4fb 100644
--- a/tests/e2e/solrcloud_scaling_test.go
+++ b/tests/e2e/solrcloud_scaling_test.go
@@ -140,6 +140,110 @@ var _ = FDescribe("E2E - SolrCloud - Scale Down", func() {
 		})
 	})
 
+	FContext("with replica migration using an ingress for node addresses", func() {
+
+		BeforeEach(func() {
+			solrCloud.Spec.SolrAddressability = solrv1beta1.SolrAddressabilityOptions{
+				External: &solrv1beta1.ExternalAddressability{
+					Method:             solrv1beta1.Ingress,
+					UseExternalAddress: true,
+					DomainName:         "test.solr.org",
+				},
+			}
+		})
+
+		FIt("Scales Down", func(ctx context.Context) {
+			originalSolrCloud := solrCloud.DeepCopy()
+			solrCloud.Spec.Replicas = pointer.Int32(1)
+			By("triggering a scale down via solrCloud replicas")
+			Expect(k8sClient.Patch(ctx, solrCloud, client.MergeFrom(originalSolrCloud))).To(Succeed(), "Could not patch SolrCloud replicas to initiate scale down")
+
+			By("waiting for the scaleDown of first pod to begin")
+			expectStatefulSetWithChecks(ctx, solrCloud, solrCloud.StatefulSetName(), func(g Gomega, found *appsv1.StatefulSet) {
+				g.Expect(found.Spec.Replicas).To(HaveValue(BeEquivalentTo(3)), "StatefulSet should still have 3 pods, because the scale down should first move Solr replicas")
+				clusterOp, err := controllers.GetCurrentClusterOp(found)
+				g.Expect(err).ToNot(HaveOccurred(), "Error occurred while finding clusterLock for SolrCloud")
+				g.Expect(clusterOp).ToNot(BeNil(), "StatefulSet does not have a scaleDown lock.")
+				g.Expect(clusterOp.Operation).To(Equal(controllers.ScaleDownLock), "StatefulSet does not have a scaleDown lock.")
+				g.Expect(clusterOp.Metadata).To(Equal("2"), "StatefulSet scaling lock operation has the wrong metadata.")
+			})
+			queryCollection(ctx, solrCloud, solrCollection2, 0)
+
+			// Make sure that ingress still has 3 nodes at this point
+			ingress := expectIngress(ctx, solrCloud, solrCloud.CommonIngressName())
+			Expect(ingress.Spec.Rules).To(HaveLen(4), "Wrong number of ingress rules. 3 nodes + 1 common endpoint. The third node rule should not be deleted until the node itself is deleted")
+
+			By("waiting for the scaleDown of the first pod to finish")
+			expectStatefulSetWithChecks(ctx, solrCloud, solrCloud.StatefulSetName(), func(g Gomega, found *appsv1.StatefulSet) {
+				g.Expect(found.Spec.Replicas).To(HaveValue(BeEquivalentTo(2)), "StatefulSet should now have 2 pods, after the replicas have been moved off the first pod.")
+				clusterOp, err := controllers.GetCurrentClusterOp(found)
+				g.Expect(err).ToNot(HaveOccurred(), "Error occurred while finding clusterLock for SolrCloud")
+				g.Expect(clusterOp).ToNot(BeNil(), "StatefulSet does not have a scaleDown lock.")
+				g.Expect(clusterOp.Operation).To(Equal(controllers.ScaleDownLock), "StatefulSet does not have a scaleDown lock.")
+				g.Expect(clusterOp.Metadata).To(Equal("2"), "StatefulSet scaling lock operation has the wrong metadata.")
+			})
+			queryCollection(ctx, solrCloud, solrCollection2, 0)
+
+			// Wait till the pod has actually been deleted
+			expectStatefulSetWithChecks(ctx, solrCloud, solrCloud.StatefulSetName(), func(g Gomega, found *appsv1.StatefulSet) {
+				g.Expect(found.Status.Replicas).To(HaveValue(BeEquivalentTo(2)), "StatefulSet should now have 2 pods, after the replicas have been moved off the first pod.")
+			})
+
+			By("waiting for the scaleDown of second pod to begin")
+			statefulSet := expectStatefulSetWithChecks(ctx, solrCloud, solrCloud.StatefulSetName(), func(g Gomega, found *appsv1.StatefulSet) {
+				clusterOp, err := controllers.GetCurrentClusterOp(found)
+				g.Expect(err).ToNot(HaveOccurred(), "Error occurred while finding clusterLock for SolrCloud")
+				g.Expect(clusterOp).ToNot(BeNil(), "StatefulSet does not have a scaleDown lock.")
+				g.Expect(clusterOp.Operation).To(Equal(controllers.ScaleDownLock), "StatefulSet does not have a scaleDown lock.")
+				g.Expect(clusterOp.Metadata).To(Equal("1"), "StatefulSet scaling lock operation has the wrong metadata.")
+			})
+
+			// Make sure that ingress still has 2 nodes at this point
+			ingress = expectIngress(ctx, solrCloud, solrCloud.CommonIngressName())
+			Expect(ingress.Spec.Rules).To(HaveLen(3), "Wrong number of ingress rules. 2 nodes + 1 common endpoint. The second node rule should not be deleted until the node itself is deleted")
+
+			// When the next scale down happens, the 3rd solr pod (ordinal 2) should be gone, and the statefulSet replicas should be 2 across the board.
+			// The first scale down should not be complete until this is done.
+			Expect(statefulSet.Spec.Replicas).To(HaveValue(BeEquivalentTo(2)), "StatefulSet should still have 2 pods configured, because the scale down should first move Solr replicas")
+			Expect(statefulSet.Status.Replicas).To(HaveValue(BeEquivalentTo(2)), "StatefulSet should only have 2 pods running, because previous pod scale down should have completely finished")
+			// This pod check must happen after the above clusterLock and replicas check.
+			// The StatefulSet controller might take a good amount of time to actually delete the pod,
+			// and the replica migration/cluster op might already be done by the time the first pod is deleted.
+			expectNoPodNow(ctx, solrCloud, solrCloud.GetSolrPodName(2))
+			queryCollection(ctx, solrCloud, solrCollection1, 0)
+
+			By("waiting for the scaleDown to finish")
+			statefulSet = expectStatefulSetWithChecks(ctx, solrCloud, solrCloud.StatefulSetName(), func(g Gomega, found *appsv1.StatefulSet) {
+				g.Expect(found.Spec.Replicas).To(HaveValue(BeEquivalentTo(1)), "StatefulSet should now have 1 pod, after the replicas have been moved.")
+			})
+			// Once the scale down actually occurs, the clusterOp is not complete. We need to wait till the last pod is deleted
+			clusterOp, err := controllers.GetCurrentClusterOp(statefulSet)
+			Expect(clusterOp).ToNot(BeNil(), "StatefulSet does not have a scaleDown lock.")
+			Expect(clusterOp.Operation).To(Equal(controllers.ScaleDownLock), "StatefulSet does not have a scaleDown lock.")
+			Expect(clusterOp.Metadata).To(Equal("1"), "StatefulSet scaling lock operation has the wrong metadata.")
+
+			// Wait for the last pod to be deleted
+			expectStatefulSetWithChecks(ctx, solrCloud, solrCloud.StatefulSetName(), func(g Gomega, found *appsv1.StatefulSet) {
+				g.Expect(found.Status.Replicas).To(HaveValue(BeEquivalentTo(1)), "StatefulSet should now have 1 pod, after the replicas have been moved.")
+			})
+			// Once the scale down actually occurs, the statefulSet annotations should be removed very soon
+			expectStatefulSetWithChecksAndTimeout(ctx, solrCloud, solrCloud.StatefulSetName(), time.Second*2, time.Millisecond*500, func(g Gomega, found *appsv1.StatefulSet) {
+				clusterOp, err = controllers.GetCurrentClusterOp(found)
+				g.Expect(err).ToNot(HaveOccurred(), "Error occurred while finding clusterLock for SolrCloud")
+				g.Expect(clusterOp).To(BeNil(), "StatefulSet should not have a ScaleDown lock after scaling is complete.")
+			})
+
+			// Make sure that ingress has 1 node at this point
+			ingress = expectIngress(ctx, solrCloud, solrCloud.CommonIngressName())
+			Expect(ingress.Spec.Rules).To(HaveLen(2), "Wrong number of ingress rules. 1 node + 1 common endpoint.")
+
+			expectNoPod(ctx, solrCloud, solrCloud.GetSolrPodName(1))
+
+			queryCollection(ctx, solrCloud, solrCollection1, 0)
+			queryCollection(ctx, solrCloud, solrCollection2, 0)
+		})
+	})
+
 	FContext("with replica migration and deleted persistent data", func() {
 
 		BeforeEach(func() {
@@ -240,7 +344,7 @@ var _ = FDescribe("E2E - SolrCloud - Scale Up", func() {
 		FIt("Scales Up", func(ctx context.Context) {
 			originalSolrCloud := solrCloud.DeepCopy()
 			solrCloud.Spec.Replicas = pointer.Int32(int32(3))
-			By("triggering a scale down via solrCloud replicas")
+			By("triggering a scale up via solrCloud replicas")
 			Expect(k8sClient.Patch(ctx, solrCloud, client.MergeFrom(originalSolrCloud))).To(Succeed(), "Could not patch SolrCloud replicas to initiate scale up")
 
 			By("waiting for the scaleUp to begin")
@@ -254,9 +358,11 @@
 
 			// The first step is to increase the number of pods
 			// Check very often, as the new pods will be created quickly, which will cause the cluster op to change.
-			statefulSet = expectStatefulSetWithChecksAndTimeout(ctx, solrCloud, solrCloud.StatefulSetName(), time.Second*5, time.Millisecond*5, func(g Gomega, found *appsv1.StatefulSet) {
-				g.Expect(found.Spec.Replicas).To(HaveValue(BeEquivalentTo(3)), "StatefulSet should still have 3 pods, because the scale down should first move Solr replicas")
-			})
+			if int(*statefulSet.Spec.Replicas) != 3 {
+				statefulSet = expectStatefulSetWithChecksAndTimeout(ctx, solrCloud, solrCloud.StatefulSetName(), time.Second*5, time.Millisecond*5, func(g Gomega, found *appsv1.StatefulSet) {
+					g.Expect(found.Spec.Replicas).To(HaveValue(BeEquivalentTo(3)), "StatefulSet should be configured for 3 pods when scaling up")
+				})
+			}
 			clusterOp, err := controllers.GetCurrentClusterOp(statefulSet)
 			Expect(err).ToNot(HaveOccurred(), "Error occurred while finding clusterLock for SolrCloud")
 			Expect(clusterOp).ToNot(BeNil(), "StatefulSet does not have a scaleUp lock.")
@@ -265,7 +371,7 @@
 
 			// Wait for new pods to come up, and when they do we should be doing a balanceReplicas clusterOp
 			statefulSet = expectStatefulSetWithChecks(ctx, solrCloud, solrCloud.StatefulSetName(), func(g Gomega, found *appsv1.StatefulSet) {
-				g.Expect(found.Status.Replicas).To(HaveValue(BeEquivalentTo(3)), "StatefulSet should still have 3 pods, because the scale down should first move Solr replicas")
+				g.Expect(found.Status.Replicas).To(HaveValue(BeEquivalentTo(3)), "StatefulSet should eventually have 3 pods created for it")
 			})
 			clusterOp, err = controllers.GetCurrentClusterOp(statefulSet)
 			Expect(err).ToNot(HaveOccurred(), "Error occurred while finding clusterLock for SolrCloud")
@@ -285,6 +391,89 @@
 		})
 	})
 
+	FContext("with replica migration using an ingress for node addresses", func() {
+
+		BeforeEach(func() {
+			solrCloud.Spec.Replicas = pointer.Int32(2)
+			solrCloud.Spec.SolrAddressability = solrv1beta1.SolrAddressabilityOptions{
+				External: &solrv1beta1.ExternalAddressability{
+					Method:             solrv1beta1.Ingress,
+					UseExternalAddress: true,
+					DomainName:         "test.solr.org",
+				},
+			}
+		})
+
+		FIt("Scales Up", func(ctx context.Context) {
+			originalSolrCloud := solrCloud.DeepCopy()
+			solrCloud.Spec.Replicas = pointer.Int32(int32(3))
+			By("triggering a scale up via solrCloud replicas")
+			Expect(k8sClient.Patch(ctx, solrCloud, client.MergeFrom(originalSolrCloud))).To(Succeed(), "Could not patch SolrCloud replicas to initiate scale up")
+
+			By("waiting for the rolling restart to begin with hostAliases")
+			statefulSet := expectStatefulSetWithChecksAndTimeout(ctx, solrCloud, solrCloud.StatefulSetName(), time.Second*5, time.Millisecond*5, func(g Gomega, found *appsv1.StatefulSet) {
+				clusterOp, err := controllers.GetCurrentClusterOp(found)
+				g.Expect(err).ToNot(HaveOccurred(), "Error occurred while finding clusterLock for SolrCloud")
+				g.Expect(clusterOp).ToNot(BeNil(), "StatefulSet does not have an update lock to add hostAliases.")
+				g.Expect(clusterOp.Operation).To(Equal(controllers.UpdateLock), "StatefulSet does not have an update lock to add hostAliases.")
+				g.Expect(found.Spec.Template.Spec.HostAliases).To(HaveLen(3), "Host aliases in the pod template do not have length 3, which is the number of pods to scale up to")
+			})
+			expectSolrCloudWithChecksAndTimeout(ctx, solrCloud, time.Second*5, time.Millisecond*5, func(g Gomega, cloud *solrv1beta1.SolrCloud) {
+				g.Expect(cloud.Status.UpToDateNodes).To(BeEquivalentTo(0), "The Rolling Update never started, upToDateNodes should eventually be 0 when starting a restart")
+			})
+
+			By("Wait for the rolling update to be done")
+			expectSolrCloudWithChecksAndTimeout(ctx, solrCloud, time.Second*90, time.Millisecond*5, func(g Gomega, cloud *solrv1beta1.SolrCloud) {
+				g.Expect(cloud.Status.UpToDateNodes).To(BeEquivalentTo(*statefulSet.Spec.Replicas), "The Rolling Update never completed, not all replicas up to date")
+				g.Expect(cloud.Status.ReadyReplicas).To(BeEquivalentTo(*statefulSet.Spec.Replicas), "The Rolling Update never completed, not all replicas ready")
+			})
+
+			By("waiting for the scaleUp to begin")
+			statefulSet = expectStatefulSetWithChecksAndTimeout(ctx, solrCloud, solrCloud.StatefulSetName(), time.Second*5, time.Millisecond, func(g Gomega, found *appsv1.StatefulSet) {
+				clusterOp, err := controllers.GetCurrentClusterOp(found)
+				g.Expect(err).ToNot(HaveOccurred(), "Error occurred while finding clusterLock for SolrCloud")
+				g.Expect(clusterOp).ToNot(BeNil(), "StatefulSet does not have a scaleUp lock.")
+				g.Expect(clusterOp.Operation).To(Equal(controllers.ScaleUpLock), "StatefulSet does not have a scaleUp lock.")
+				g.Expect(clusterOp.Metadata).To(Equal("3"), "StatefulSet scaling lock operation has the wrong metadata.")
+			})
+
+			// The first step is to increase the number of pods
+			// Check very often, as the new pods will be created quickly, which will cause the cluster op to change.
+			if int(*statefulSet.Spec.Replicas) != 3 {
+				statefulSet = expectStatefulSetWithChecksAndTimeout(ctx, solrCloud, solrCloud.StatefulSetName(), time.Second*5, time.Millisecond*5, func(g Gomega, found *appsv1.StatefulSet) {
+					g.Expect(found.Spec.Replicas).To(HaveValue(BeEquivalentTo(3)), "StatefulSet should be configured for 3 pods when scaling up")
+				})
+			}
+			clusterOp, err := controllers.GetCurrentClusterOp(statefulSet)
+			Expect(err).ToNot(HaveOccurred(), "Error occurred while finding clusterLock for SolrCloud")
+			Expect(clusterOp).ToNot(BeNil(), "StatefulSet does not have a scaleUp lock.")
+			Expect(clusterOp.Operation).To(Equal(controllers.ScaleUpLock), "StatefulSet does not have a scaleUp lock.")
+			Expect(clusterOp.Metadata).To(Equal("3"), "StatefulSet scaling lock operation has the wrong metadata.")
+
+			// Wait for new pods to come up, and when they do we should be doing a balanceReplicas clusterOp
+			statefulSet = expectStatefulSetWithChecks(ctx, solrCloud, solrCloud.StatefulSetName(), func(g Gomega, found *appsv1.StatefulSet) {
+				g.Expect(found.Status.Replicas).To(HaveValue(BeEquivalentTo(3)), "StatefulSet should eventually have 3 pods created for it")
+			})
+			statefulSet = expectStatefulSetWithChecksAndTimeout(ctx, solrCloud, solrCloud.StatefulSetName(), time.Millisecond*50, time.Millisecond*10, func(g Gomega, found *appsv1.StatefulSet) {
+				clusterOp, err = controllers.GetCurrentClusterOp(found)
+				g.Expect(err).ToNot(HaveOccurred(), "Error occurred while finding clusterLock for SolrCloud")
+				g.Expect(clusterOp).ToNot(BeNil(), "StatefulSet does not have a balanceReplicas lock after new pods are created.")
+				g.Expect(clusterOp.Operation).To(Equal(controllers.BalanceReplicasLock), "StatefulSet does not have a balanceReplicas lock after new pods are created.")
+				g.Expect(clusterOp.Metadata).To(Equal("ScaleUp"), "StatefulSet balanceReplicas lock operation has the wrong metadata.")
+			})
+
+			By("waiting for the scaleUp to finish")
+			statefulSet = expectStatefulSetWithChecks(ctx, solrCloud, solrCloud.StatefulSetName(), func(g Gomega, found *appsv1.StatefulSet) {
+				clusterOp, err := controllers.GetCurrentClusterOp(found)
+				g.Expect(err).ToNot(HaveOccurred(), "Error occurred while finding clusterLock for SolrCloud")
+				g.Expect(clusterOp).To(BeNil(), "StatefulSet should not have a balanceReplicas lock after balancing is complete.")
+			})
+
+			queryCollection(ctx, solrCloud, solrCollection1, 0)
+			queryCollection(ctx, solrCloud, solrCollection2, 0)
+		})
+	})
+
 	FContext("without replica migration", func() {
 
 		BeforeEach(func() {
diff --git a/tests/e2e/suite_test.go b/tests/e2e/suite_test.go
index f27a90f0..8b38a98e 100644
--- a/tests/e2e/suite_test.go
+++ b/tests/e2e/suite_test.go
@@ -311,6 +311,26 @@ func writeAllSolrInfoToFiles(ctx context.Context, directory string, namespace st
 			&statefulSet,
 		)
 	}
+
+	// Unfortunately the services don't have the technology label
+	req, err = labels.NewRequirement("solr-cloud", selection.Exists, make([]string, 0))
+	Expect(err).ToNot(HaveOccurred())
+
+	labelSelector = labels.Everything().Add(*req)
+	listOps = &client.ListOptions{
+		Namespace:     namespace,
+		LabelSelector: labelSelector,
+	}
+
+	foundServices := &corev1.ServiceList{}
+	Expect(k8sClient.List(ctx, foundServices, listOps)).To(Succeed(), "Could not fetch Solr services")
+	Expect(foundServices).ToNot(BeNil(), "No Solr services could be found")
+	for _, service := range foundServices.Items {
+		writeAllServiceInfoToFiles(
+			directory+service.Name+".service",
+			&service,
+		)
+	}
 }
 
 // writeAllStatefulSetInfoToFiles writes the following each to a separate file with the given base name & directory.
@@ -339,6 +359,19 @@ func writeAllStatefulSetInfoToFiles(baseFilename string, statefulSet *appsv1.Sta
 	Expect(writeErr).ToNot(HaveOccurred(), "Could not write statefulSet events json to file")
 }
 
+// writeAllServiceInfoToFiles writes the following each to a separate file with the given base name & directory.
+// - Service
+func writeAllServiceInfoToFiles(baseFilename string, service *corev1.Service) {
+	// Write service to a file
+	statusFile, err := os.Create(baseFilename + ".json")
+	defer statusFile.Close()
+	Expect(err).ToNot(HaveOccurred(), "Could not open file to save service status: %s", baseFilename+".json")
+	jsonBytes, marshErr := json.MarshalIndent(service, "", "\t")
+	Expect(marshErr).ToNot(HaveOccurred(), "Could not serialize service json")
+	_, writeErr := statusFile.Write(jsonBytes)
+	Expect(writeErr).ToNot(HaveOccurred(), "Could not write service json to file")
+}
+
 // writeAllPodInfoToFile writes the following each to a separate file with the given base name & directory.
 // - Pod Spec/Status
 // - Pod Events
diff --git a/tests/e2e/test_utils_test.go b/tests/e2e/test_utils_test.go
index dbe76d4a..f287250c 100644
--- a/tests/e2e/test_utils_test.go
+++ b/tests/e2e/test_utils_test.go
@@ -476,10 +476,10 @@ func callSolrApiInPod(ctx context.Context, solrCloud *solrv1beta1.SolrCloud, htt
 		"-verbose",
 		"-" + strings.ToLower(httpMethod),
 		fmt.Sprintf(
-			"\"%s://%s:%d%s%s\"",
+			"\"%s://%s%s%s%s\"",
 			solrCloud.UrlScheme(false),
 			hostname,
-			solrCloud.Spec.SolrAddressability.PodPort,
+			solrCloud.NodePortSuffix(false),
 			apiPath,
 			queryParamsString),
 	}