From 1a876293e48a130a9cd51d22cb672911d4750569 Mon Sep 17 00:00:00 2001 From: taroface Date: Wed, 22 Jul 2020 11:57:30 -0400 Subject: [PATCH 1/7] add statefulset config for EKS multi-region --- .../cockroachdb-statefulset-secure-eks.yaml | 281 ++++++++++++++++++ 1 file changed, 281 insertions(+) create mode 100644 cloud/kubernetes/multiregion/cockroachdb-statefulset-secure-eks.yaml diff --git a/cloud/kubernetes/multiregion/cockroachdb-statefulset-secure-eks.yaml b/cloud/kubernetes/multiregion/cockroachdb-statefulset-secure-eks.yaml new file mode 100644 index 000000000000..519045733ec0 --- /dev/null +++ b/cloud/kubernetes/multiregion/cockroachdb-statefulset-secure-eks.yaml @@ -0,0 +1,281 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: cockroachdb + labels: + app: cockroachdb +--- +apiVersion: rbac.authorization.k8s.io/v1beta1 +kind: Role +metadata: + name: cockroachdb + labels: + app: cockroachdb +rules: +- apiGroups: + - "" + resources: + - secrets + verbs: + - create + - get +--- +apiVersion: rbac.authorization.k8s.io/v1beta1 +kind: ClusterRole +metadata: + name: cockroachdb + labels: + app: cockroachdb +rules: +- apiGroups: + - certificates.k8s.io + resources: + - certificatesigningrequests + verbs: + - create + - get + - watch +--- +apiVersion: rbac.authorization.k8s.io/v1beta1 +kind: RoleBinding +metadata: + name: cockroachdb + labels: + app: cockroachdb +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: cockroachdb +subjects: +- kind: ServiceAccount + name: cockroachdb + namespace: default +--- +apiVersion: rbac.authorization.k8s.io/v1beta1 +kind: ClusterRoleBinding +metadata: + name: cockroachdb + labels: + app: cockroachdb +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: cockroachdb +subjects: +- kind: ServiceAccount + name: cockroachdb + namespace: default +--- +apiVersion: v1 +kind: Service +metadata: + # This service is meant to be used by clients of the database. It exposes a ClusterIP that will + # automatically load balance connections to the different database pods. + name: cockroachdb-public + labels: + app: cockroachdb +spec: + ports: + # The main port, served by gRPC, serves Postgres-flavor SQL, internode + # traffic and the cli. + - port: 26257 + targetPort: 26257 + name: grpc + # The secondary port serves the UI as well as health and debug endpoints. + - port: 8080 + targetPort: 8080 + name: http + selector: + app: cockroachdb +--- +apiVersion: v1 +kind: Service +metadata: + # This service only exists to create DNS entries for each pod in the stateful + # set such that they can resolve each other's IP addresses. It does not + # create a load-balanced ClusterIP and should not be used directly by clients + # in most circumstances. + name: cockroachdb + labels: + app: cockroachdb + annotations: + # Use this annotation in addition to the actual publishNotReadyAddresses + # field below because the annotation will stop being respected soon but the + # field is broken in some versions of Kubernetes: + # https://github.com/kubernetes/kubernetes/issues/58662 + service.alpha.kubernetes.io/tolerate-unready-endpoints: "true" + # Enable automatic monitoring of all instances when Prometheus is running in the cluster. 
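+    # NOTE: These prometheus.io/* annotations are only hints. They take effect
+    # only if your Prometheus deployment's Kubernetes service discovery is
+    # configured to honor this annotation convention; they tell it to scrape
+    # _status/vars on port 8080 on each pod.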
+ prometheus.io/scrape: "true" + prometheus.io/path: "_status/vars" + prometheus.io/port: "8080" +spec: + ports: + - port: 26257 + targetPort: 26257 + name: grpc + - port: 8080 + targetPort: 8080 + name: http + # We want all pods in the StatefulSet to have their addresses published for + # the sake of the other CockroachDB pods even before they're ready, since they + # have to be able to talk to each other in order to become ready. + publishNotReadyAddresses: true + clusterIP: None + selector: + app: cockroachdb +--- +apiVersion: policy/v1beta1 +kind: PodDisruptionBudget +metadata: + name: cockroachdb-budget + labels: + app: cockroachdb +spec: + selector: + matchLabels: + app: cockroachdb + maxUnavailable: 1 +--- +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: cockroachdb + # TODO: Use this field to specify a namespace other than "default" in which to deploy CockroachDB (e.g., us-east-1). + # namespace: +spec: + serviceName: "cockroachdb" + replicas: 3 + selector: + matchLabels: + app: cockroachdb + template: + metadata: + labels: + app: cockroachdb + spec: + serviceAccountName: cockroachdb + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: app + operator: In + values: + - cockroachdb + topologyKey: kubernetes.io/hostname + # This init container is used to determine the availability zones of the Cockroach pods. The AZs are used to define --locality when starting Cockroach nodes. + initContainers: + - command: + - sh + - -ecx + - echo "aws-$(curl http://169.254.169.254/latest/meta-data/placement/availability-zone/)" + > /etc/cockroach-env/zone + image: byrnedo/alpine-curl:0.1 + imagePullPolicy: IfNotPresent + name: locality-container + resources: {} + terminationMessagePath: /dev/termination-log + terminationMessagePolicy: File + volumeMounts: + - mountPath: /etc/cockroach-env + name: cockroach-env + containers: + - name: cockroachdb + image: cockroachdb/cockroach:v20.1.3 + imagePullPolicy: IfNotPresent + # TODO: Change these to appropriate values for the hardware that you're running. You can see the amount of allocatable resources on each of your Kubernetes nodes by running: kubectl describe nodes + # resources: + # requests: + # cpu: "4Gi" + # memory: "4Gi" + # NOTE: Unless you have enabled the non-default Static CPU Management Policy and are using an integer number of CPUs, we don't recommend setting a CPU limit. 
See: + # https://kubernetes.io/docs/tasks/administer-cluster/cpu-management-policies/#static-policy + # https://github.com/kubernetes/kubernetes/issues/51135 + # limits: + # cpu: "16" + # memory: "4Gi" + ports: + - containerPort: 26257 + name: grpc + - containerPort: 8080 + name: http + livenessProbe: + httpGet: + path: "/health" + port: http + scheme: HTTPS + initialDelaySeconds: 30 + periodSeconds: 5 + readinessProbe: + httpGet: + path: "/health?ready=1" + port: http + scheme: HTTPS + initialDelaySeconds: 10 + periodSeconds: 5 + failureThreshold: 2 + volumeMounts: + - name: datadir + mountPath: /cockroach/cockroach-data + - name: certs + mountPath: /cockroach/cockroach-certs + - name: cockroach-env + mountPath: /etc/cockroach-env + env: + - name: COCKROACH_CHANNEL + value: kubernetes-multiregion + - name: GOMAXPROCS + valueFrom: + resourceFieldRef: + resource: limits.cpu + divisor: "1" + - name: MEMORY_LIMIT_MIB + valueFrom: + resourceFieldRef: + resource: limits.memory + divisor: "1Mi" + command: + - "/bin/bash" + - "-ecx" + # The use of qualified `hostname -f` is crucial: + # Other nodes aren't able to look up the unqualified hostname. + - exec + /cockroach/cockroach + start + --logtostderr + --certs-dir /cockroach/cockroach-certs + --advertise-host $(hostname -f) + --http-addr 0.0.0.0 + # TODO: Replace the placeholder values in --join and --locality with the namespace of the CockroachDB cluster in each region (e.g., us-east-1). + # --join cockroachdb-0.cockroachdb.,cockroachdb-1.cockroachdb.,cockroachdb-2.cockroachdb.,cockroachdb-0.cockroachdb.,cockroachdb-1.cockroachdb.,cockroachdb-2.cockroachdb.,cockroachdb-0.cockroachdb.,cockroachdb-1.cockroachdb.,cockroachdb-2.cockroachdb. + # --locality=region=,az=$(cat /etc/cockroach-env/zone),dns=$(hostname -f) + --cache $(expr $MEMORY_LIMIT_MIB / 4)MiB + --max-sql-memory $(expr $MEMORY_LIMIT_MIB / 4)MiB + # No pre-stop hook is required, a SIGTERM plus some time is all that's + # needed for graceful shutdown of a node. 
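+      # terminationGracePeriodSeconds below is how long Kubernetes waits after
+      # sending SIGTERM before force-killing the container. 60 seconds is
+      # usually enough for a node to shut down cleanly; consider raising it if
+      # drains routinely take longer on your cluster.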
+ terminationGracePeriodSeconds: 60 + volumes: + - name: datadir + persistentVolumeClaim: + claimName: datadir + - name: certs + secret: + secretName: cockroachdb.node + defaultMode: 256 + - name: cockroach-env + emptyDir: {} + podManagementPolicy: Parallel + updateStrategy: + type: RollingUpdate + volumeClaimTemplates: + - metadata: + name: datadir + spec: + accessModes: + - "ReadWriteOnce" + resources: + requests: + storage: 100Gi From 29f88d836f3370cad1fca7fdeae7caf4e3c80b5e Mon Sep 17 00:00:00 2001 From: taroface Date: Wed, 22 Jul 2020 15:27:55 -0400 Subject: [PATCH 2/7] add NLB manifest for EKS --- cloud/kubernetes/multiregion/dns-lb-eks.yaml | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 cloud/kubernetes/multiregion/dns-lb-eks.yaml diff --git a/cloud/kubernetes/multiregion/dns-lb-eks.yaml b/cloud/kubernetes/multiregion/dns-lb-eks.yaml new file mode 100644 index 000000000000..773460299d49 --- /dev/null +++ b/cloud/kubernetes/multiregion/dns-lb-eks.yaml @@ -0,0 +1,19 @@ +apiVersion: v1 +kind: Service +metadata: + labels: + k8s-app: kube-dns + name: cockroachdb-dns-external + namespace: kube-system + annotations: + service.beta.kubernetes.io/aws-load-balancer-type: "nlb" +spec: + ports: + - name: dns + port: 53 + protocol: TCP + targetPort: 53 + selector: + k8s-app: kube-dns + type: LoadBalancer + loadBalancerSourceRanges: ["0.0.0.0/0"] \ No newline at end of file From 6180d94758de8c61c9dcaf36d365e2da6a9cf220 Mon Sep 17 00:00:00 2001 From: taroface Date: Wed, 22 Jul 2020 18:07:17 -0400 Subject: [PATCH 3/7] update placeholder CPU and memory requests/limits --- .../multiregion/cockroachdb-statefulset-secure-eks.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cloud/kubernetes/multiregion/cockroachdb-statefulset-secure-eks.yaml b/cloud/kubernetes/multiregion/cockroachdb-statefulset-secure-eks.yaml index 519045733ec0..529b11e72c44 100644 --- a/cloud/kubernetes/multiregion/cockroachdb-statefulset-secure-eks.yaml +++ b/cloud/kubernetes/multiregion/cockroachdb-statefulset-secure-eks.yaml @@ -189,14 +189,14 @@ spec: # TODO: Change these to appropriate values for the hardware that you're running. You can see the amount of allocatable resources on each of your Kubernetes nodes by running: kubectl describe nodes # resources: # requests: - # cpu: "4Gi" - # memory: "4Gi" + # cpu: "16" + # memory: "8Gi" # NOTE: Unless you have enabled the non-default Static CPU Management Policy and are using an integer number of CPUs, we don't recommend setting a CPU limit. 
See: # https://kubernetes.io/docs/tasks/administer-cluster/cpu-management-policies/#static-policy # https://github.com/kubernetes/kubernetes/issues/51135 # limits: # cpu: "16" - # memory: "4Gi" + # memory: "8Gi" ports: - containerPort: 26257 name: grpc From 44ab542827a5b943b456fbe323bfd48d03ccbde4 Mon Sep 17 00:00:00 2001 From: taroface Date: Wed, 22 Jul 2020 19:29:24 -0400 Subject: [PATCH 4/7] cloud: add modifiable ConfigMap for multi-region EKS --- cloud/kubernetes/multiregion/configmap.yaml | 41 +++++++++++++++++++++ 1 file changed, 41 insertions(+) create mode 100644 cloud/kubernetes/multiregion/configmap.yaml diff --git a/cloud/kubernetes/multiregion/configmap.yaml b/cloud/kubernetes/multiregion/configmap.yaml new file mode 100644 index 000000000000..cb276be2efa0 --- /dev/null +++ b/cloud/kubernetes/multiregion/configmap.yaml @@ -0,0 +1,41 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: coredns + namespace: kube-system +data: + Corefile: | + .:53 { + errors + ready + health + kubernetes cluster.local in-addr.arpa ip6.arpa { + pods insecure + upstream + fallthrough in-addr.arpa ip6.arpa + } + prometheus :9153 + forward . /etc/resolv.conf + cache 10 + loop + reload + loadbalance + } + .svc.cluster.local:53 { # <---- Modify + log + errors + ready + cache 10 + forward . { # <---- Modify + force_tcp # <---- Modify + } + } + .svc.cluster.local:53 { # <---- Modify + log + errors + ready + cache 10 + forward . { # <---- Modify + force_tcp # <---- Modify + } + } From b3566a06ee5009f82d0098494ac977e3417f4aeb Mon Sep 17 00:00:00 2001 From: taroface Date: Thu, 13 Aug 2020 22:59:15 -0400 Subject: [PATCH 5/7] add EKS configs README, relocate configs --- cloud/kubernetes/multiregion/README.md | 4 +- cloud/kubernetes/multiregion/eks/README.md | 89 +++++++++++++++++++ .../cockroachdb-statefulset-secure-eks.yaml | 0 .../multiregion/{ => eks}/configmap.yaml | 0 .../multiregion/{ => eks}/dns-lb-eks.yaml | 0 5 files changed, 91 insertions(+), 2 deletions(-) create mode 100644 cloud/kubernetes/multiregion/eks/README.md rename cloud/kubernetes/multiregion/{ => eks}/cockroachdb-statefulset-secure-eks.yaml (100%) rename cloud/kubernetes/multiregion/{ => eks}/configmap.yaml (100%) rename cloud/kubernetes/multiregion/{ => eks}/dns-lb-eks.yaml (100%) diff --git a/cloud/kubernetes/multiregion/README.md b/cloud/kubernetes/multiregion/README.md index 3c8dcb055ed0..57b1fd0b9b90 100644 --- a/cloud/kubernetes/multiregion/README.md +++ b/cloud/kubernetes/multiregion/README.md @@ -1,8 +1,8 @@ -# Running CockroachDB across multiple Kubernetes clusters +# Running CockroachDB across multiple Kubernetes clusters (GKE) The script and configuration files in this directory enable deploying CockroachDB across multiple Kubernetes clusters that are spread across different -geographic regions. It deploys a CockroachDB +geographic regions and hosted on [GKE](https://cloud.google.com/kubernetes-engine). It deploys a CockroachDB [StatefulSet](https://kubernetes.io/docs/concepts/workloads/controllers/statefulset/) into each separate cluster, and links them together using DNS. 
diff --git a/cloud/kubernetes/multiregion/eks/README.md b/cloud/kubernetes/multiregion/eks/README.md new file mode 100644 index 000000000000..1348cf5770a4 --- /dev/null +++ b/cloud/kubernetes/multiregion/eks/README.md @@ -0,0 +1,89 @@ +# Running CockroachDB across multiple Kubernetes clusters (EKS) + +The configuration files in this directory enable a multi-region CockroachDB deployment on [Amazon EKS](https://aws.amazon.com/eks/), using multiple Kubernetes clusters in different geographic regions. They are primarily intended for use with our [Orchestrate CockroachDB Across Multiple Kubernetes Clusters](https://www.cockroachlabs.com/docs/stable/orchestrate-cockroachdb-with-kubernetes-multi-cluster.html#eks) tutorial, but can be modified for use with any multi-region CockroachDB deployment hosted on EKS. + +Note that a successful multi-region deployment also requires configuring your EC2 network for inter-region traffic, which is covered fully in our tutorial. + +## Usage + +The below assumes you have created a Kubernetes cluster in each region in which you want to deploy CockroachDB. + +Each of the 3 configuration files must be applied separately to each Kubernetes cluster. + +### Create StatefulSets + +[`cockroachdb-statefulset-secure-eks.yaml`](https://github.com/cockroachdb/cockroach/cloud/kubernetes/multiregion/eks/cockroachdb-statefulset-secure-eks.yaml) creates a StatefulSet that runs 3 CockroachDB pods in a single region. + +Because the multi-region deployment requires deploying CockroachDB to a separate Kubernetes cluster in each region, you need to customize and apply a separate version of this file to each region. + +Use the `namespace` field to specify a namespace other than `default` in which to run the CockroachDB pods. This should correspond to the region in which the Kubernetes cluster is deployed (e.g., `us-east-1`). + +``` +namespace: +``` + +Also create the namespace in the appropriate region by running `kubectl create namespace --context=`. + +Change the resource `requests` and `limits` to appropriate values for the hardware that you're running. You can see the allocatable resources on each of your Kubernetes nodes by running `kubectl describe nodes`. + +``` +resources: + requests: + cpu: "16" + memory: "8Gi" + limits: + memory: "8Gi" +``` + +Replace the placeholder values in the `--join` and `--locality` flags with the namespace of the CockroachDB cluster in each region (e.g., `us-east-1`). `--join` specifies the host addresses that connect nodes to the cluster and distribute the rest of the node addresses. `--locality` describes the location of each CockroachDB node. + +``` +--join cockroachdb-0.cockroachdb.,cockroachdb-1.cockroachdb.,cockroachdb-2.cockroachdb.,cockroachdb-0.cockroachdb.,cockroachdb-1.cockroachdb.,cockroachdb-2.cockroachdb.,cockroachdb-0.cockroachdb.,cockroachdb-1.cockroachdb.,cockroachdb-2.cockroachdb. +--locality=region=,az=$(cat /etc/cockroach-env/zone),dns=$(hostname -f) +``` + +You can then deploy the StatefulSet in each region, specifying the appropriate cluster context and namespace (which you defined above): + +``` +kubectl create -f --context= --namespace= +``` + +Before initializing the cluster, however, you must enable CockroachDB pods to communicate across regions. This includes peering the VPCs in all 3 regions with each other, setting up a [Network Load Balancer](#set-up-load-balancing) in each region, and [configuring a CoreDNS service](#configure-coredns) to route DNS traffic to the appropriate pods. 
For information on configuring the EC2 network, see our [documentation](https://www.cockroachlabs.com/docs/stable/orchestrate-cockroachdb-with-kubernetes-multi-cluster.html#eks). + +### Set up load balancing + +[`dns-lb-eks.yaml`](https://github.com/cockroachdb/cockroach/cloud/kubernetes/multiregion/eks/dns-lb-eks.yaml) creates a [Network Load Balancer](https://docs.aws.amazon.com/elasticloadbalancing/latest/network/introduction.html) pointed at the CoreDNS service that routes DNS traffic to the appropriate pods. + +Upload the load balancer manifest to each region: + +``` +kubectl create -f https://raw.githubusercontent.com/cockroachdb/cockroach/master/cloud/kubernetes/multiregion/eks/dns-lb-eks.yaml --context= +``` + +### Configure CoreDNS + +[`configmap.yaml`](https://github.com/cockroachdb/cockroach/cloud/kubernetes/multiregion/eks/configmap.yaml) is a template for [modifying the ConfigMap](https://kubernetes.io/docs/tasks/administer-cluster/dns-custom-nameservers/#coredns-configmap-options) for the CoreDNS Corefile in each region. + +You must define a separate ConfigMap for each region. Each unique ConfigMap lists the forwarding addresses for the pods in the 2 other regions. + +For each region, replace: + +- `region2` and `region3` with the namespaces in which the CockroachDB pods will run in the other 2 regions. + +- `ip1`, `ip2`, and `ip3` with the IP addresses of the EKS instances in the region. + +First back up the existing ConfigMap in each region: + +``` +kubectl -n kube-system get configmap coredns -o yaml > +``` + +Then apply the new ConfigMap: + +``` +kubectl apply -f --context= +``` + +## More information + +For more information on running CockroachDB in Kubernetes, please see the [README in the parent directory](../../README.md). diff --git a/cloud/kubernetes/multiregion/cockroachdb-statefulset-secure-eks.yaml b/cloud/kubernetes/multiregion/eks/cockroachdb-statefulset-secure-eks.yaml similarity index 100% rename from cloud/kubernetes/multiregion/cockroachdb-statefulset-secure-eks.yaml rename to cloud/kubernetes/multiregion/eks/cockroachdb-statefulset-secure-eks.yaml diff --git a/cloud/kubernetes/multiregion/configmap.yaml b/cloud/kubernetes/multiregion/eks/configmap.yaml similarity index 100% rename from cloud/kubernetes/multiregion/configmap.yaml rename to cloud/kubernetes/multiregion/eks/configmap.yaml diff --git a/cloud/kubernetes/multiregion/dns-lb-eks.yaml b/cloud/kubernetes/multiregion/eks/dns-lb-eks.yaml similarity index 100% rename from cloud/kubernetes/multiregion/dns-lb-eks.yaml rename to cloud/kubernetes/multiregion/eks/dns-lb-eks.yaml From 4474d832b1542a471786839c324f37808eacf5ee Mon Sep 17 00:00:00 2001 From: taroface Date: Thu, 20 Aug 2020 11:09:41 -0400 Subject: [PATCH 6/7] cloud: fix EKS configs README --- cloud/kubernetes/multiregion/eks/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cloud/kubernetes/multiregion/eks/README.md b/cloud/kubernetes/multiregion/eks/README.md index 1348cf5770a4..986212309c5e 100644 --- a/cloud/kubernetes/multiregion/eks/README.md +++ b/cloud/kubernetes/multiregion/eks/README.md @@ -70,7 +70,7 @@ For each region, replace: - `region2` and `region3` with the namespaces in which the CockroachDB pods will run in the other 2 regions. -- `ip1`, `ip2`, and `ip3` with the IP addresses of the EKS instances in the region. +- `ip1`, `ip2`, and `ip3` with the IP addresses of the Network Load Balancers in the region. 
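As a purely illustrative sketch (the `us-west-2` namespace and the IP addresses below are hypothetical, not values from this repository), one region's forwarding block in the modified Corefile would then look something like:

```
us-west-2.svc.cluster.local:53 {        # namespace of the CockroachDB pods in that region
    log
    errors
    ready
    cache 10
    forward . 10.10.1.10 10.10.2.10 10.10.3.10 {        # NLB IP addresses for that region
        force_tcp
    }
}
```

Each region's ConfigMap contains one such block for each of the other 2 regions.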
First back up the existing ConfigMap in each region: From 8d54bcec1c858b9b6477d7f90bc277253c1d2145 Mon Sep 17 00:00:00 2001 From: Andrei Matei Date: Fri, 14 Aug 2020 16:00:46 -0400 Subject: [PATCH 7/7] kvclient: don't spin in the DistSender trying the same replica over and over This patch addresses a scenario where a lease indicates a replica that, when contacted, claims to not have the lease and instead returns an older lease. In this scenario, the DistSender detects the fact that the node returned an old lease (which means that it's not aware of the new lease that it has acquired - for example because it hasn't applied it yet whereas other replicas have) and retries the same replica (with a backoff). Before this patch, the DistSender would retry the replica ad infinitum, hoping that it'll eventually become aware of its new lease. However, it's possible that the replica never finds out about this new lease (or, at least, not until the lease expires and a new leaseholder steps up). This could happen if the a replica acquires a lease but gets partitioned from all the other replicas before applying it. This patch puts a bound on the number of times the DistSender will retry the same replica in a row before moving on to others. Release note: None --- pkg/kv/kvclient/kvcoord/dist_sender.go | 26 +++++- pkg/kv/kvclient/kvcoord/dist_sender_test.go | 99 +++++++++++++++++++++ 2 files changed, 123 insertions(+), 2 deletions(-) diff --git a/pkg/kv/kvclient/kvcoord/dist_sender.go b/pkg/kv/kvclient/kvcoord/dist_sender.go index f183d212b72d..7a05c6f8474f 100644 --- a/pkg/kv/kvclient/kvcoord/dist_sender.go +++ b/pkg/kv/kvclient/kvcoord/dist_sender.go @@ -137,6 +137,9 @@ const ( defaultSenderConcurrency = 500 // The maximum number of range descriptors to prefetch during range lookups. rangeLookupPrefetchCount = 8 + // The maximum number of times a replica is retried when it repeatedly returns + // stale lease info. + sameReplicaRetryLimit = 10 ) var rangeDescriptorCacheSize = settings.RegisterIntSetting( @@ -1770,6 +1773,8 @@ func (ds *DistSender) sendToReplicas( // lease transfer is suspected. inTransferRetry := retry.StartWithCtx(ctx, ds.rpcRetryOptions) inTransferRetry.Next() // The first call to Next does not block. + var sameReplicaRetries int + var prevReplica roachpb.ReplicaDescriptor // This loop will retry operations that fail with errors that reflect // per-replica state and may succeed on other replicas. @@ -1805,7 +1810,13 @@ func (ds *DistSender) sendToReplicas( } } else { log.VEventf(ctx, 2, "trying next peer %s", curReplica.String()) + if prevReplica == curReplica { + sameReplicaRetries++ + } else { + sameReplicaRetries = 0 + } } + prevReplica = curReplica // Communicate to the server the information our cache has about the range. // If it's stale, the serve will return an update. ba.ClientRangeInfo = &roachpb.ClientRangeInfo{ @@ -1948,9 +1959,20 @@ func (ds *DistSender) sendToReplicas( routing = routing.UpdateLeaseholder(ctx, *tErr.LeaseHolder) ok = true } - // Move the new lease holder to the head of the queue for the next retry. + // Move the new leaseholder to the head of the queue for the next + // retry. Note that the leaseholder might not be the one indicated by + // the NLHE we just received, in case that error carried stale info. if lh := routing.Leaseholder(); lh != nil { - transport.MoveToFront(*lh) + // If the leaseholder is the replica that we've just tried, and + // we've tried this replica a bunch of times already, let's move on + // and not try it again. 
This prevents us getting stuck on a replica + // that we think has the lease but keeps returning redirects to us + // (possibly because it hasn't applied its lease yet). Perhaps that + // lease expires and someone else gets a new one, so by moving on we + // get out of possibly infinite loops. + if *lh != curReplica || sameReplicaRetries < sameReplicaRetryLimit { + transport.MoveToFront(*lh) + } } // See if we want to backoff a little before the next attempt. If the lease info // we got is stale, we backoff because it might be the case that there's a diff --git a/pkg/kv/kvclient/kvcoord/dist_sender_test.go b/pkg/kv/kvclient/kvcoord/dist_sender_test.go index 1c631d425b04..67bc4cf8797d 100644 --- a/pkg/kv/kvclient/kvcoord/dist_sender_test.go +++ b/pkg/kv/kvclient/kvcoord/dist_sender_test.go @@ -34,6 +34,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/rpc/nodedialer" "github.com/cockroachdb/cockroach/pkg/settings/cluster" "github.com/cockroachdb/cockroach/pkg/testutils" + "github.com/cockroachdb/cockroach/pkg/testutils/skip" "github.com/cockroachdb/cockroach/pkg/util" "github.com/cockroachdb/cockroach/pkg/util/hlc" "github.com/cockroachdb/cockroach/pkg/util/leaktest" @@ -792,6 +793,104 @@ func TestBackoffOnNotLeaseHolderErrorDuringTransfer(t *testing.T) { } } +// Test a scenario where a lease indicates a replica that, when contacted, +// claims to not have the lease and instead returns an older lease. In this +// scenario, the DistSender detects the fact that the node returned an old lease +// (which means that it's not aware of the new lease that it has acquired - for +// example because it hasn't applied it yet whereas other replicas have) and +// retries the same replica (with a backoff). We don't want the DistSender to do +// this ad infinitum, in case the respective replica never becomes aware of its +// new lease. Eventually that lease will expire and someone else can get it, but +// if the DistSender would just spin forever on this replica it will never find +// out about it. This could happen if the a replica acquires a lease but gets +// partitioned from all the other replicas before applying it. +// The DistSender is supposed to spin a few times and then move on to other +// replicas. +func TestDistSenderMovesOnFromReplicaWithStaleLease(t *testing.T) { + defer leaktest.AfterTest(t)() + defer log.Scope(t).Close(t) + // This test does many retries in the DistSender for contacting a replica, + // which run into DistSender's backoff policy. 
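+	// The accumulated backoff makes this test slow, so it is skipped under -short.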
+ skip.UnderShort(t) + ctx := context.Background() + stopper := stop.NewStopper() + defer stopper.Stop(ctx) + + clock := hlc.NewClock(hlc.UnixNano, time.Nanosecond) + rpcContext := rpc.NewInsecureTestingContext(clock, stopper) + g := makeGossip(t, stopper, rpcContext) + for _, n := range testUserRangeDescriptor3Replicas.Replicas().Voters() { + require.NoError(t, g.AddInfoProto( + gossip.MakeNodeIDKey(n.NodeID), + newNodeDesc(n.NodeID), + gossip.NodeDescriptorTTL, + )) + } + + desc := roachpb.RangeDescriptor{ + RangeID: 1, + Generation: 1, + StartKey: roachpb.RKeyMin, + EndKey: roachpb.RKeyMax, + InternalReplicas: []roachpb.ReplicaDescriptor{ + {NodeID: 1, StoreID: 1, ReplicaID: 1}, + {NodeID: 2, StoreID: 2, ReplicaID: 2}, + }, + } + staleLease := roachpb.Lease{ + Replica: desc.InternalReplicas[0], + Sequence: 1, + } + cachedLease := roachpb.Lease{ + Replica: desc.InternalReplicas[1], + Sequence: 2, + } + + // The cache starts with a lease on node 2, so the first request will be + // routed there. That replica will reply with an older lease, prompting the + // DistSender to try it again. Eventually the DistSender will try the other + // replica, which will return a success. + + var callsToNode2 int + sendFn := func(ctx context.Context, ba roachpb.BatchRequest) (*roachpb.BatchResponse, error) { + if ba.Replica.NodeID == 2 { + callsToNode2++ + reply := &roachpb.BatchResponse{} + err := &roachpb.NotLeaseHolderError{Lease: &staleLease} + reply.Error = roachpb.NewError(err) + return reply, nil + } + require.Equal(t, ba.Replica.NodeID, roachpb.NodeID(1)) + return ba.CreateReply(), nil + } + + cfg := DistSenderConfig{ + AmbientCtx: log.AmbientContext{Tracer: tracing.NewTracer()}, + Clock: clock, + NodeDescs: g, + RPCContext: rpcContext, + TestingKnobs: ClientTestingKnobs{ + TransportFactory: adaptSimpleTransport(sendFn), + }, + RangeDescriptorDB: threeReplicaMockRangeDescriptorDB, + NodeDialer: nodedialer.New(rpcContext, gossip.AddressResolver(g)), + Settings: cluster.MakeTestingClusterSettings(), + } + ds := NewDistSender(cfg) + + ds.rangeCache.Insert(ctx, roachpb.RangeInfo{ + Desc: desc, + Lease: cachedLease, + }) + + get := roachpb.NewGet(roachpb.Key("a")) + _, pErr := kv.SendWrapped(ctx, ds, get) + require.Nil(t, pErr) + + require.Greater(t, callsToNode2, 0) + require.LessOrEqual(t, callsToNode2, 11) +} + // This test verifies that when we have a cached leaseholder that is down // it is ejected from the cache. func TestDistSenderDownNodeEvictLeaseholder(t *testing.T) {