From 064401e4b03e18b5c165bcc905b28764c9495ad9 Mon Sep 17 00:00:00 2001
From: Alex Robinson <alexdwanerobinson@gmail.com>
Date: Mon, 24 Oct 2016 10:01:08 -0400
Subject: [PATCH] k8s: Fix potential edge case where a second cluster could get
 started

* Add an init container that checks for whether any other peers exist
* Re-enable the tolerate-unready-endpoints option. It's bprashanth's
  recommendation, and makes the init container less likely to miss
  anything.
* Switch from joining `cockroachdb` to joining `cockroachdb-public`.
  This is needed due to re-enabling `tolerate-unready-endpoints`.

I wish we could just directly re-use the peer-finder container without
having to wrap it, but I already burned too much time trying to get it
to accept non-trivial commands in its `--on-start` parameter. Almost
everything I tried would just get treated to a "No such file or
directory" error.
---
 cloud/kubernetes/README.md               | 16 ++++-
 cloud/kubernetes/cockroachdb-petset.yaml | 85 ++++++++++++++++++------
 cloud/kubernetes/init/Dockerfile         |  6 ++
 cloud/kubernetes/init/README.md          | 30 +++++++++
 cloud/kubernetes/init/on-start.sh        |  8 +++
 5 files changed, 121 insertions(+), 24 deletions(-)
 create mode 100644 cloud/kubernetes/init/Dockerfile
 create mode 100644 cloud/kubernetes/init/README.md
 create mode 100755 cloud/kubernetes/init/on-start.sh

diff --git a/cloud/kubernetes/README.md b/cloud/kubernetes/README.md
index 48cf6d2badcd..573cedb2716a 100644
--- a/cloud/kubernetes/README.md
+++ b/cloud/kubernetes/README.md
@@ -98,8 +98,7 @@ Start up a client pod and open up an interactive, (mostly) Postgres-flavor
 SQL shell using:
 
 ```console
-$ kubectl run -it cockroach-client --image=cockroachdb/cockroach --restart=Never --command -- bash
-root@cockroach-client # ./cockroach sql --host cockroachdb-public
+$ kubectl run -it --rm cockroach-client --image=cockroachdb/cockroach --restart=Never --command -- ./cockroach sql --host cockroachdb-public
 ```
 
 You can see example SQL statements for inserting and querying data in the
@@ -107,6 +106,19 @@ included [demo script](demo.sh), but can use almost any Postgres-style SQL
 commands. Some more basic examples can be found within
 [CockroachDB's documentation](https://www.cockroachlabs.com/docs/learn-cockroachdb-sql.html).
 
+## Accessing the admin UI
+
+If you want to see information about how the cluster is doing, you can try
+pulling up the CockroachDB admin UI by port-forwarding from your local machine
+to one of the pods:
+
+```shell
+kubectl port-forward cockroachdb-0 8080
+```
+
+Once you’ve done that, you should be able to access the admin UI by visiting
+http://localhost:8080/ in your web browser.
+
 ## Simulating failures
 
 When all (or enough) nodes are up, simulate a failure like this:
diff --git a/cloud/kubernetes/cockroachdb-petset.yaml b/cloud/kubernetes/cockroachdb-petset.yaml
index cf3c285966c9..c7d5bf4fc536 100644
--- a/cloud/kubernetes/cockroachdb-petset.yaml
+++ b/cloud/kubernetes/cockroachdb-petset.yaml
@@ -23,17 +23,25 @@ spec:
 apiVersion: v1
 kind: Service
 metadata:
+  # This service only exists to create DNS entries for each pet in the petset
+  # such that they can resolve each other's IP addresses. It does not create a
+  # load-balanced ClusterIP and should not be used directly by clients in most
+  # circumstances.
+  name: cockroachdb
+  labels:
+    app: cockroachdb
   annotations:
+    # This is needed to make the peer-finder work properly and to help avoid
+    # edge cases where instance 0 comes up after losing its data and needs to
+    # decide whether it should create a new cluster or try to join an existing
+    # one. If it creates a new cluster when it should have joined an existing
+    # one, we'd end up with two separate clusters listening at the same service
+    # endpoint, which would be very bad.
+    service.alpha.kubernetes.io/tolerate-unready-endpoints: "true"
     # Enable automatic monitoring of all instances when Prometheus is running in the cluster.
     prometheus.io/scrape: "true"
     prometheus.io/path: "_status/vars"
     prometheus.io/port: "8080"
-  # This service only exists to create DNS entries for each pet in the petset such that they can resolve
-  # each other's IP addresses. It does not create a load-balanced ClusterIP and should not be used
-  # directly by clients in most circumstances.
-  name: cockroachdb
-  labels:
-    app: cockroachdb
 spec:
   ports:
   - port: 26257
@@ -59,6 +67,43 @@ spec:
         app: cockroachdb
       annotations:
         pod.alpha.kubernetes.io/initialized: "true"
+        # Init containers are run only once in the lifetime of a pod, before
+        # it's started up for the first time. Each one has to exit successfully
+        # before the pod's main containers are allowed to start.
+        # This particular init container does a DNS lookup for other pods in
+        # the petset to help determine whether or not a cluster already exists.
+        # If any other pets exist, it creates a marker file (cluster_exists_marker)
+        # in the cockroach-data directory; the primary container checks for that
+        # file when deciding whether to pass --join when starting CockroachDB.
+        # This only matters when a pod's persistent volume is empty - if it has
+        # data from a previous execution, that data will always be used.
+        pod.alpha.kubernetes.io/init-containers: '[
+            {
+                "name": "bootstrap",
+                "image": "cockroachdb/cockroach-k8s-init:0.1",
+                "args": [
+                  "-on-start=/on-start.sh",
+                  "-service=cockroachdb"
+                ],
+                "env": [
+                  {
+                      "name": "POD_NAMESPACE",
+                      "valueFrom": {
+                          "fieldRef": {
+                              "apiVersion": "v1",
+                              "fieldPath": "metadata.namespace"
+                          }
+                      }
+                   }
+                ],
+                "volumeMounts": [
+                    {
+                        "name": "datadir",
+                        "mountPath": "/cockroach/cockroach-data"
+                    }
+                ]
+            }
+        ]'
     spec:
       containers:
       - name: cockroachdb
@@ -94,24 +139,20 @@ spec:
             # The use of qualified `hostname -f` is crucial:
             # Other nodes aren't able to look up the unqualified hostname.
             CRARGS=("start" "--logtostderr" "--insecure" "--host" "$(hostname -f)" "--http-host" "0.0.0.0")
-            # TODO(tschottdorf): really want to use an init container to do
-            # the bootstrapping. The idea is that the container would know
-            # whether it's on the first node and could check whether there's
-            # already a data directory. If not, it would bootstrap the cluster.
-            # We will need some version of `cockroach init` back for this to
-            # work. For now, just do the same in a shell snippet.
-            # Of course this isn't without danger - if node0 loses its data,
-            # upon restarting it will simply bootstrap a new cluster and smack
-            # it into our existing cluster.
-            # There are likely ways out. For example, the init container could
-            # query the kubernetes API and see whether any other nodes are
-            # around, etc. Or, of course, the admin can pre-seed the lost
-            # volume somehow (and in that case we should provide a better way,
-            # for example a marker file).
+            # We only want to initialize a new cluster (by omitting the join flag)
+            # if we're sure that we're the first node (i.e. index 0) and that
+            # there aren't any other nodes running as part of the cluster that
+            # this is supposed to be a part of (which indicates that a cluster
+            # already exists and we should make sure not to create a new one).
+            # It's fine to run without --join on a restart if there aren't any
+            # other nodes.
             if [ ! "$(hostname)" == "cockroachdb-0" ] || \
-               [ -e "/cockroach/cockroach-data/COCKROACHDB_VERSION" ]
+               [ -e "/cockroach/cockroach-data/cluster_exists_marker" ]
             then
-              CRARGS+=("--join" "cockroachdb")
+              # We don't join cockroachdb in order to avoid a node attempting
+              # to join itself, which currently doesn't work
+              # (https://github.com/cockroachdb/cockroach/issues/9625).
+              CRARGS+=("--join" "cockroachdb-public")
             fi
             exec /cockroach/cockroach ${CRARGS[*]}
       # No pre-stop hook is required, a SIGTERM plus some time is all that's
diff --git a/cloud/kubernetes/init/Dockerfile b/cloud/kubernetes/init/Dockerfile
new file mode 100644
index 000000000000..4e204ffd9953
--- /dev/null
+++ b/cloud/kubernetes/init/Dockerfile
@@ -0,0 +1,6 @@
+FROM gcr.io/google_containers/peer-finder:0.1
+# Bundle the script that peer-finder is told to run via its -on-start flag.
+COPY on-start.sh /
+RUN chmod -c 755 /on-start.sh
+# Arguments (-on-start, -service) are supplied by the pod's init-container spec.
+ENTRYPOINT ["/peer-finder"]
diff --git a/cloud/kubernetes/init/README.md b/cloud/kubernetes/init/README.md
new file mode 100644
index 000000000000..b5372add6a7e
--- /dev/null
+++ b/cloud/kubernetes/init/README.md
@@ -0,0 +1,30 @@
+# Overview
+
+The Dockerfile in this directory defines a lightweight wrapper around the
+[Kubernetes-maintained "peer-finder"
+image](https://github.com/kubernetes/contrib/tree/master/pets/peer-finder),
+which finds whether any other instances from the same PetSet currently exist in
+the cluster.
+
+The `on-start.sh` script in this directory is invoked by the peer-finder binary
+with a newline-separated list of the DNS results matching the provided
+Kubernetes service name and namespace.
+
+We use this to try to help the first CockroachDB instance decide whether it
+should try to join an existing cluster or initialize a new one. We have to be
+very careful about initializing a new one, since doing so when one already
+exists can cause some real problems.
+
+# Pushing a new version
+
+Assuming you're logged in to a Docker Hub account that can push to the
+cockroachdb organization, [check the latest tag of the
+cockroachdb/cockroach-k8s-init
+container](https://hub.docker.com/r/cockroachdb/cockroach-k8s-init/tags/) so
+that you know what tag number to use next, then cd to this directory and run:
+
+```shell
+NEW_TAG=0.0 # replace 0.0 with the next appropriate tag number
+docker build -t "cockroachdb/cockroach-k8s-init:${NEW_TAG}" .
+docker push "cockroachdb/cockroach-k8s-init:${NEW_TAG}"
+```
diff --git a/cloud/kubernetes/init/on-start.sh b/cloud/kubernetes/init/on-start.sh
new file mode 100755
index 000000000000..51e93c9f22a7
--- /dev/null
+++ b/cloud/kubernetes/init/on-start.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+
+# peer-finder pipes the newline-separated peer list to stdin. If any peer other
+# than this pod is listed, assume a cluster already exists and leave a marker
+# on the data volume so the main container joins it instead of bootstrapping.
+if grep -vF "$(hostname -f)"; then
+  mkdir -p /cockroach/cockroach-data && touch /cockroach/cockroach-data/cluster_exists_marker
+fi