From 81346b81c991db1a327b84228d2031394f9deb46 Mon Sep 17 00:00:00 2001
From: Travis Nielsen
Date: Fri, 15 Nov 2019 11:32:56 -0700
Subject: [PATCH] ceph: stop osd process more quickly during pod shutdown

The OSD needs to shut down quickly during upgrade or in other scenarios
where the OSD is being restarted. To facilitate this fast shutdown, rook
will run kill -9 on the osd process. The Ceph OSD is designed to be safe
even when killed like this. This allows ECONNREFUSED to be returned
sooner, which will redirect the OSD traffic to other OSDs and cause less
downtime.

Signed-off-by: Travis Nielsen
---
 pkg/daemon/ceph/osd/daemon.go | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/pkg/daemon/ceph/osd/daemon.go b/pkg/daemon/ceph/osd/daemon.go
index dd131a490d575..51aa5d6a6c42f 100644
--- a/pkg/daemon/ceph/osd/daemon.go
+++ b/pkg/daemon/ceph/osd/daemon.go
@@ -120,8 +120,14 @@ func killCephOSDProcess(context *clusterd.Context, lvPath string) error {
 
 	// shut down the osd-ceph process so that lvm release does not show device in use error.
 	if pid != "" {
-		if err := context.Executor.ExecuteCommand(false, "", "kill", pid); err != nil {
-			return fmt.Errorf("failed to delete ceph-osd process. %+v", err)
+		// The OSD needs to exit as quickly as possible in order for the IO requests
+		// to be redirected to other OSDs in the cluster. The OSD is designed to tolerate failures
+		// of any kind, including power loss or kill -9. The upstream Ceph tests have for many years
+		// been testing with kill -9, so this is expected to be safe. There is a fix in upstream Ceph
+		// that will improve the shutdown time of the OSD. For cleanliness we should consider removing
+		// the -9 once it is backported to Nautilus: https://github.com/ceph/ceph/pull/31677.
+		if err := context.Executor.ExecuteCommand(false, "", "kill", "-9", pid); err != nil {
+			return fmt.Errorf("failed to kill ceph-osd process. %+v", err)
 		}
 	}
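
For illustration, below is a minimal standalone sketch of the same fast-shutdown
idea outside of Rook's executor abstraction: find the ceph-osd pid and send
SIGKILL directly. The pgrep-based process discovery and the killOSDProcess
helper are assumptions made for this example, not Rook code.

package main

import (
	"fmt"
	"os/exec"
	"strconv"
	"strings"
	"syscall"
)

// killOSDProcess is a hypothetical helper, not part of Rook. It mirrors the
// patch's approach: kill -9 the ceph-osd process so it exits immediately and
// clients see ECONNREFUSED sooner, redirecting IO to other OSDs.
func killOSDProcess() error {
	// Find running ceph-osd processes; pgrep prints one pid per line.
	out, err := exec.Command("pgrep", "-f", "ceph-osd").Output()
	if err != nil {
		return fmt.Errorf("no ceph-osd process found. %+v", err)
	}
	for _, field := range strings.Fields(string(out)) {
		pid, err := strconv.Atoi(field)
		if err != nil {
			continue
		}
		// syscall.SIGKILL is the direct equivalent of `kill -9 <pid>`. Per the
		// commit message, the OSD tolerates this: upstream Ceph tests have
		// killed OSDs with -9 for years.
		if err := syscall.Kill(pid, syscall.SIGKILL); err != nil {
			return fmt.Errorf("failed to kill ceph-osd process %d. %+v", pid, err)
		}
	}
	return nil
}

func main() {
	if err := killOSDProcess(); err != nil {
		fmt.Println(err)
	}
}

Note the tradeoff the patch comments call out: SIGKILL is used only because a
fast exit matters more than a clean one here, and the -9 can be dropped once
the upstream shutdown fix (https://github.com/ceph/ceph/pull/31677) is
backported to Nautilus.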