Skip to content

Commit

Permalink
disk_Check: Scan & mount as RW when disk turns into Read-only (sonic-…
Browse files Browse the repository at this point in the history
…net#1872)

* disk_check: Check & mount RO as RW using tmpfs (sonic-net#1569)

What I did
There is a bug that occasionally turn root-overlay as RO. This makes /etc & /home as RO. This blocks any new remote user login, as that needs to write into /etc & /home.

This tool scans /etc & /home (or given dirs) as in RW or RO state. If RO, it could create a writable overlay using tmpfs.
This is transient and stays until next reboot. Any write after the overlay will be lost upon reboot.

But this allows new remote users login.

How I did it
Create upper & work dirs in /run/mount (tmpfs). Mount /etc & /home as lowerdirs and use the same name for final merge. This allows anyone opening a file in /etc or /home to operate on the merged overlay, transparently.

How to verify it
Mount any dir on tmpfs ( mount -t tmpfs tmpfs test_dir)
remount as RO (mount -o remount,ro test_dir)
Pass that dir to this script. (disk_check.py -d ./test_dir)
Now it should be RW
  • Loading branch information
renukamanavalan authored Nov 20, 2021
1 parent c80321c commit e648290
Show file tree
Hide file tree
Showing 3 changed files with 361 additions and 0 deletions.
180 changes: 180 additions & 0 deletions scripts/disk_check.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,180 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
What:
There have been cases, where disk turns Read-only due to kernel bug.
In Read-only state, system blocks new remote user login via TACACS.
This utility is to check & make transient recovery as needed.
How:
check for Read-Write permission. If Read-only, create writable overlay using tmpfs.
By default "/etc" & "/home" are checked and if in Read-only state, make them Read-Write
using overlay on top of tmpfs.
Making /etc & /home as writable lets successful new remote user login.
If in Read-only state or in Read-Write state with the help of tmpfs overlay,
syslog ERR messages are written, to help raise alerts.
Monit may be used to invoke it periodically, to help scan & fix and
report via syslog.
Tidbit:
If you would like to test this script, you could simulate a RO disk
with the following command. Reboot will revert the effect.
sudo bash -c "echo u > /proc/sysrq-trigger"
"""

import argparse
import os
import sys
import syslog
import subprocess

UPPER_DIR = "/run/mount/upper"
WORK_DIR = "/run/mount/work"
MOUNTS_FILE = "/proc/mounts"

chk_log_level = syslog.LOG_ERR

def _log_msg(lvl, pfx, msg):
if lvl <= chk_log_level:
print("{}: {}".format(pfx, msg))
syslog.syslog(lvl, msg)

def log_err(m):
_log_msg(syslog.LOG_ERR, "Err", m)


def log_info(m):
_log_msg(syslog.LOG_INFO, "Info", m)


def log_debug(m):
_log_msg(syslog.LOG_DEBUG, "Debug", m)


def test_writable(dirs):
for d in dirs:
rw = os.access(d, os.W_OK)
if not rw:
log_err("{} is not read-write".format(d))
return False
else:
log_debug("{} is Read-Write".format(d))
return True


def run_cmd(cmd):
p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
(output, err) = p.communicate()
## Wait for end of command. Get return returncode ##
ret = p.wait()
if ret:
log_err("failed: ret={} cmd={}".format(ret, cmd))
else:
log_info("ret={} cmd: {}".format(ret, cmd))

if output:
log_info("stdout: {}".format(output.decode("utf-8")))
if err:
log_info("stderr: {}".format(err.decode("utf-8")))

return ret


def get_dname(path_name):
return os.path.basename(os.path.normpath(path_name))


def do_mnt(dirs):
if os.path.exists(UPPER_DIR):
log_err("Already mounted")
return 1

for i in (UPPER_DIR, WORK_DIR):
try:
os.mkdir(i)
except OSError as error:
log_err("Failed to create {}".format(i))
return 1

for d in dirs:
d_name = get_dname(d)
d_upper = os.path.join(UPPER_DIR, d_name)
d_work = os.path.join(WORK_DIR, d_name)
os.mkdir(d_upper)
os.mkdir(d_work)

ret = run_cmd("mount -t overlay overlay_{} -o lowerdir={},"
"upperdir={},workdir={} {}".format(
d_name, d, d_upper, d_work, d))
if ret:
break

if ret:
for i in (UPPER_DIR, WORK_DIR):
if os.path.exists(i):
ret = run_cmd("rm -rf {}".format(i))
if ret:
log_err("Failed to remove {}".format(i))

log_err("Failed to mount {} as Read-Write".format(dirs))
else:
log_info("{} are mounted as Read-Write".format(dirs))
return ret


def is_mounted(dirs):
if not os.path.exists(UPPER_DIR):
return False

onames = set()
for d in dirs:
onames.add("overlay_{}".format(get_dname(d)))

with open(MOUNTS_FILE, "r") as s:
for ln in s.readlines():
n = ln.strip().split()[0]
if n in onames:
log_debug("Mount exists for {}".format(n))
return True
return False


def do_check(skip_mount, dirs):
ret = 0
if not test_writable(dirs):
if not skip_mount:
ret = do_mnt(dirs)

# Check if mounted
if (not ret) and is_mounted(dirs):
log_err("READ-ONLY: Mounted {} to make Read-Write".format(dirs))

return ret


def main():
global chk_log_level

parser=argparse.ArgumentParser(
description="check disk for Read-Write and mount etc & home as Read-Write")
parser.add_argument('-s', "--skip-mount", action='store_true', default=False,
help="Skip mounting /etc & /home as Read-Write")
parser.add_argument('-d', "--dirs", default="/etc,/home",
help="dirs to mount")
parser.add_argument('-l', "--loglvl", default=syslog.LOG_ERR, type=int,
help="log level")
args = parser.parse_args()

chk_log_level = args.loglvl
ret = do_check(args.skip_mount, args.dirs.split(","))
return ret


if __name__ == "__main__":
sys.exit(main())
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@
'scripts/db_migrator.py',
'scripts/decode-syseeprom',
'scripts/dropcheck',
'scripts/disk_check.py',
'scripts/dropconfig',
'scripts/dropstat',
'scripts/dump_nat_entries.py',
Expand Down
180 changes: 180 additions & 0 deletions tests/disk_check_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,180 @@
import sys
import syslog
from unittest.mock import patch
import pytest
import subprocess

sys.path.append("scripts")
import disk_check

disk_check.MOUNTS_FILE = "/tmp/proc_mounts"

test_data = {
"0": {
"desc": "All good as /tmp is read-write",
"args": ["", "-d", "/tmp"],
"err": ""
},
"1": {
"desc": "Not good as /tmpx is not read-write; But fix skipped",
"args": ["", "-d", "/tmpx", "-s"],
"err": "/tmpx is not read-write"
},
"2": {
"desc": "Not good as /tmpx is not read-write; expect mount",
"args": ["", "-d", "/tmpx"],
"upperdir": "/tmp/tmpx",
"workdir": "/tmp/tmpy",
"mounts": "overlay_tmpx blahblah",
"err": "/tmpx is not read-write|READ-ONLY: Mounted ['/tmpx'] to make Read-Write",
"cmds": ['mount -t overlay overlay_tmpx -o lowerdir=/tmpx,upperdir=/tmp/tmpx/tmpx,workdir=/tmp/tmpy/tmpx /tmpx']
},
"3": {
"desc": "Not good as /tmpx is not read-write; mount fail as create of upper fails",
"args": ["", "-d", "/tmpx"],
"upperdir": "/tmpx",
"expect_ret": 1
},
"4": {
"desc": "Not good as /tmpx is not read-write; mount fail as upper exist",
"args": ["", "-d", "/tmpx"],
"upperdir": "/tmp",
"err": "/tmpx is not read-write|Already mounted",
"expect_ret": 1
},
"5": {
"desc": "/tmp is read-write, but as well mount exists; hence report",
"args": ["", "-d", "/tmp"],
"upperdir": "/tmp",
"mounts": "overlay_tmp blahblah",
"err": "READ-ONLY: Mounted ['/tmp'] to make Read-Write"
},
"6": {
"desc": "Test another code path for good case",
"args": ["", "-d", "/tmp"],
"upperdir": "/tmp"
}
}

err_data = ""
max_log_lvl = -1
cmds = []
current_tc = None

def mount_file(d):
with open(disk_check.MOUNTS_FILE, "w") as s:
s.write(d)


def report_err_msg(lvl, m):
global err_data
global max_log_lvl

if lvl > max_log_lvl:
max_log_lvl = lvl

if lvl == syslog.LOG_ERR:
if err_data:
err_data += "|"
err_data += m


class proc:
returncode = 0
stdout = None
stderr = None

def __init__(self, proc_upd = None):
if proc_upd:
self.returncode = proc_upd.get("ret", 0)
self.stdout = proc_upd.get("stdout", None)
self.stderr = proc_upd.get("stderr", None)


def mock_subproc_run(cmd, shell, stdout):
global cmds

assert shell == True
assert stdout == subprocess.PIPE

upd = (current_tc["proc"][len(cmds)]
if len(current_tc.get("proc", [])) > len(cmds) else None)
cmds.append(cmd)

return proc(upd)


def init_tc(tc):
global err_data, cmds, current_tc

err_data = ""
cmds = []
mount_file(tc.get("mounts", ""))
current_tc = tc


def swap_upper(tc):
tmp_u = tc["upperdir"]
tc["upperdir"] = disk_check.UPPER_DIR
disk_check.UPPER_DIR = tmp_u


def swap_work(tc):
tmp_w = tc["workdir"]
tc["upperdir"] = disk_check.WORK_DIR
disk_check.WORK_DIR = tmp_w


class TestDiskCheck(object):
def setup(self):
pass


@patch("disk_check.syslog.syslog")
@patch("disk_check.subprocess.run")
def test_readonly(self, mock_proc, mock_log):
global err_data, cmds, max_log_lvl

mock_proc.side_effect = mock_subproc_run
mock_log.side_effect = report_err_msg

with patch('sys.argv', ["", "-l", "7", "-d", "/tmp"]):
disk_check.main()
assert max_log_lvl == syslog.LOG_DEBUG
max_log_lvl = -1

for i, tc in test_data.items():
print("-----------Start tc {}---------".format(i))
init_tc(tc)

with patch('sys.argv', tc["args"]):
if "upperdir" in tc:
swap_upper(tc)

if "workdir" in tc:
# restore
swap_work(tc)

ret = disk_check.main()

if "upperdir" in tc:
# restore
swap_upper(tc)

if "workdir" in tc:
# restore
swap_work(tc)

print("ret = {}".format(ret))
print("err_data={}".format(err_data))
print("cmds: {}".format(cmds))

assert ret == tc.get("expect_ret", 0)
if "err" in tc:
assert err_data == tc["err"]
assert cmds == tc.get("cmds", [])
print("-----------End tc {}-----------".format(i))


assert max_log_lvl == syslog.LOG_ERR

0 comments on commit e648290

Please sign in to comment.