Skip to content

Commit

Permalink
🌱 Check disk before provisioning (bare-metal) (#1433)
Browse files Browse the repository at this point in the history
  • Loading branch information
guettli authored Aug 13, 2024
1 parent 08f2453 commit c291712
Show file tree
Hide file tree
Showing 8 changed files with 247 additions and 5 deletions.
5 changes: 3 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -182,12 +182,13 @@ deploy-controller: generate-manifests $(KUSTOMIZE) ## Deploy controller to the K
undeploy-controller: ## Undeploy controller from the K8s cluster specified in ~/.kube/config.
$(KUSTOMIZE) build config/default | $(KUBECTL) delete -f -

install-essentials: ## This gets the secret and installs a CNI and the CCM. Usage: MAKE install-essentials NAME=<cluster-name>
$(MAKE) wait-and-get-secret CLUSTER_NAME=$(NAME)
install-essentials: ## This gets the secret and installs a CNI and the CCM. Usage: MAKE install-essentials CLUSTER_NAME=<cluster-name>
$(MAKE) wait-and-get-secret CLUSTER_NAME=$(CLUSTER_NAME)
$(MAKE) install-cilium-in-wl-cluster
$(MAKE) install-ccm-in-wl-cluster

wait-and-get-secret:
./hack/ensure-env-variables.sh CLUSTER_NAME
# Wait for the kubeconfig to become available.
rm -f $(WORKER_CLUSTER_KUBECONFIG)
${TIMEOUT} --foreground 5m bash -c "while ! $(KUBECTL) get secrets | grep $(CLUSTER_NAME)-kubeconfig; do sleep 1; done"
Expand Down
2 changes: 2 additions & 0 deletions api/v1beta1/conditions_const.go
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,8 @@ const (
SSHToRescueSystemFailedReason = "SSHToRescueSystemFailed"
// RebootTimedOutReason indicates that the reboot timed out.
RebootTimedOutReason = "RebootTimedOut"
// CheckDiskFailedReason indicates that checking the health of the disk was not successful.
CheckDiskFailedReason = "CheckDiskFailed"
)

const (
Expand Down
3 changes: 3 additions & 0 deletions api/v1beta1/hetznerbaremetalhost_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,9 @@ const (
// WipeDiskAnnotation indicates which Disks (WWNs) to erase before provisioning
// The value is a list of WWNS or "all".
WipeDiskAnnotation = "wipedisk.hetznerbaremetalhost.infrastructure.cluster.x-k8s.io"

// IgnoreCheckDiskAnnotation indicates that the machine should get provisioned, even if CheckDisk fails.
IgnoreCheckDiskAnnotation = "ignore-cd.hetznerbaremetalhost.infrastructure.cluster.x-k8s.io" // max length is 63 chars.
)

// RootDeviceHints holds the hints for specifying the storage location
Expand Down
14 changes: 13 additions & 1 deletion hack/output-for-watch.sh
Original file line number Diff line number Diff line change
Expand Up @@ -44,10 +44,22 @@ print_heading events:

kubectl get events -A --sort-by=lastTimestamp | grep -vP 'LeaderElection' | tail -8

print_heading logs:
print_heading caph:

./hack/tail-controller-logs.sh

regex='^I\d\d\d\d|\
.*it may have already been deleted|\
.*WARNING: ignoring DaemonSet-managed Pods|\
.*failed to retrieve Spec.ProviderID|\
.*failed to patch Machine default
'
capi_logs=$(kubectl logs -n capi-system deployments/capi-controller-manager --since 7m | grep -vP "$(echo "$regex" | tr -d '\n')" | tail -5)
if [ -n "$capi_logs" ]; then
print_heading capi
echo "$capi_logs"
fi

echo

if [ $(kubectl get machine -l cluster.x-k8s.io/control-plane 2>/dev/null | wc -l) -eq 0 ]; then
Expand Down
57 changes: 57 additions & 0 deletions pkg/services/baremetal/client/mocks/ssh/Client.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

102 changes: 102 additions & 0 deletions pkg/services/baremetal/client/ssh/check-disk.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
#!/bin/bash

# Copyright 2024 The Kubernetes Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Install an ERR trap: when a command fails, print the script name, the line
# number and the source text of that line (read back from the script file via
# sed), then exit with status 3 (documented in usage as "Some other error").
trap 'echo "ERROR: A command has failed. Exiting the script. Line was ($0:$LINENO): $(sed -n "${LINENO}p" "$0")"; exit 3' ERR
# -E: the ERR trap is inherited by functions, command substitutions and subshells.
# -e: exit when a command fails.
# -u: expanding an unset variable is an error.
# -o pipefail: a pipeline fails if any command in it fails, not just the last one.
set -Eeuo pipefail

# usage prints the command line synopsis, the meaning of the exit codes and
# the block devices (name and WWN) known to lsblk, loop devices excluded.
function usage() {
    printf '%s\n' \
        "$0 wwn1 [wwn2 ...]" \
        " Check given disks." \
        " Exit 0: Disks look good." \
        " Exit 1: Disks seem faulty." \
        " Exit 3: Some other error (like invalid WWN)" \
        "Existing WWNs:"
    # grep exits non-zero when every line is filtered out; that is not an error here.
    lsblk -oNAME,WWN | grep -vi loop || true
}

# At least one WWN is required. Otherwise explain the usage and exit with
# status 3.
if [[ $# -lt 1 ]]; then
    printf '%s\n\n' "Error: No WWN was provided."
    usage
    exit 3
fi

# install_smartmontools installs the smartmontools package (which provides
# smartctl) with the package manager that matches ID from /etc/os-release.
#
# Exits with status 3 ("Some other error", see usage) if the distribution is
# not supported or cannot be detected. Status 1 must not be used here: it is
# reserved for "Disks seem faulty", and a failed installation says nothing
# about the health of a disk.
install_smartmontools() {
    if [[ ! -f /etc/os-release ]]; then
        echo "Cannot detect the operating system."
        exit 3
    fi

    # shellcheck disable=SC1091
    . /etc/os-release

    # ID could be missing in os-release. Default to an empty string, so that
    # "set -u" does not abort with an unrelated "unbound variable" message.
    case "${ID:-}" in
    debian | ubuntu)
        sudo apt-get update -qq
        # Filter the noisy progress lines of apt/dpkg. grep exits non-zero if
        # nothing is left after filtering, which is fine.
        sudo DEBIAN_FRONTEND=noninteractive apt-get install -y -qq -o Dpkg::Progress-Fancy="0" smartmontools |
            { grep -vP '^(NEEDRESTART|Selecting previously unselected|.Reading database|Preparing to unpack|Unpacking|Setting up|Processing)' || true; }
        ;;
    centos | rhel | fedora)
        sudo yum install -y smartmontools
        ;;
    opensuse | sles)
        sudo zypper install --non-interactive smartmontools
        ;;
    arch | manjaro)
        sudo pacman -Sy --noconfirm smartmontools
        ;;
    *)
        echo "Unsupported distribution: ${ID:-unknown}"
        exit 3
        ;;
    esac
}

# smartctl is always part of the rescue system. Installing it on demand is
# only needed when the script gets executed by hand (outside caph).
if ! command -v smartctl >/dev/null 2>&1; then
    echo "INFO: smartctl not installed yet. If possible, please provide smartmontools in your machine image."
    install_smartmontools
fi

# The filtered smartctl output of all disks gets collected in a temporary
# file, which is removed when the script exits.
result=$(mktemp)
trap 'rm -f "$result"' EXIT

# Iterate over all input arguments
for wwn in "$@"; do
    # grep is used without -q on purpose: it reads all of lsblk's output, so
    # lsblk cannot be killed by SIGPIPE. With "pipefail" such a SIGPIPE would
    # be misreported as "WWN not found".
    if ! lsblk -l -oWWN | grep -Fx -- "${wwn}" >/dev/null; then
        echo "$wwn is not a WWN of this machine"
        echo
        usage
        exit 3
    fi

    # Find the disk (not a partition) with exactly this WWN. A plain read loop
    # is used instead of a grep pipeline: a pipeline without a match would
    # fail under "pipefail" and fire the ERR trap before the check below could
    # print its message. The comparison is a literal string match and yields
    # at most one device.
    device=""
    while read -r name disk_wwn type; do
        if [[ "$disk_wwn" == "$wwn" && "$type" == "disk" ]]; then
            device=$name
            break
        fi
    done < <(lsblk -l -n -oNAME,WWN,TYPE)
    if [ -z "$device" ]; then
        echo "Failed to find device for WWN $wwn"
        exit 3
    fi

    # smartctl reports its findings via a non-zero exit status, too (for
    # example for a failing disk). Ignore that status here. Otherwise
    # "set -e -o pipefail" would fire the ERR trap and exit with 3 instead of
    # reaching the evaluation below, which exits with 1 for a faulty disk.
    # The verdict is taken from the output.
    { smartctl -H "/dev/$device" || true; } |
        { grep -vP '^(smartctl \d+\.\d+.*|Copyright|=+ START OF)' || true; } |
        { grep -v '^$' || true; } |
        sed "s#^#$wwn (/dev/$device): #" >>"$result"
done

# Every remaining line has to contain "PASSED". Any other line is an error.
errors=$(grep -v PASSED "$result" || true)
if [ -n "$errors" ]; then
    echo "check-disk failed!"
    echo "$errors"
    exit 1
fi
cat "$result"
exit 0
40 changes: 38 additions & 2 deletions pkg/services/baremetal/client/ssh/ssh_client.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,9 @@ var detectLinuxOnAnotherDiskShellScript string
//go:embed wipe-disk.sh
var wipeDiskShellScript string

//go:embed check-disk.sh
var checkDiskShellScript string

var downloadFromOciShellScript = `#!/bin/bash
# Copyright 2023 The Kubernetes Authors.
Expand Down Expand Up @@ -164,8 +167,9 @@ var (
ErrEmptyStdOut = errors.New("unexpected empty output in stdout")
// ErrTimeout means that there is a timeout error.
ErrTimeout = errors.New("i/o timeout")

errSSHDialFailed = errors.New("failed to dial ssh")
// ErrCheckDiskBrokenDisk means that a disk seems broken.
ErrCheckDiskBrokenDisk = errors.New("CheckDisk failed")
errSSHDialFailed = errors.New("failed to dial ssh")
)

// Input defines an SSH input.
Expand Down Expand Up @@ -263,6 +267,10 @@ type Client interface {
// Erase filesystem, raid and partition-table signatures.
// String "all" will wipe all disks.
WipeDisk(ctx context.Context, sliceOfWwns []string) (string, error)

// CheckDisk checks the given disks via smartctl.
// ErrCheckDiskBrokenDisk gets returned, if a disk is broken.
CheckDisk(ctx context.Context, sliceOfWwns []string) (info string, err error)
}

// Factory is the interface for creating new Client objects.
Expand Down Expand Up @@ -606,6 +614,34 @@ chmod a+rx /root/wipe-disk.sh
return out.String(), nil
}

// CheckDisk writes the embedded check-disk.sh script to /root/check-disk.sh on
// the remote machine and runs it with the given WWNs as arguments.
//
// If sliceOfWwns is empty, nothing is executed and ("", nil) is returned.
// On exit status 0 the output of the script is returned as info.
// On exit status 1 (the script detected a broken disk) the returned error
// wraps ErrCheckDiskBrokenDisk.
// Every other outcome is returned as an error as well.
func (c *sshClient) CheckDisk(_ context.Context, sliceOfWwns []string) (info string, err error) {
	if len(sliceOfWwns) == 0 {
		return "", nil
	}

	// The script gets created via a here-document and is executed right away.
	cmd := fmt.Sprintf(`cat >/root/check-disk.sh <<'EOF_VIA_SSH'
%s
EOF_VIA_SSH
chmod a+rx /root/check-disk.sh
/root/check-disk.sh %s
`, checkDiskShellScript, strings.Join(sliceOfWwns, " "))
	out := c.runSSH(cmd)

	exitStatus, err := out.ExitStatus()
	if err != nil {
		// Network error or similar. Script was not called.
		return "", fmt.Errorf("CheckDisk for %+v failed: %w", sliceOfWwns, err)
	}

	switch exitStatus {
	case 0:
		// Everything was fine.
		return out.String(), nil
	case 1:
		// Script detected a broken disk.
		return "", fmt.Errorf("CheckDisk for %+v failed: %s. %s. %w %w", sliceOfWwns, out.StdOut, out.StdErr, out.Err, ErrCheckDiskBrokenDisk)
	default:
		// Some other strange error like "unknown WWN"
		return "", fmt.Errorf("CheckDisk for %+v failed: %s. %s: %w", sliceOfWwns, out.StdOut, out.StdErr, out.Err)
	}
}

func (c *sshClient) UntarTGZ() Output {
// read tgz from container image.
fileName := "/installimage.tgz"
Expand Down
29 changes: 29 additions & 0 deletions pkg/services/baremetal/host/host.go
Original file line number Diff line number Diff line change
Expand Up @@ -1097,6 +1097,35 @@ func (s *Service) actionImageInstalling(ctx context.Context) actionResult {
}

func (s *Service) actionImageInstallingStartBackgroundProcess(ctx context.Context, sshClient sshclient.Client) actionResult {
// CheckDisk before accessing the disk
info, err := sshClient.CheckDisk(ctx, s.scope.HetznerBareMetalHost.Spec.RootDeviceHints.ListOfWWN())
if err != nil {
_, ok := s.scope.HetznerBareMetalHost.Annotations[infrav1.IgnoreCheckDiskAnnotation]
if !ok {
// The annotation is not set. This is a permanent error.
msg := fmt.Sprintf(
"CheckDisk failed (permanent error): %s (set annotation %q on hbmh to continue anyway)",
err.Error(), infrav1.IgnoreCheckDiskAnnotation)
conditions.MarkFalse(
s.scope.HetznerBareMetalHost,
infrav1.ProvisionSucceededCondition,
infrav1.CheckDiskFailedReason,
clusterv1.ConditionSeverityError,
msg,
)
record.Warn(s.scope.HetznerBareMetalHost, infrav1.CheckDiskFailedReason, msg)
s.scope.HetznerBareMetalHost.SetError(infrav1.PermanentError, msg)
return actionStop{}
}
// The annotation was set. Just create a warning and move on.
record.Warnf(s.scope.HetznerBareMetalHost, infrav1.CheckDiskFailedReason,
"CheckDisk failed. Continue anyway because %q is set: %s",
infrav1.IgnoreCheckDiskAnnotation,
err.Error())
} else {
record.Eventf(s.scope.HetznerBareMetalHost, "DiskHealthy", "Disk looks healthy: %s", info)
}

// Call WipeDisk if the corresponding annotation is set.
sliceOfWwns := strings.Fields(s.scope.HetznerBareMetalHost.Annotations[infrav1.WipeDiskAnnotation])
if len(sliceOfWwns) > 0 {
Expand Down

0 comments on commit c291712

Please sign in to comment.