Skip to content

Commit

Permalink
Recover primary nic if down after publishing hostname in RedhatOSUtil (
Browse files Browse the repository at this point in the history
…#3024)

* Check nic state and recover if down:

* Fix typo

* Fix state comparison

* Fix pylint errors

* Fix string comparison

* Report publish hostname failure in calling thread

* Add todo to check nic state for all distros where we reset network

* Update detection to check connection state and separate recover from publish

* Pylint unused argument

* refactor recover_nic argument

* Network interface e2e test

* e2e test for recovering the network interface on redhat distros

* Only run scenario on distros which use RedhatOSUtil

* Fix call to parent publish_hostname to include recover_nic arg

* Update comments in default os util

* Remove comment

* Fix comment

* Do not do detection/recover on RedhatOSMOdernUtil

* Resolve PR comments

* Make script executable

* Revert pypy change

* Fix publish hostname paramters
  • Loading branch information
maddieford authored Feb 5, 2024
1 parent 25d7103 commit 0cd8617
Show file tree
Hide file tree
Showing 11 changed files with 308 additions and 14 deletions.
1 change: 1 addition & 0 deletions azurelinuxagent/common/event.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,7 @@ class WALAEventOperation:
HealthCheck = "HealthCheck"
HealthObservation = "HealthObservation"
HeartBeat = "HeartBeat"
HostnamePublishing = "HostnamePublishing"
HostPlugin = "HostPlugin"
HostPluginHeartbeat = "HostPluginHeartbeat"
HostPluginHeartbeatExtended = "HostPluginHeartbeatExtended"
Expand Down
11 changes: 10 additions & 1 deletion azurelinuxagent/common/osutil/default.py
Original file line number Diff line number Diff line change
Expand Up @@ -1190,11 +1190,20 @@ def restart_if(self, ifname, retries=3, wait=5):
else:
logger.warn("exceeded restart retries")

def publish_hostname(self, hostname):
def check_and_recover_nic_state(self, ifname):
# TODO: This should be implemented for all distros where we reset the network during publishing hostname. Currently it is only implemented in RedhatOSUtil.
pass

def publish_hostname(self, hostname, recover_nic=False):
"""
Publishes the provided hostname.
"""
self.set_dhcp_hostname(hostname)
self.set_hostname_record(hostname)
ifname = self.get_if_name()
self.restart_if(ifname)
if recover_nic:
self.check_and_recover_nic_state(ifname)

def set_scsi_disks_timeout(self, timeout):
for dev in os.listdir("/sys/block"):
Expand Down
2 changes: 1 addition & 1 deletion azurelinuxagent/common/osutil/gaia.py
Original file line number Diff line number Diff line change
Expand Up @@ -202,7 +202,7 @@ def set_hostname(self, hostname):
def set_dhcp_hostname(self, hostname):
logger.warn('set_dhcp_hostname is ignored on GAiA')

def publish_hostname(self, hostname):
def publish_hostname(self, hostname, recover_nic=False):
logger.warn('publish_hostname is ignored on GAiA')

def del_account(self, username):
Expand Down
4 changes: 2 additions & 2 deletions azurelinuxagent/common/osutil/iosxe.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,12 +58,12 @@ def set_hostname(self, hostname):
logger.warn("[{0}] failed with error: {1}, attempting fallback".format(' '.join(hostnamectl_cmd), ustr(e)))
DefaultOSUtil.set_hostname(self, hostname)

def publish_hostname(self, hostname):
def publish_hostname(self, hostname, recover_nic=False):
"""
Restart NetworkManager first before publishing hostname
"""
shellutil.run("service NetworkManager restart")
super(IosxeOSUtil, self).publish_hostname(hostname)
super(IosxeOSUtil, self).publish_hostname(hostname, recover_nic)

def register_agent_service(self):
return shellutil.run("systemctl enable waagent", chk_err=False)
Expand Down
89 changes: 81 additions & 8 deletions azurelinuxagent/common/osutil/redhat.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,12 +117,11 @@ def set_hostname(self, hostname):
logger.warn("[{0}] failed, attempting fallback".format(' '.join(hostnamectl_cmd)))
DefaultOSUtil.set_hostname(self, hostname)

def get_nm_controlled(self):
ifname = self.get_if_name()
def get_nm_controlled(self, ifname):
filepath = "/etc/sysconfig/network-scripts/ifcfg-{0}".format(ifname)
nm_controlled_cmd = ['grep', 'NM_CONTROLLED=', filepath]
try:
result = shellutil.run_command(nm_controlled_cmd, log_error=False, encode_output=False).rstrip()
result = shellutil.run_command(nm_controlled_cmd, log_error=False).rstrip()

if result and len(result.split('=')) > 1:
# Remove trailing white space and ' or " characters
Expand All @@ -140,17 +139,87 @@ def get_nm_controlled(self):

return True

def publish_hostname(self, hostname):
def get_nic_operational_and_general_states(self, ifname):
"""
Checks the contents of /sys/class/net/{ifname}/operstate and the results of 'nmcli -g general.state device show {ifname}' to determine the state of the provided interface.
Raises an exception if the network interface state cannot be determined.
"""
filepath = "/sys/class/net/{0}/operstate".format(ifname)
nic_general_state_cmd = ['nmcli', '-g', 'general.state', 'device', 'show', ifname]
if not os.path.isfile(filepath):
msg = "Unable to determine primary network interface {0} state, because state file does not exist: {1}".format(ifname, filepath)
logger.warn(msg)
raise Exception(msg)

try:
nic_oper_state = fileutil.read_file(filepath).rstrip().lower()
nic_general_state = shellutil.run_command(nic_general_state_cmd, log_error=True).rstrip().lower()
if nic_oper_state != "up":
logger.warn("The primary network interface {0} operational state is '{1}'.".format(ifname, nic_oper_state))
else:
logger.info("The primary network interface {0} operational state is '{1}'.".format(ifname, nic_oper_state))
if nic_general_state != "100 (connected)":
logger.warn("The primary network interface {0} general state is '{1}'.".format(ifname, nic_general_state))
else:
logger.info("The primary network interface {0} general state is '{1}'.".format(ifname, nic_general_state))
return nic_oper_state, nic_general_state
except Exception as e:
msg = "Unexpected error while determining the primary network interface state: {0}".format(e)
logger.warn(msg)
raise Exception(msg)

def check_and_recover_nic_state(self, ifname):
"""
Checks if the provided network interface is in an 'up' state. If the network interface is in a 'down' state,
attempt to recover the interface by restarting the Network Manager service.
Raises an exception if an attempt to bring the interface into an 'up' state fails, or if the state
of the network interface cannot be determined.
"""
nic_operstate, nic_general_state = self.get_nic_operational_and_general_states(ifname)
if nic_operstate == "down" or "disconnected" in nic_general_state:
logger.info("Restarting the Network Manager service to recover network interface {0}".format(ifname))
self.restart_network_manager()
# Interface does not come up immediately after NetworkManager restart. Wait 5 seconds before checking
# network interface state.
time.sleep(5)
nic_operstate, nic_general_state = self.get_nic_operational_and_general_states(ifname)
# It is possible for network interface to be in an unknown or unmanaged state. Log warning if state is not
# down, disconnected, up, or connected
if nic_operstate != "up" or nic_general_state != "100 (connected)":
msg = "Network Manager restart failed to bring network interface {0} into 'up' and 'connected' state".format(ifname)
logger.warn(msg)
raise Exception(msg)
else:
logger.info("Network Manager restart successfully brought the network interface {0} into 'up' and 'connected' state".format(ifname))
elif nic_operstate != "up" or nic_general_state != "100 (connected)":
# We already logged a warning with the network interface state in get_nic_operstate(). Raise an exception
# for the env thread to send to telemetry.
raise Exception("The primary network interface {0} operational state is '{1}' and general state is '{2}'.".format(ifname, nic_operstate, nic_general_state))

def restart_network_manager(self):
shellutil.run("service NetworkManager restart")

def publish_hostname(self, hostname, recover_nic=False):
"""
Restart NetworkManager first before publishing hostname, only if the network interface is not controlled by the
NetworkManager service (as determined by NM_CONTROLLED=n in the interface configuration). If the NetworkManager
service is restarted before the agent publishes the hostname, and NM_controlled=y, a race condition may happen
between the NetworkManager service and the Guest Agent making changes to the network interface configuration
simultaneously.
Note: check_and_recover_nic_state(ifname) raises an Exception if an attempt to recover the network interface
fails, or if the network interface state cannot be determined. Callers should handle this exception by sending
an event to telemetry.
TODO: Improve failure reporting and add success reporting to telemetry for hostname changes. Right now we are only reporting failures to telemetry by raising an Exception in publish_hostname for the calling thread to handle by reporting the failure to telemetry.
"""
if not self.get_nm_controlled():
shellutil.run("service NetworkManager restart")
super(RedhatOSUtil, self).publish_hostname(hostname)
ifname = self.get_if_name()
nm_controlled = self.get_nm_controlled(ifname)
if not nm_controlled:
self.restart_network_manager()
# TODO: Current recover logic is only effective when the NetworkManager manages the network interface. Update the recover logic so it is effective even when NM_CONTROLLED=n
super(RedhatOSUtil, self).publish_hostname(hostname, recover_nic and nm_controlled)

def register_agent_service(self):
return shellutil.run("systemctl enable {0}".format(self.service_name), chk_err=False)
Expand Down Expand Up @@ -193,7 +262,11 @@ def restart_if(self, ifname, retries=3, wait=5):
else:
logger.warn("exceeded restart retries")

def publish_hostname(self, hostname):
def check_and_recover_nic_state(self, ifname):
# TODO: Implement and test a way to recover the network interface for RedhatOSModernUtil
pass

def publish_hostname(self, hostname, recover_nic=False):
# RedhatOSUtil was updated to conditionally run NetworkManager restart in response to a race condition between
# NetworkManager restart and the agent restarting the network interface during publish_hostname. Keeping the
# NetworkManager restart in RedhatOSModernUtil because the issue was not reproduced on these versions.
Expand Down
2 changes: 1 addition & 1 deletion azurelinuxagent/common/osutil/suse.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ def __init__(self):
super(SUSEOSUtil, self).__init__()
self.dhclient_name = 'wickedd-dhcp4'

def publish_hostname(self, hostname):
def publish_hostname(self, hostname, recover_nic=False):
self.set_dhcp_hostname(hostname)
self.set_hostname_record(hostname)
ifname = self.get_if_name()
Expand Down
6 changes: 5 additions & 1 deletion azurelinuxagent/ga/env.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,11 @@ def _operation(self):
self._hostname,
curr_hostname)
self._osutil.set_hostname(curr_hostname)
self._osutil.publish_hostname(curr_hostname)
try:
self._osutil.publish_hostname(curr_hostname, recover_nic=True)
except Exception as e:
msg = "Error while publishing the hostname: {0}".format(e)
add_event(AGENT_NAME, op=WALAEventOperation.HostnamePublishing, is_success=False, message=msg, log_event=False)
self._hostname = curr_hostname


Expand Down
8 changes: 8 additions & 0 deletions tests_e2e/test_suites/images.yml
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@ images:
locations:
AzureChinaCloud: []
centos_610: "OpenLogic CentOS 6.10 latest"
centos_75: "OpenLogic CentOS 7.5 latest"
centos_79: "OpenLogic CentOS 7_9 latest"
centos_82:
urn: "OpenLogic CentOS 8_2 latest"
Expand Down Expand Up @@ -121,7 +122,14 @@ images:
AzureChinaCloud: []
AzureUSGovernment: []
oracle_610: "Oracle Oracle-Linux 6.10 latest"
oracle_75: "Oracle Oracle-Linux 7.5 latest"
oracle_79: "Oracle Oracle-Linux ol79-gen2 latest"
oracle_82: "Oracle Oracle-Linux ol82-gen2 latest"
rhel_610: "RedHat RHEL 6.10 latest"
rhel_75:
urn: "RedHat RHEL 7.5 latest"
locations:
AzureChinaCloud: []
rhel_79:
urn: "RedHat RHEL 7_9 latest"
locations:
Expand Down
17 changes: 17 additions & 0 deletions tests_e2e/test_suites/recover_network_interface.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
#
# Brings the primary network interface down and checks that the agent can recover the network.
#
name: "RecoverNetworkInterface"
tests:
- "recover_network_interface/recover_network_interface.py"
images:
# TODO: This scenario should be run on all distros which bring the network interface down to publish hostname. Currently, only RedhatOSUtil attempts to recover the network interface if down after hostname publishing.
- "centos_79"
- "centos_75"
- "centos_82"
- "rhel_75"
- "rhel_79"
- "rhel_82"
- "oracle_75"
- "oracle_79"
- "oracle_82"
143 changes: 143 additions & 0 deletions tests_e2e/tests/recover_network_interface/recover_network_interface.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
#!/usr/bin/env python3

# Microsoft Azure Linux Agent
#
# Copyright 2018 Microsoft Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

#
# This test uses CSE to bring the network down and call check_and_recover_nic_state to bring the network back into an
# 'up' and 'connected' state. The intention of the test is to alert us if there is some change in newer distros which
# affects this logic.
#

import json
from typing import List, Dict, Any

from assertpy import fail, assert_that
from time import sleep

from tests_e2e.tests.lib.agent_test import AgentVmTest, TestSkipped
from tests_e2e.tests.lib.agent_test_context import AgentVmTestContext
from tests_e2e.tests.lib.logging import log
from tests_e2e.tests.lib.virtual_machine_extension_client import VirtualMachineExtensionClient
from tests_e2e.tests.lib.vm_extension_identifier import VmExtensionIds


class RecoverNetworkInterface(AgentVmTest):
def __init__(self, context: AgentVmTestContext):
super().__init__(context)
self._context = context
self._ssh_client = context.create_ssh_client()
self._private_ip = context.vm.get_private_ip_address()
self._vm_password = ""

def add_vm_password(self):
# Add password to VM to help with debugging in case of failure
# REMOVE PWD FROM LOGS IF WE EVER MAKE THESE RUNS/LOGS PUBLIC
username = self._ssh_client.username
pwd = self._ssh_client.run_command("openssl rand -base64 32 | tr : .").rstrip()
self._vm_password = pwd
log.info("VM Username: {0}; VM Password: {1}".format(username, pwd))
self._ssh_client.run_command("echo '{0}:{1}' | sudo -S chpasswd".format(username, pwd))

def check_agent_reports_status(self):
status_updated = False
last_agent_status_time = self._context.vm.get_instance_view().vm_agent.statuses[0].time
log.info("Agent reported status at {0}".format(last_agent_status_time))
retries = 3

while retries > 0 and not status_updated:
agent_status_time = self._context.vm.get_instance_view().vm_agent.statuses[0].time
if agent_status_time != last_agent_status_time:
status_updated = True
log.info("Agent reported status at {0}".format(last_agent_status_time))
else:
retries -= 1
sleep(60)

if not status_updated:
fail("Agent hasn't reported status since {0} and ssh connection failed. Use the serial console in portal "
"to debug".format(last_agent_status_time))

def run(self):
# Add password to VM and log. This allows us to debug with serial console if necessary
log.info("")
log.info("Adding password to the VM to use for debugging in case necessary...")
self.add_vm_password()

# Skip the test if NM_CONTROLLED=n. The current recover logic does not work in this case
result = self._ssh_client.run_command("recover_network_interface-get_nm_controlled.py", use_sudo=True)
if "Interface is NOT NM controlled" in result:
raise TestSkipped("Current recover method will not work on interfaces where NM_Controlled=n")

# Get the primary network interface name
ifname = self._ssh_client.run_command("pypy3 -c 'from azurelinuxagent.common.osutil.redhat import RedhatOSUtil; print(RedhatOSUtil().get_if_name())'").rstrip()
# The interface name needs to be in double quotes for the pypy portion of the script
formatted_ifname = f'"{ifname}"'

# The script should bring the primary network interface down and use the agent to recover the interface. These
# commands will bring the network down, so they should be executed on the machine using CSE instead of ssh.
script = f"""
set -euxo pipefail
ifdown {ifname};
nic_state=$(nmcli -g general.state device show {ifname})
echo Primary network interface state before recovering: $nic_state
source /home/{self._context.username}/bin/set-agent-env;
pypy3 -c 'from azurelinuxagent.common.osutil.redhat import RedhatOSUtil; RedhatOSUtil().check_and_recover_nic_state({formatted_ifname})';
nic_state=$(nmcli -g general.state device show {ifname});
echo Primary network interface state after recovering: $nic_state
"""
log.info("")
log.info("Using CSE to bring the primary network interface down and call the OSUtil to bring the interface back up. Command to execute: {0}".format(script))
custom_script = VirtualMachineExtensionClient(self._context.vm, VmExtensionIds.CustomScript, resource_name="CustomScript")
custom_script.enable(
protected_settings={
'commandToExecute': script
}
)

# Check that the interface was down and brought back up in instance view
log.info("")
log.info("Checking the instance view to confirm the primary network interface was brought down and successfully recovered by the agent...")
instance_view = custom_script.get_instance_view()
log.info("Instance view for custom script after enable is: {0}".format(json.dumps(instance_view.serialize(), indent=4)))
assert_that(len(instance_view.statuses)).described_as("Instance view should have a status for CustomScript").is_greater_than(0)
assert_that(instance_view.statuses[0].message).described_as("The primary network interface should be in a disconnected state before the attempt to recover").contains("Primary network interface state before recovering: 30 (disconnected)")
assert_that(instance_view.statuses[0].message).described_as("The primary network interface should be in a connected state after the attempt to recover").contains("Primary network interface state after recovering: 100 (connected)")

# Check that the agent is successfully reporting status after recovering the network
log.info("")
log.info("Checking that the agent is reporting status after recovering the network...")
self.check_agent_reports_status()

log.info("")
log.info("The primary network interface was successfully recovered by the agent.")

def get_ignore_error_rules(self) -> List[Dict[str, Any]]:
ignore_rules = [
#
# We may see temporary network unreachable warnings since we are bringing the network interface down
# 2024-02-01T23:40:03.563499Z ERROR ExtHandler ExtHandler Error fetching the goal state: [ProtocolError] GET vmSettings [correlation ID: ac21bdd7-1a7a-4bba-b307-b9d5bc30da33 eTag: 941323814975149980]: Request failed: [Errno 101] Network is unreachable
#
{
'message': r"Error fetching the goal state: \[ProtocolError\] GET vmSettings.*Request failed: \[Errno 101\] Network is unreachable"
}
]
return ignore_rules


if __name__ == "__main__":
RecoverNetworkInterface.run_from_command_line()
Loading

0 comments on commit 0cd8617

Please sign in to comment.