Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

honor rsm update with no time when agent receives new GS #3015

Merged
merged 8 commits into from
Jan 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions azurelinuxagent/common/exception.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,15 @@ def __init__(self, msg=None, inner=None):
super(AgentUpdateError, self).__init__(msg, inner)


class AgentFamilyMissingError(AgentError):
"""
Raised when the requested agent family is not found in the goal state, or when
the family is found but has no manifest links.
The agent update handler treats this error as permanent for a given goal state
and logs it only when a new goal state has been received.
"""

def __init__(self, msg=None, inner=None):
# Forward the message and the inner exception to the base class unchanged.
super(AgentFamilyMissingError, self).__init__(msg, inner)


class CGroupsException(AgentError):
"""
Exception to classify any cgroups related issue.
Expand Down
52 changes: 33 additions & 19 deletions azurelinuxagent/ga/agent_update_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,18 +15,16 @@
# limitations under the License.
#
# Requires Python 2.6+ and Openssl 1.0+
import datetime
import os

from azurelinuxagent.common import conf, logger
from azurelinuxagent.common.event import add_event, WALAEventOperation
from azurelinuxagent.common.exception import AgentUpgradeExitException, AgentUpdateError
from azurelinuxagent.common.exception import AgentUpgradeExitException, AgentUpdateError, AgentFamilyMissingError
from azurelinuxagent.common.future import ustr
from azurelinuxagent.common.protocol.restapi import VMAgentUpdateStatuses, VMAgentUpdateStatus, VERSION_0
from azurelinuxagent.common.utils import textutil
from azurelinuxagent.common.utils.flexible_version import FlexibleVersion
from azurelinuxagent.common.version import get_daemon_version
from azurelinuxagent.ga.ga_version_updater import RSMUpdates
from azurelinuxagent.ga.rsm_version_updater import RSMVersionUpdater
from azurelinuxagent.ga.self_update_version_updater import SelfUpdateVersionUpdater

Expand Down Expand Up @@ -67,7 +65,7 @@ def __init__(self, protocol):

# restore the state of rsm update. Default to self-update if last update is not with RSM.
if not self._get_is_last_update_with_rsm():
self._updater = SelfUpdateVersionUpdater(self._gs_id, datetime.datetime.min)
self._updater = SelfUpdateVersionUpdater(self._gs_id)
else:
self._updater = RSMVersionUpdater(self._gs_id, self._daemon_version)

Expand Down Expand Up @@ -117,7 +115,7 @@ def _get_agent_family_manifest(self, goal_state):
"""
Get the agent_family from last GS for the given family
Returns: first entry of Manifest
Exception if no manifests found in the last GS
Exception if no manifests found in the last GS and log it only on new goal state
"""
family = self._ga_family_type
agent_families = goal_state.extensions_goal_state.agent_families
Expand All @@ -130,11 +128,13 @@ def _get_agent_family_manifest(self, goal_state):
agent_family_manifests.append(m)

if not family_found:
raise AgentUpdateError(u"Agent family: {0} not found in the goal state: {1}, skipping agent update".format(family, self._gs_id))
raise AgentFamilyMissingError(u"Agent family: {0} not found in the goal state: {1}, skipping agent update \n"
u"[Note: This error is permanent for this goal state and Will not log same error until we receive new goal state]".format(family, self._gs_id))

if len(agent_family_manifests) == 0:
raise AgentUpdateError(
u"No manifest links found for agent family: {0} for goal state: {1}, skipping agent update".format(
raise AgentFamilyMissingError(
u"No manifest links found for agent family: {0} for goal state: {1}, skipping agent update \n"
u"[Note: This error is permanent for this goal state and will not log same error until we receive new goal state]".format(
family, self._gs_id))
return agent_family_manifests[0]

Expand All @@ -145,30 +145,38 @@ def run(self, goal_state, ext_gs_updated):
if not conf.get_autoupdate_enabled() or not conf.get_download_new_agents():
return

# verify if agent update is allowed this time (RSM checks new goal state; self-update checks manifest download interval)
if not self._updater.is_update_allowed_this_time(ext_gs_updated):
return
# Update the state only on new goal state
if ext_gs_updated:
self._gs_id = goal_state.extensions_goal_state.id
self._updater.sync_new_gs_id(self._gs_id)

self._gs_id = goal_state.extensions_goal_state.id
agent_family = self._get_agent_family_manifest(goal_state)
Copy link
Contributor Author

@nagworld9 nagworld9 Jan 11, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I still decided to read the properties from the goal state on every iteration, since the goal state is in memory and I don't see it as a big deal.

Calling it only on a new goal state would require saving the state of the agent family, and also tracking whether the last attempt hit the agent-family-missing error, so that we don't proceed with the update. I don't like the idea of saving extra state; at some point it will become convoluted.

So instead I raise a different exception and handle it in the exception block.


# updater will return RSM enabled or disabled if we need to switch to self-update or rsm update
updater_mode = self._updater.check_and_switch_updater_if_changed(agent_family, self._gs_id, ext_gs_updated)
# Updater will return True or False if we need to switch the updater
# If self-updater receives RSM update enabled, it will switch to RSM updater
# If RSM updater receives RSM update disabled, it will switch to self-update
# No change in updater if GS not updated
is_rsm_update_enabled = self._updater.is_rsm_update_enabled(agent_family, ext_gs_updated)

if updater_mode == RSMUpdates.Disabled:
if not is_rsm_update_enabled and isinstance(self._updater, RSMVersionUpdater):
msg = "VM not enabled for RSM updates, switching to self-update mode"
logger.info(msg)
add_event(op=WALAEventOperation.AgentUpgrade, message=msg, log_event=False)
self._updater = SelfUpdateVersionUpdater(self._gs_id, datetime.datetime.now())
self._updater = SelfUpdateVersionUpdater(self._gs_id)
self._remove_rsm_update_state()

if updater_mode == RSMUpdates.Enabled:
if is_rsm_update_enabled and isinstance(self._updater, SelfUpdateVersionUpdater):
msg = "VM enabled for RSM updates, switching to RSM update mode"
logger.info(msg)
add_event(op=WALAEventOperation.AgentUpgrade, message=msg, log_event=False)
self._updater = RSMVersionUpdater(self._gs_id, self._daemon_version)
self._save_rsm_update_state()

# If updater is changed in previous step, we allow update as it consider as first attempt. If not, it checks below condition
# RSM checks new goal state; self-update checks manifest download interval
if not self._updater.is_update_allowed_this_time(ext_gs_updated):
return

self._updater.retrieve_agent_version(agent_family, goal_state)

if not self._updater.is_retrieved_version_allowed_to_update(agent_family):
Expand All @@ -183,14 +191,20 @@ def run(self, goal_state, ext_gs_updated):
self._updater.proceed_with_update()

except Exception as err:
log_error = True
if isinstance(err, AgentUpgradeExitException):
raise err
elif isinstance(err, AgentUpdateError):
error_msg = ustr(err)
elif isinstance(err, AgentFamilyMissingError):
error_msg = ustr(err)
# Agent family missing error is permanent in the given goal state, so we don't want to log it on every iteration of main loop if there is no new goal state
log_error = ext_gs_updated
else:
error_msg = "Unable to update Agent: {0}".format(textutil.format_exception(err))
logger.warn(error_msg)
add_event(op=WALAEventOperation.AgentUpgrade, is_success=False, message=error_msg, log_event=False)
if log_error:
logger.warn(error_msg)
add_event(op=WALAEventOperation.AgentUpgrade, is_success=False, message=error_msg, log_event=False)
self._last_attempted_update_error_msg = error_msg

def get_vmagent_update_status(self):
Expand Down
25 changes: 11 additions & 14 deletions azurelinuxagent/ga/ga_version_updater.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,14 +30,6 @@
from azurelinuxagent.ga.guestagent import GuestAgent


class RSMUpdates(object):
"""
Enum for switching between RSM updates and self updates
"""
Enabled = "Enabled"
Disabled = "Disabled"


class GAVersionUpdater(object):

def __init__(self, gs_id):
Expand All @@ -53,15 +45,13 @@ def is_update_allowed_this_time(self, ext_gs_updated):
"""
raise NotImplementedError

def check_and_switch_updater_if_changed(self, agent_family, gs_id, ext_gs_updated):
def is_rsm_update_enabled(self, agent_family, ext_gs_updated):
"""
checks and raise the updater exception if we need to switch to self-update from rsm update or vice versa
return True if we need to switch to RSM-update from self-update and vice versa.
@param agent_family: agent family
@param gs_id: incarnation of the goal state
@param ext_gs_updated: True if extension goal state updated else False
@return: RSMUpdates.Disabled: return when agent need to stop rsm updates and switch to self-update
RSMUpdates.Enabled: return when agent need to switch to rsm update
None: return when no need to switch
@return: False when agent need to stop rsm updates
True: when agent need to switch to rsm update
"""
raise NotImplementedError

Expand Down Expand Up @@ -107,6 +97,13 @@ def version(self):
"""
return self._version

def sync_new_gs_id(self, gs_id):
"""
Store the given goal state id on this updater, replacing the previous one.
The agent update handler calls this when the extensions goal state has been updated.
@param gs_id: id of the new goal state
"""
self._gs_id = gs_id

def download_and_get_new_agent(self, protocol, agent_family, goal_state):
"""
Function downloads the new agent and returns the downloaded version.
Expand Down
15 changes: 7 additions & 8 deletions azurelinuxagent/ga/rsm_version_updater.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
from azurelinuxagent.common.exception import AgentUpgradeExitException, AgentUpdateError
from azurelinuxagent.common.utils.flexible_version import FlexibleVersion
from azurelinuxagent.common.version import CURRENT_VERSION, AGENT_NAME
from azurelinuxagent.ga.ga_version_updater import GAVersionUpdater, RSMUpdates
from azurelinuxagent.ga.ga_version_updater import GAVersionUpdater
from azurelinuxagent.ga.guestagent import GuestAgent


Expand All @@ -49,24 +49,23 @@ def is_update_allowed_this_time(self, ext_gs_updated):
"""
return ext_gs_updated

def check_and_switch_updater_if_changed(self, agent_family, gs_id, ext_gs_updated):
def is_rsm_update_enabled(self, agent_family, ext_gs_updated):
"""
Checks if there is a new goal state and decide if we need to continue with rsm update or switch to self-update.
Firstly it checks agent supports GA versioning or not. If not, we return rsm updates disabled to switch to self-update.
if vm is enabled for RSM updates and continue with rsm update, otherwise we return rsm updates disabled to switch to self-update.
Firstly it checks agent supports GA versioning or not. If not, we return false to switch to self-update.
if vm is enabled for RSM updates and continue with rsm update, otherwise we return false to switch to self-update.
if either isVersionFromRSM or isVMEnabledForRSMUpgrades or version is missing in the goal state, we ignore the update as we consider it as invalid goal state.
"""
if ext_gs_updated:
self._gs_id = gs_id
if not conf.get_enable_ga_versioning():
return RSMUpdates.Disabled
return False

if agent_family.is_vm_enabled_for_rsm_upgrades is None:
raise AgentUpdateError(
"Received invalid goal state:{0}, missing isVMEnabledForRSMUpgrades property. So, skipping agent update".format(
self._gs_id))
elif not agent_family.is_vm_enabled_for_rsm_upgrades:
return RSMUpdates.Disabled
return False
else:
if agent_family.is_version_from_rsm is None:
raise AgentUpdateError(
Expand All @@ -77,7 +76,7 @@ def check_and_switch_updater_if_changed(self, agent_family, gs_id, ext_gs_update
"Received invalid goal state:{0}, missing version property. So, skipping agent update".format(
self._gs_id))

return None
return True

def retrieve_agent_version(self, agent_family, goal_state):
"""
Expand Down
15 changes: 7 additions & 8 deletions azurelinuxagent/ga/self_update_version_updater.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
from azurelinuxagent.common.exception import AgentUpgradeExitException, AgentUpdateError
from azurelinuxagent.common.utils.flexible_version import FlexibleVersion
from azurelinuxagent.common.version import CURRENT_VERSION
from azurelinuxagent.ga.ga_version_updater import GAVersionUpdater, RSMUpdates
from azurelinuxagent.ga.ga_version_updater import GAVersionUpdater


class SelfUpdateType(object):
Expand All @@ -35,9 +35,9 @@ class SelfUpdateType(object):


class SelfUpdateVersionUpdater(GAVersionUpdater):
def __init__(self, gs_id, last_attempted_manifest_download_time):
def __init__(self, gs_id):
super(SelfUpdateVersionUpdater, self).__init__(gs_id)
self._last_attempted_manifest_download_time = last_attempted_manifest_download_time
self._last_attempted_manifest_download_time = datetime.datetime.min
self._last_attempted_self_update_time = datetime.datetime.min

@staticmethod
Expand Down Expand Up @@ -119,14 +119,13 @@ def is_update_allowed_this_time(self, ext_gs_updated):
return False
return True

def check_and_switch_updater_if_changed(self, agent_family, gs_id, ext_gs_updated):
def is_rsm_update_enabled(self, agent_family, ext_gs_updated):
"""
Checks if there is a new goal state and decide if we need to continue with self-update or switch to rsm update.
if vm is not enabled for RSM updates or agent not supports GA versioning then we continue with self update, otherwise we rsm enabled to switch to rsm update.
if vm is not enabled for RSM updates or agent not supports GA versioning then we continue with self update, otherwise we return true to switch to rsm update.
if isVersionFromRSM is missing but isVMEnabledForRSMUpgrades is present in the goal state, we ignore the update as we consider it as invalid goal state.
"""
if ext_gs_updated:
self._gs_id = gs_id
if conf.get_enable_ga_versioning() and agent_family.is_vm_enabled_for_rsm_upgrades is not None and agent_family.is_vm_enabled_for_rsm_upgrades:
if agent_family.is_version_from_rsm is None:
raise AgentUpdateError(
Expand All @@ -137,9 +136,9 @@ def check_and_switch_updater_if_changed(self, agent_family, gs_id, ext_gs_update
raise AgentUpdateError(
"Received invalid goal state:{0}, missing version property. So, skipping agent update".format(
self._gs_id))
return RSMUpdates.Enabled
return True

return None
return False

def retrieve_agent_version(self, agent_family, goal_state):
"""
Expand Down
10 changes: 10 additions & 0 deletions tests/ga/test_agent_update_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -366,6 +366,16 @@ def test_handles_missing_agent_family(self):
'message'] and kwarg[
'op'] == WALAEventOperation.AgentUpgrade]), "Agent manifest should not be in GS")

# making multiple agent update attempts and assert only one time logged
agent_update_handler.run(agent_update_handler._protocol.get_goal_state(), False)
agent_update_handler.run(agent_update_handler._protocol.get_goal_state(), False)

self.assertEqual(1, len([kwarg['message'] for _, kwarg in mock_telemetry.call_args_list if
"No manifest links found for agent family" in kwarg[
'message'] and kwarg[
'op'] == WALAEventOperation.AgentUpgrade]),
"Agent manifest error should be logged once if it's same goal state")

def test_it_should_report_update_status_with_success(self):
data_file = DATA_FILE.copy()
data_file["ext_conf"] = "wire/ext_conf_rsm_version.xml"
Expand Down
6 changes: 4 additions & 2 deletions tests/ga/test_update.py
Original file line number Diff line number Diff line change
Expand Up @@ -1784,13 +1784,14 @@ def test_it_should_not_download_anything_if_rsm_version_is_current_version(self)
self.assertFalse(os.path.exists(self.agent_dir("99999.0.0.0")),
"New agent directory should not be found")

def test_it_should_skip_wait_to_update_if_rsm_version_available(self):
def test_it_should_skip_wait_to_update_immediately_if_rsm_version_available(self):
no_of_iterations = 100

def reload_conf(url, protocol):
mock_wire_data = protocol.mock_wire_data

# This function reloads the conf mid-run to mimic an actual customer scenario
# Setting the rsm request to be sent after some iterations
if HttpRequestPredicates.is_goal_state_request(url) and mock_wire_data.call_counts["goalstate"] >= 5:
reload_conf.call_count += 1

Expand All @@ -1808,7 +1809,8 @@ def reload_conf(url, protocol):

data_file = wire_protocol_data.DATA_FILE.copy()
data_file['ga_manifest'] = "wire/ga_manifest_no_upgrade.xml"
with self.__get_update_handler(iterations=no_of_iterations, test_data=data_file, reload_conf=reload_conf) as (update_handler, mock_telemetry):
# Setting the prod frequency to mimic a real scenario
with self.__get_update_handler(iterations=no_of_iterations, test_data=data_file, reload_conf=reload_conf, autoupdate_frequency=6000) as (update_handler, mock_telemetry):
update_handler._protocol.mock_wire_data.set_ga_manifest_version_version(str(CURRENT_VERSION))
update_handler._protocol.mock_wire_data.set_incarnation(20)
update_handler.run(debug=True)
Expand Down
Loading