Skip to content

Commit

Permalink
Merge branch 'develop' into agent-cpu-quota
Browse files Browse the repository at this point in the history
  • Loading branch information
nagworld9 authored Oct 19, 2023
2 parents 989905a + 6e0e3f1 commit ceb1661
Show file tree
Hide file tree
Showing 21 changed files with 297 additions and 70 deletions.
21 changes: 15 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -84,11 +84,13 @@ Waagent depends on some system packages in order to function properly:

## Installation

Installation via your distribution's package repository is preferred.
You can also customize your own RPM or DEB packages using the configuration
samples provided (see deb and rpm sections below).
Installing via your distribution's package repository is the only method that is supported.

For more advanced installation options, such as installing to custom locations or prefixes, you can use **setuptools** to install from source by running:
You can install from source for more advanced options, such as installing to a custom location or creating
custom images. Installing from source, though, may override customizations done to the Agent by your
distribution, and is meant only for advanced users. We provide very limited support for this method.

To install from source, you can use **setuptools**:

```bash
sudo python setup.py install --register-service
Expand All @@ -108,11 +110,18 @@ You can view more installation options by running:

The agent's log file is kept at `/var/log/waagent.log`.

Lastly, you can also customize your own RPM or DEB packages using the configuration
samples provided in the deb and rpm sections below. This method is also meant for advanced users and we
provide very limited support for it.


## Upgrade

Upgrading via your distribution's package repository is strongly preferred.
Upgrading via your distribution's package repository or using automatic updates are the only supported
methods. More information can be found here: [Update Linux Agent](https://learn.microsoft.com/en-us/azure/virtual-machines/extensions/update-linux-agent)

If upgrading manually, same with installation above by running:
To upgrade the Agent from source, you can use **setuptools**. Upgrading from source is meant for advanced
users and we provide very limited support for it.

```bash
sudo python setup.py install --force
Expand Down
30 changes: 22 additions & 8 deletions azurelinuxagent/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
import sys
import threading
from azurelinuxagent.ga import logcollector, cgroupconfigurator
from azurelinuxagent.ga.cgroup import AGENT_LOG_COLLECTOR, CpuCgroup, MemoryCgroup
from azurelinuxagent.ga.cgroupapi import SystemdCgroupsApi

import azurelinuxagent.common.conf as conf
Expand Down Expand Up @@ -204,11 +205,10 @@ def collect_logs(self, is_full_mode):
logger.info("Running log collector mode normal")

# Check the cgroups unit
cpu_cgroup_path, memory_cgroup_path, log_collector_monitor = None, None, None
if CollectLogsHandler.should_validate_cgroups():
cgroups_api = SystemdCgroupsApi()
cpu_cgroup_path, memory_cgroup_path = cgroups_api.get_process_cgroup_paths("self")

log_collector_monitor = None
cgroups_api = SystemdCgroupsApi()
cpu_cgroup_path, memory_cgroup_path = cgroups_api.get_process_cgroup_paths("self")
if CollectLogsHandler.is_enabled_monitor_cgroups_check():
cpu_slice_matches = (cgroupconfigurator.LOGCOLLECTOR_SLICE in cpu_cgroup_path)
memory_slice_matches = (cgroupconfigurator.LOGCOLLECTOR_SLICE in memory_cgroup_path)

Expand All @@ -221,10 +221,24 @@ def collect_logs(self, is_full_mode):

sys.exit(logcollector.INVALID_CGROUPS_ERRCODE)

def initialize_cgroups_tracking(cpu_cgroup_path, memory_cgroup_path):
cpu_cgroup = CpuCgroup(AGENT_LOG_COLLECTOR, cpu_cgroup_path)
msg = "Started tracking cpu cgroup {0}".format(cpu_cgroup)
logger.info(msg)
cpu_cgroup.initialize_cpu_usage()
memory_cgroup = MemoryCgroup(AGENT_LOG_COLLECTOR, memory_cgroup_path)
msg = "Started tracking memory cgroup {0}".format(memory_cgroup)
logger.info(msg)
return [cpu_cgroup, memory_cgroup]

try:
log_collector = LogCollector(is_full_mode, cpu_cgroup_path, memory_cgroup_path)
log_collector_monitor = get_log_collector_monitor_handler(log_collector.cgroups)
log_collector_monitor.run()
log_collector = LogCollector(is_full_mode)
# Running log collector resource(CPU, Memory) monitoring only if agent starts the log collector.
# If Log collector start by any other means, then it will not be monitored.
if CollectLogsHandler.is_enabled_monitor_cgroups_check():
tracked_cgroups = initialize_cgroups_tracking(cpu_cgroup_path, memory_cgroup_path)
log_collector_monitor = get_log_collector_monitor_handler(tracked_cgroups)
log_collector_monitor.run()
archive = log_collector.collect_logs_and_get_archive()
logger.info("Log collection successfully completed. Archive can be found at {0} "
"and detailed log output can be found at {1}".format(archive, OUTPUT_RESULTS_FILE_PATH))
Expand Down
8 changes: 6 additions & 2 deletions azurelinuxagent/common/event.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,7 @@ class WALAEventOperation:
LogCollection = "LogCollection"
NoExec = "NoExec"
OSInfo = "OSInfo"
OpenSsl = "OpenSsl"
Partition = "Partition"
PersistFirewallRules = "PersistFirewallRules"
PluginSettingsVersionMismatch = "PluginSettingsVersionMismatch"
Expand Down Expand Up @@ -365,10 +366,14 @@ def __init__(self):

# Parameters from OS
osutil = get_osutil()
keyword_name = {
"CpuArchitecture": osutil.get_vm_arch()
}
self._common_parameters.append(TelemetryEventParam(CommonTelemetryEventSchema.OSVersion, EventLogger._get_os_version()))
self._common_parameters.append(TelemetryEventParam(CommonTelemetryEventSchema.ExecutionMode, AGENT_EXECUTION_MODE))
self._common_parameters.append(TelemetryEventParam(CommonTelemetryEventSchema.RAM, int(EventLogger._get_ram(osutil))))
self._common_parameters.append(TelemetryEventParam(CommonTelemetryEventSchema.Processors, int(EventLogger._get_processors(osutil))))
self._common_parameters.append(TelemetryEventParam(CommonTelemetryEventSchema.KeywordName, json.dumps(keyword_name)))

# Parameters from goal state
self._common_parameters.append(TelemetryEventParam(CommonTelemetryEventSchema.TenantName, "TenantName_UNINITIALIZED"))
Expand Down Expand Up @@ -596,8 +601,7 @@ def add_common_event_parameters(self, event, event_timestamp):
TelemetryEventParam(CommonTelemetryEventSchema.OpcodeName, event_timestamp.strftime(logger.Logger.LogTimeFormatInUTC)),
TelemetryEventParam(CommonTelemetryEventSchema.EventTid, threading.current_thread().ident),
TelemetryEventParam(CommonTelemetryEventSchema.EventPid, os.getpid()),
TelemetryEventParam(CommonTelemetryEventSchema.TaskName, threading.current_thread().getName()),
TelemetryEventParam(CommonTelemetryEventSchema.KeywordName, '')]
TelemetryEventParam(CommonTelemetryEventSchema.TaskName, threading.current_thread().getName())]

if event.eventId == TELEMETRY_EVENT_EVENT_ID and event.providerId == TELEMETRY_EVENT_PROVIDER_ID:
# Currently only the GuestAgentExtensionEvents has these columns, the other tables dont have them so skipping
Expand Down
8 changes: 8 additions & 0 deletions azurelinuxagent/common/osutil/default.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,14 @@ def get_systemd_unit_file_install_path():
def get_agent_bin_path():
return "/usr/sbin"

@staticmethod
def get_vm_arch():
try:
return platform.machine()
except Exception as e:
logger.warn("Unable to determine cpu architecture: {0}", ustr(e))
return "unknown"

def get_firewall_dropped_packets(self, dst_ip=None):
# If a previous attempt failed, do not retry
global _enable_firewall # pylint: disable=W0603
Expand Down
10 changes: 1 addition & 9 deletions azurelinuxagent/common/protocol/goal_state.py
Original file line number Diff line number Diff line change
Expand Up @@ -579,26 +579,19 @@ def __init__(self, xml_text, my_logger):

# The parsing process use public key to match prv and crt.
buf = []
begin_crt = False # pylint: disable=W0612
begin_prv = False # pylint: disable=W0612
prvs = {}
thumbprints = {}
index = 0
v1_cert_list = []
with open(pem_file) as pem:
for line in pem.readlines():
buf.append(line)
if re.match(r'[-]+BEGIN.*KEY[-]+', line):
begin_prv = True
elif re.match(r'[-]+BEGIN.*CERTIFICATE[-]+', line):
begin_crt = True
elif re.match(r'[-]+END.*KEY[-]+', line):
if re.match(r'[-]+END.*KEY[-]+', line):
tmp_file = Certificates._write_to_tmp_file(index, 'prv', buf)
pub = cryptutil.get_pubkey_from_prv(tmp_file)
prvs[pub] = tmp_file
buf = []
index += 1
begin_prv = False
elif re.match(r'[-]+END.*CERTIFICATE[-]+', line):
tmp_file = Certificates._write_to_tmp_file(index, 'crt', buf)
pub = cryptutil.get_pubkey_from_crt(tmp_file)
Expand All @@ -613,7 +606,6 @@ def __init__(self, xml_text, my_logger):
os.rename(tmp_file, os.path.join(conf.get_lib_dir(), crt))
buf = []
index += 1
begin_crt = False

# Rename prv key with thumbprint as the file name
for pubkey in prvs:
Expand Down
19 changes: 15 additions & 4 deletions azurelinuxagent/common/utils/cryptutil.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,10 +53,21 @@ def gen_transport_cert(self, prv_file, crt_file):
def get_pubkey_from_prv(self, file_name):
if not os.path.exists(file_name):
raise IOError(errno.ENOENT, "File not found", file_name)
else:
cmd = [self.openssl_cmd, "rsa", "-in", file_name, "-pubout"]
pub = shellutil.run_command(cmd, log_error=True)
return pub

# OpenSSL's pkey command may not be available on older versions so try 'rsa' first.
try:
command = [self.openssl_cmd, "rsa", "-in", file_name, "-pubout"]
return shellutil.run_command(command, log_error=False)
except shellutil.CommandError as error:
if not ("Not an RSA key" in error.stderr or "expecting an rsa key" in error.stderr):
logger.error(
"Command: [{0}], return code: [{1}], stdout: [{2}] stderr: [{3}]",
" ".join(command),
error.returncode,
error.stdout,
error.stderr)
raise
return shellutil.run_command([self.openssl_cmd, "pkey", "-in", file_name, "-pubout"], log_error=True)

def get_pubkey_from_crt(self, file_name):
if not os.path.exists(file_name):
Expand Down
10 changes: 5 additions & 5 deletions azurelinuxagent/ga/collect_logs.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,16 +83,16 @@ def get_thread_name():
return CollectLogsHandler._THREAD_NAME

@staticmethod
def enable_cgroups_validation():
def enable_monitor_cgroups_check():
os.environ[CollectLogsHandler.__CGROUPS_FLAG_ENV_VARIABLE] = "1"

@staticmethod
def disable_cgroups_validation():
def disable_monitor_cgroups_check():
if CollectLogsHandler.__CGROUPS_FLAG_ENV_VARIABLE in os.environ:
del os.environ[CollectLogsHandler.__CGROUPS_FLAG_ENV_VARIABLE]

@staticmethod
def should_validate_cgroups():
def is_enabled_monitor_cgroups_check():
if CollectLogsHandler.__CGROUPS_FLAG_ENV_VARIABLE in os.environ:
return os.environ[CollectLogsHandler.__CGROUPS_FLAG_ENV_VARIABLE] == "1"
return False
Expand Down Expand Up @@ -147,7 +147,7 @@ def daemon(self):
time.sleep(_INITIAL_LOG_COLLECTION_DELAY)

try:
CollectLogsHandler.enable_cgroups_validation()
CollectLogsHandler.enable_monitor_cgroups_check()
if self.protocol_util is None or self.protocol is None:
self.init_protocols()

Expand All @@ -162,7 +162,7 @@ def daemon(self):
except Exception as e:
logger.error("An error occurred in the log collection thread; will exit the thread.\n{0}", ustr(e))
finally:
CollectLogsHandler.disable_cgroups_validation()
CollectLogsHandler.disable_monitor_cgroups_check()

def collect_and_send_logs(self):
if self._collect_logs():
Expand Down
15 changes: 1 addition & 14 deletions azurelinuxagent/ga/logcollector.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@
from datetime import datetime
from heapq import heappush, heappop

from azurelinuxagent.ga.cgroup import CpuCgroup, AGENT_LOG_COLLECTOR, MemoryCgroup
from azurelinuxagent.common.conf import get_lib_dir, get_ext_log_dir, get_agent_log_file
from azurelinuxagent.common.event import initialize_event_logger_vminfo_common_parameters
from azurelinuxagent.common.future import ustr
Expand Down Expand Up @@ -71,14 +70,13 @@ class LogCollector(object):

_TRUNCATED_FILE_PREFIX = "truncated_"

def __init__(self, is_full_mode=False, cpu_cgroup_path=None, memory_cgroup_path=None):
def __init__(self, is_full_mode=False):
self._is_full_mode = is_full_mode
self._manifest = MANIFEST_FULL if is_full_mode else MANIFEST_NORMAL
self._must_collect_files = self._expand_must_collect_files()
self._create_base_dirs()
self._set_logger()
self._initialize_telemetry()
self.cgroups = self._set_resource_usage_cgroups(cpu_cgroup_path, memory_cgroup_path)

@staticmethod
def _mkdir(dirname):
Expand All @@ -105,17 +103,6 @@ def _set_logger():
_LOGGER.addHandler(_f_handler)
_LOGGER.setLevel(logging.INFO)

@staticmethod
def _set_resource_usage_cgroups(cpu_cgroup_path, memory_cgroup_path):
cpu_cgroup = CpuCgroup(AGENT_LOG_COLLECTOR, cpu_cgroup_path)
msg = "Started tracking cpu cgroup {0}".format(cpu_cgroup)
_LOGGER.info(msg)
cpu_cgroup.initialize_cpu_usage()
memory_cgroup = MemoryCgroup(AGENT_LOG_COLLECTOR, memory_cgroup_path)
msg = "Started tracking memory cgroup {0}".format(memory_cgroup)
_LOGGER.info(msg)
return [cpu_cgroup, memory_cgroup]

@staticmethod
def _initialize_telemetry():
protocol = get_protocol_util().get_protocol(init_goal_state=False)
Expand Down
40 changes: 33 additions & 7 deletions azurelinuxagent/ga/update.py
Original file line number Diff line number Diff line change
Expand Up @@ -315,17 +315,22 @@ def run(self, debug=False):
logger.info("OS: {0} {1}", DISTRO_NAME, DISTRO_VERSION)
logger.info("Python: {0}.{1}.{2}", PY_VERSION_MAJOR, PY_VERSION_MINOR, PY_VERSION_MICRO)

vm_arch = self.osutil.get_vm_arch()
logger.info("CPU Arch: {0}", vm_arch)

os_info_msg = u"Distro: {dist_name}-{dist_ver}; "\
u"OSUtil: {util_name}; AgentService: {service_name}; "\
u"OSUtil: {util_name}; "\
u"AgentService: {service_name}; "\
u"Python: {py_major}.{py_minor}.{py_micro}; "\
u"Arch: {vm_arch}; "\
u"systemd: {systemd}; "\
u"LISDrivers: {lis_ver}; "\
u"logrotate: {has_logrotate};".format(
dist_name=DISTRO_NAME, dist_ver=DISTRO_VERSION,
util_name=type(self.osutil).__name__,
service_name=self.osutil.service_name,
py_major=PY_VERSION_MAJOR, py_minor=PY_VERSION_MINOR,
py_micro=PY_VERSION_MICRO, systemd=systemd.is_systemd(),
py_micro=PY_VERSION_MICRO, vm_arch=vm_arch, systemd=systemd.is_systemd(),
lis_ver=get_lis_version(), has_logrotate=has_logrotate()
)
logger.info(os_info_msg)
Expand All @@ -343,6 +348,7 @@ def run(self, debug=False):

# Send telemetry for the OS-specific info.
add_event(AGENT_NAME, op=WALAEventOperation.OSInfo, message=os_info_msg)
self._log_openssl_info()

#
# Perform initialization tasks
Expand Down Expand Up @@ -409,6 +415,29 @@ def run(self, debug=False):
self._shutdown()
sys.exit(0)

@staticmethod
def _log_openssl_info():
try:
version = shellutil.run_command(["openssl", "version"])
message = "OpenSSL version: {0}".format(version)
logger.info(message)
add_event(op=WALAEventOperation.OpenSsl, message=message, is_success=True)
except Exception as e:
message = "Failed to get OpenSSL version: {0}".format(e)
logger.info(message)
add_event(op=WALAEventOperation.OpenSsl, message=message, is_success=False, log_event=False)
#
# Collect telemetry about the 'pkey' command. CryptUtil get_pubkey_from_prv() uses the 'pkey' command only as a fallback after trying 'rsa'.
# 'pkey' also works for RSA keys, but it may not be available on older versions of OpenSSL. Check telemetry after a few releases and if there
# are no versions of OpenSSL that do not support 'pkey' consider removing the use of 'rsa' altogether.
#
try:
shellutil.run_command(["openssl", "help", "pkey"])
except Exception as e:
message = "OpenSSL does not support the pkey command: {0}".format(e)
logger.info(message)
add_event(op=WALAEventOperation.OpenSsl, message=message, is_success=False, log_event=False)

def _initialize_goal_state(self, protocol):
#
# Block until we can fetch the first goal state (self._try_update_goal_state() does its own logging and error handling).
Expand Down Expand Up @@ -988,13 +1017,10 @@ def _send_heartbeat_telemetry(self, protocol):
if datetime.utcnow() >= (self._last_telemetry_heartbeat + UpdateHandler.TELEMETRY_HEARTBEAT_PERIOD):
dropped_packets = self.osutil.get_firewall_dropped_packets(protocol.get_endpoint())
auto_update_enabled = 1 if conf.get_autoupdate_enabled() else 0
# Include vm architecture in the heartbeat message because the kusto table does not have
# a separate column for it.
vmarch = self._get_vm_arch()

telemetry_msg = "{0};{1};{2};{3};{4};{5}".format(self._heartbeat_counter, self._heartbeat_id, dropped_packets,
telemetry_msg = "{0};{1};{2};{3};{4}".format(self._heartbeat_counter, self._heartbeat_id, dropped_packets,
self._heartbeat_update_goal_state_error_count,
auto_update_enabled, vmarch)
auto_update_enabled)
debug_log_msg = "[DEBUG HeartbeatCounter: {0};HeartbeatId: {1};DroppedPackets: {2};" \
"UpdateGSErrors: {3};AutoUpdate: {4}]".format(self._heartbeat_counter,
self._heartbeat_id, dropped_packets,
Expand Down
3 changes: 2 additions & 1 deletion tests/common/test_event.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@

import json
import os
import platform
import re
import shutil
import threading
Expand Down Expand Up @@ -70,7 +71,7 @@ def setUp(self):
CommonTelemetryEventSchema.EventTid: threading.current_thread().ident,
CommonTelemetryEventSchema.EventPid: os.getpid(),
CommonTelemetryEventSchema.TaskName: threading.current_thread().getName(),
CommonTelemetryEventSchema.KeywordName: '',
CommonTelemetryEventSchema.KeywordName: json.dumps({"CpuArchitecture": platform.machine()}),
# common parameters computed from the OS platform
CommonTelemetryEventSchema.OSVersion: EventLoggerTools.get_expected_os_version(),
CommonTelemetryEventSchema.ExecutionMode: AGENT_EXECUTION_MODE,
Expand Down
13 changes: 13 additions & 0 deletions tests/common/utils/test_crypt_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,19 @@ def test_get_pubkey_from_crt(self):
with open(expected_pub_key) as fh:
self.assertEqual(fh.read(), crypto.get_pubkey_from_prv(prv_key))

def test_get_pubkey_from_prv(self):
crypto = CryptUtil(conf.get_openssl_cmd())

def do_test(prv_key, expected_pub_key):
prv_key = os.path.join(data_dir, "wire", prv_key)
expected_pub_key = os.path.join(data_dir, "wire", expected_pub_key)

with open(expected_pub_key) as fh:
self.assertEqual(fh.read(), crypto.get_pubkey_from_prv(prv_key))

do_test("rsa-key.pem", "rsa-key.pub.pem")
do_test("ec-key.pem", "ec-key.pub.pem")

def test_get_pubkey_from_crt_invalid_file(self):
crypto = CryptUtil(conf.get_openssl_cmd())
prv_key = os.path.join(data_dir, "wire", "trans_prv_does_not_exist")
Expand Down
Loading

0 comments on commit ceb1661

Please sign in to comment.