Skip to content

Commit

Permalink
Merge pull request #1983 from DataDog/zeller/config-reload
Browse files Browse the repository at this point in the history
[core] Hot reloading of integration configs
  • Loading branch information
JohnLZeller committed Oct 27, 2015
2 parents 34632ac + c6de730 commit 8dc6607
Showing 6 changed files with 105 additions and 27 deletions.
82 changes: 61 additions & 21 deletions agent.py
Original file line number Diff line number Diff line change
@@ -68,8 +68,14 @@ def __init__(self, pidfile, autorestart, start_event=True, in_developer_mode=Fal
self.collector = None
self.start_event = start_event
self.in_developer_mode = in_developer_mode
self._agentConfig = {}
self._checksd = []
self.collector_profile_interval = DEFAULT_COLLECTOR_PROFILE_INTERVAL
self.check_frequency = None
self.configs_reloaded = False

def _handle_sigterm(self, signum, frame):
"""Handles SIGTERM and SIGINT, which gracefully stops the agent."""
log.debug("Caught sigterm. Stopping run loop.")
self.run_forever = False

@@ -78,9 +84,32 @@ def _handle_sigterm(self, signum, frame):
log.debug("Collector is stopped.")

def _handle_sigusr1(self, signum, frame):
"""Handles SIGUSR1, which signals an exit with an autorestart.

The signal number and frame are forwarded unchanged to the SIGTERM
handler, which stops the run loop; self._do_restart() is called right
after that handler returns.
"""
self._handle_sigterm(signum, frame)
self._do_restart()

def _handle_sighup(self, signum, frame):
    """Handles SIGHUP, which signals a configuration reload.

    The reload is performed synchronously from within the handler. The
    configs_reloaded flag is only raised when reload_configs() returns
    normally, so the run loop is never told about a reload that failed.

    signum and frame are the standard signal-handler arguments; they are
    not used.
    """
    log.info("SIGHUP caught!")
    try:
        self.reload_configs()
    except Exception:
        # An exception escaping a signal handler is raised in the main
        # thread at whatever point it was interrupted, which would abort
        # the run loop. Log the failure and leave the flag untouched.
        log.exception("Configuration reload failed")
        return
    self.configs_reloaded = True

def reload_configs(self):
    """Reloads the checksd configurations using the current agent config.

    The hostname is resolved from self._agentConfig and the check directory
    is loaded again; the result replaces self._checksd. self._agentConfig
    is only read here, never reassigned.
    """
    log.info("Attempting a configuration reload...")

    # Swap in a freshly loaded set of checks
    agent_config = self._agentConfig
    self._checksd = load_check_directory(agent_config, get_hostname(agent_config))

    # Report the outcome
    loaded = len(self._checksd['initialized_checks'])
    if not loaded:
        log.info("No checksd configs found")
        return
    log.info("Successfully reloaded {num_checks} checks".format(num_checks=loaded))

@classmethod
def info(cls, verbose=None):
logging.getLogger().setLevel(logging.ERROR)
@@ -98,39 +127,46 @@ def run(self, config=None):
# Handle Keyboard Interrupt
signal.signal(signal.SIGINT, self._handle_sigterm)

# A SIGHUP signals a configuration reload
signal.signal(signal.SIGHUP, self._handle_sighup)

# Save the agent start-up stats.
CollectorStatus().persist()

# Initialize the collector.
if not config:
config = get_config(parse_args=True)

agentConfig = self._set_agent_config_hostname(config)
hostname = get_hostname(agentConfig)
self._agentConfig = self._set_agent_config_hostname(config)
hostname = get_hostname(self._agentConfig)
systemStats = get_system_stats()
emitters = self._get_emitters(agentConfig)
emitters = self._get_emitters()

# Load the checks.d checks
checksd = load_check_directory(agentConfig, hostname)
self._checksd = load_check_directory(self._agentConfig, hostname)

self.collector = Collector(agentConfig, emitters, systemStats, hostname)
# Initialize the Collector
self.collector = Collector(self._agentConfig, emitters, systemStats, hostname)

# In developer mode, the number of runs to be included in a single collector profile
collector_profile_interval = agentConfig.get('collector_profile_interval',
DEFAULT_COLLECTOR_PROFILE_INTERVAL)
self.collector_profile_interval = self._agentConfig.get('collector_profile_interval',
DEFAULT_COLLECTOR_PROFILE_INTERVAL)

# Configure the watchdog.
check_frequency = int(agentConfig['check_freq'])
watchdog = self._get_watchdog(check_frequency, agentConfig)
self.check_frequency = int(self._agentConfig['check_freq'])
watchdog = self._get_watchdog(self.check_frequency)

# Initialize the auto-restarter
self.restart_interval = int(agentConfig.get('restart_interval', RESTART_INTERVAL))
self.restart_interval = int(self._agentConfig.get('restart_interval', RESTART_INTERVAL))
self.agent_start = time.time()

profiled = False
collector_profiled_runs = 0

# Run the main loop.
while self.run_forever:
log.debug("Found {num_checks} checks".format(num_checks=len(self._checksd['initialized_checks'])))

# Setup profiling if necessary
if self.in_developer_mode and not profiled:
try:
@@ -141,9 +177,13 @@ def run(self, config=None):
log.warn("Cannot enable profiler: %s" % str(e))

# Do the work.
self.collector.run(checksd=checksd, start_event=self.start_event)
self.collector.run(checksd=self._checksd,
start_event=self.start_event,
configs_reloaded=self.configs_reloaded)
if self.configs_reloaded:
self.configs_reloaded = False
if profiled:
if collector_profiled_runs >= collector_profile_interval:
if collector_profiled_runs >= self.collector_profile_interval:
try:
profiler.disable_profiling()
profiled = False
@@ -155,33 +195,33 @@ def run(self, config=None):
if self.autorestart and self._should_restart():
self._do_restart()

# Only plan for the next loop if we will continue,
# otherwise just exit quickly.
# Only plan for next loop if we will continue, otherwise exit quickly.
if self.run_forever:
if watchdog:
watchdog.reset()
if profiled:
collector_profiled_runs += 1
time.sleep(check_frequency)
log.info("Sleeping for {0} seconds".format(self.check_frequency))
time.sleep(self.check_frequency)

# Now clean-up.
try:
CollectorStatus.remove_latest_status()
except Exception:
pass

# Explicitly kill the process, because it might be running
# as a daemon.
# Explicitly kill the process, because it might be running as a daemon.
log.info("Exiting. Bye bye.")
sys.exit(0)

# Returns the emitters for the Collector: a one-element list holding
# http_emitter. Nothing besides that constant is consulted.
# NOTE(review): two signature variants appear back to back here (with and
# without an agentConfig argument); the body uses neither self nor agentConfig.
def _get_emitters(self, agentConfig):
def _get_emitters(self):
return [http_emitter]

# Builds the watchdog used by the collector loop.
# Returns None unless the "watchdog" config option is truthy (it defaults
# to True). When enabled, a Watchdog is created with
# check_freq * WATCHDOG_MULTIPLIER as its first argument and the optional
# "limit_memory_consumption" option (default None) as max_mem_mb, and
# reset() is called on it before it is returned.
# NOTE(review): two variants of the signature and of the config lookups are
# interleaved below (explicit agentConfig argument vs. self._agentConfig).
def _get_watchdog(self, check_freq, agentConfig):
def _get_watchdog(self, check_freq):
watchdog = None
if agentConfig.get("watchdog", True):
if self._agentConfig.get("watchdog", True):
watchdog = Watchdog(check_freq * WATCHDOG_MULTIPLIER,
max_mem_mb=agentConfig.get('limit_memory_consumption', None))
max_mem_mb=self._agentConfig.get('limit_memory_consumption', None))
watchdog.reset()
return watchdog

5 changes: 3 additions & 2 deletions checks/collector.py
Original file line number Diff line number Diff line change
@@ -249,10 +249,11 @@ def _stats_for_display(raw_stats):
return pprint.pformat(raw_stats, indent=4)

@log_exceptions(log)
def run(self, checksd=None, start_event=True):
def run(self, checksd=None, start_event=True, configs_reloaded=False):
"""
Collect data from each check and submit their data.
"""
log.debug("Found {num_checks} checks".format(num_checks=len(checksd['initialized_checks'])))
timer = Timer()
if not Platform.is_windows():
cpu_clock = time.clock()
@@ -267,7 +268,7 @@ def run(self, checksd=None, start_event=True):

# Find the AgentMetrics check and pop it out
# This check must run at the end of the loop to collect info on agent performance
if not self._agent_metrics:
if not self._agent_metrics or configs_reloaded:
for check in self.initialized_checks_d:
if check.name == AGENT_METRICS_CHECK_NAME:
self._agent_metrics = check
18 changes: 17 additions & 1 deletion packaging/centos/datadog-agent.init
Original file line number Diff line number Diff line change
@@ -25,6 +25,7 @@
AGENTPATH="/opt/datadog-agent/agent/agent.py"
AGENTCONF="/etc/dd-agent/datadog.conf"
DOGSTATSDPATH="/opt/datadog-agent/agent/dogstatsd.py"
KILL_PATH="/opt/datadog-agent/embedded/bin/kill"
AGENTUSER="dd-agent"
PIDPATH="/var/run/dd-agent/"
PROG="datadog-agent"
@@ -35,6 +36,7 @@ SUPERVISORCTL_PATH="/opt/datadog-agent/bin/supervisorctl"
SUPERVISOR_CONF="/etc/dd-agent/supervisor.conf"
SUPERVISOR_SOCK="/opt/datadog-agent/run/datadog-supervisor.sock"
SUPERVISOR_PIDFILE="/opt/datadog-agent/run/datadog-supervisord.pid"
COLLECTOR_PIDFILE="/opt/datadog-agent/run/dd-agent.pid"

# Source function library.
. /etc/rc.d/init.d/functions
@@ -150,6 +152,11 @@ info() {
exit $(($FORWARDER_RETURN+$COLLECTOR_RETURN+DOGSTATSD_RETURN))
}

reload() {
    # Ask the running collector to reload its check configs by sending it
    # SIGHUP. The PID is read from $COLLECTOR_PIDFILE; the script exits
    # with the status of the kill command.
    if [ ! -r "$COLLECTOR_PIDFILE" ]; then
        # Without this guard a missing pidfile ends up running
        # "kill -HUP" with no PID and a confusing usage error.
        echo "Cannot reload: pidfile $COLLECTOR_PIDFILE is missing or unreadable" >&2
        exit 1
    fi

    collector_pid=$(cat "$COLLECTOR_PIDFILE")
    if [ -z "$collector_pid" ]; then
        echo "Cannot reload: pidfile $COLLECTOR_PIDFILE is empty" >&2
        exit 1
    fi

    "$KILL_PATH" -HUP $collector_pid
    exit $?
}

configcheck() {
su $AGENTUSER -c "$AGENTPATH configcheck"
exit $?
@@ -160,12 +167,15 @@ case "$1" in
start)
start
;;

stop)
stop
;;

restart)
restart
;;

status)
# Note: sh does not support arrays
# Check for kernel version 3.18+ - overlayfs has known bug affecting unix domain sockets
@@ -185,9 +195,15 @@ case "$1" in
check_status
fi
;;

info)
info "$@"
;;

reload)
reload
;;

configcheck)
configcheck
;;
@@ -209,7 +225,7 @@ case "$1" in
;;

*)
echo "Usage: $0 {start|stop|restart|info|status|configcheck|configtest|jmx}"
echo "Usage: $0 {start|stop|restart|info|status|reload|configcheck|configtest|jmx}"
exit 2
esac
exit $?
9 changes: 8 additions & 1 deletion packaging/datadog-agent/source/agent
Original file line number Diff line number Diff line change
@@ -3,11 +3,13 @@ BASEDIR=$(dirname $0)
cd "$BASEDIR/.."

PATH=$BASEDIR/../venv/bin:$PATH
EMBEDDED_BIN_PATH="/opt/datadog-agent/embedded/bin"

SUPERVISOR_NOT_RUNNING="Supervisor is not running"
SUPERVISOR_CONF_FILE='supervisord/supervisord.conf'
SOCK_FILE='run/agent-supervisor.sock'
PID_FILE='run/supervisord.pid'
COLLECTOR_PIDFILE='run/dd-agent.pid'
action=$1

if [ ! -n "$action" ]; then
@@ -106,6 +108,11 @@ case $action in
exit $?
;;

reload)
$EMBEDDED_BIN_PATH/kill -HUP `cat $COLLECTOR_PIDFILE`
exit $?
;;

info)
shift # shift to pass the remaining arguments to agent/agent.py info.
# Currently only agent.py takes additional arguments
@@ -144,7 +151,7 @@ case $action in


*)
echo "Usage: $0 {start|stop|restart|info|status|configcheck|check|jmx}"
echo "Usage: $0 {start|stop|restart|info|status|reload|configcheck|check|jmx}"
exit 2
;;
esac
9 changes: 8 additions & 1 deletion packaging/debian/datadog-agent.init
Original file line number Diff line number Diff line change
@@ -16,6 +16,7 @@ PATH=$PATH:/sbin # add the location of start-stop-daemon on Debian
AGENTPATH="/opt/datadog-agent/agent/agent.py"
AGENTCONF="/etc/dd-agent/datadog.conf"
DOGSTATSDPATH="/opt/datadog-agent/agent/dogstatsd.py"
KILL_PATH="/opt/datadog-agent/embedded/bin/kill"
AGENTUSER="dd-agent"
FORWARDERPATH="/opt/datadog-agent/agent/ddagent.py"
NAME="datadog-agent"
@@ -25,6 +26,7 @@ SUPERVISOR_FILE="/etc/dd-agent/supervisor.conf"
SUPERVISOR_SOCK="/opt/datadog-agent/run/datadog-supervisor.sock"
SUPERVISORCTL_PATH="/opt/datadog-agent/bin/supervisorctl"
SUPERVISORD_PATH="/opt/datadog-agent/bin/supervisord"
COLLECTOR_PIDFILE="/opt/datadog-agent/run/dd-agent.pid"
SYSTEM_PATH=/opt/datadog-agent/embedded/bin:/opt/datadog-agent/bin:$PATH

if [ ! -x $AGENTPATH ]; then
@@ -154,6 +156,11 @@ case "$1" in
exit $?
;;

reload)
$KILL_PATH -HUP `cat $COLLECTOR_PIDFILE`
exit $?
;;

restart|force-reload)
$0 stop
$0 start
@@ -183,7 +190,7 @@ case "$1" in

*)
N=/etc/init.d/$NAME
echo "Usage: $N {start|stop|restart|info|status|configcheck|configtest|jmx|flare}"
echo "Usage: $N {start|stop|restart|info|status|reload|configcheck|configtest|jmx|flare}"
exit 1
;;
esac
9 changes: 8 additions & 1 deletion packaging/osx/datadog-agent
Original file line number Diff line number Diff line change
@@ -4,12 +4,14 @@ DESC="Datadog Agent"
AGENTPATH="/opt/datadog-agent/agent/agent.py"
FORWARDERPATH="/opt/datadog-agent/agent/ddagent.py"
DOGSTATSDPATH="/opt/datadog-agent/agent/dogstatsd.py"
KILL_PATH="/opt/datadog-agent/embedded/bin/kill"
AGENTCONF="/opt/datadog-agent/etc/datadog.conf"
SUPERVISOR_PIDFILE="/opt/datadog-agent/run/datadog-supervisord.pid"
SUPERVISOR_CONF_FILE="/opt/datadog-agent/etc/supervisor.conf"
SUPERVISOR_SOCK="/opt/datadog-agent/run/datadog-supervisor.sock"
SUPERVISORCTL_PATH="/opt/datadog-agent/bin/supervisorctl"
SUPERVISORD_PATH="/opt/datadog-agent/bin/supervisord"
COLLECTOR_PIDFILE="/opt/datadog-agent/run/dd-agent.pid"

# be sure to remove PY2APP parameters
unset PYTHONHOME
@@ -95,6 +97,11 @@ case $1 in
exit $?
;;

reload)
$KILL_PATH -HUP `cat $COLLECTOR_PIDFILE`
exit $?
;;

info)
shift # shift to pass the remaining arguments to agent/agent.py info.
# Currently only agent.py takes additional arguments
@@ -131,7 +138,7 @@ case $1 in
;;

*)
echo "Usage: $0 {start|stop|restart|info|status|configcheck|check|flare|jmx}"
echo "Usage: $0 {start|stop|restart|info|status|reload|configcheck|check|flare|jmx}"
exit 2
;;
esac

0 comments on commit 8dc6607

Please sign in to comment.