From 27df02534b7e6f5bb59ab3d296078decaa5d93e7 Mon Sep 17 00:00:00 2001 From: Drew Kerrigan Date: Mon, 9 Oct 2017 17:41:45 -0400 Subject: [PATCH] Add container syslogd support and fix 'Waiting for Pids' race condition (#505) * Add additional debug logs in reload * Adding syslog support * Make syslogd and retry reload configurable * Update documentation * Make infinite retries optional, run syslogd with runsvdir --- Dockerfile | 1 + Longhelp.md | 13 ++++++++++-- README.md | 10 +++++++++ haproxy_wrapper.py | 20 ++++++++++++++++- marathon_lb.py | 52 ++++++++++++++++++++++++++++++++++++++++----- run | 32 +++++++++++++++++++++++----- service/haproxy/run | 34 +++++++++++++++++++++++------ syslogd/run | 9 ++++++++ syslogd/syslog.conf | 1 + 9 files changed, 153 insertions(+), 19 deletions(-) create mode 100755 syslogd/run create mode 100644 syslogd/syslog.conf diff --git a/Dockerfile b/Dockerfile index e8e91a7a..d9f5a3c2 100644 --- a/Dockerfile +++ b/Dockerfile @@ -3,6 +3,7 @@ FROM debian:stretch # runtime dependencies RUN apt-get update && apt-get install -y --no-install-recommends \ ca-certificates \ + inetutils-syslogd \ iptables \ libcurl3 \ liblua5.3-0 \ diff --git a/Longhelp.md b/Longhelp.md index bf8a7b57..1ccc9a95 100644 --- a/Longhelp.md +++ b/Longhelp.md @@ -18,8 +18,10 @@ Marathon-lb just needs to know where to find Marathon. ``` usage: marathon_lb.py [-h] [--longhelp] [--marathon MARATHON [MARATHON ...]] [--haproxy-config HAPROXY_CONFIG] [--group GROUP] - [--command COMMAND] [--strict-mode] [--sse] - [--health-check] + [--command COMMAND] + [--max-reload-retries MAX_RELOAD_RETRIES] + [--reload-interval RELOAD_INTERVAL] [--strict-mode] + [--sse] [--health-check] [--lru-cache-capacity LRU_CACHE_CAPACITY] [--haproxy-map] [--dont-bind-http-https] [--ssl-certs SSL_CERTS] [--skip-validation] [--dry] @@ -51,6 +53,13 @@ optional arguments: --command COMMAND, -c COMMAND If set, run this command to reload haproxy. (default: None) + --max-reload-retries MAX_RELOAD_RETRIES + Max reload retries before failure. Reloads happen + every --reload-interval seconds. Set to 0 to disable + or -1 for infinite retries. (default: 10) + --reload-interval RELOAD_INTERVAL + Wait this number of seconds betwee nreload retries. + (default: 10) --strict-mode If set, backends are only advertised if HAPROXY_{n}_ENABLED=true. Strict mode will be enabled by default in a future release. (default: False) diff --git a/README.md b/README.md index 4c3bb8ab..2c5d6f6c 100644 --- a/README.md +++ b/README.md @@ -288,6 +288,16 @@ The default value when not specified is `redispatch,http-server-close,dontlognul < HTTP/1.1 200 OK ``` * Some of the features of marathon-lb assume that it is the only instance of itself running in a PID namespace. i.e. marathon-lb assumes that it is running in a container. Certain features like the `/_mlb_signal` endpoints and the `/_haproxy_getpids` endpoint (and by extension, zero-downtime deployments) may behave unexpectedly if more than one instance of marathon-lb is running in the same PID namespace or if there are other HAProxy processes in the same PID namespace. + * Sometimes it is desirable to get detailed container and HAProxy logging for easier debugging as well as viewing connection logging to frontends and backends. This can be achieved by setting the `HAPROXY_SYSLOGD` environment variable or `container-syslogd` value in `options.json` like so: + + ```json + { + "marathon-lb": { + "container-syslogd": true + } + } + ``` + ## Zero-downtime Deployments diff --git a/haproxy_wrapper.py b/haproxy_wrapper.py index 6035047f..e8698df1 100755 --- a/haproxy_wrapper.py +++ b/haproxy_wrapper.py @@ -3,35 +3,53 @@ import sys import time import errno +import logging + +logger = logging.getLogger('haproxy_wrapper') +logger.setLevel(getattr(logging, 'DEBUG')) +formatter = logging.Formatter("%(asctime)-15s %(name)s: %(message)s") +consoleHandler = logging.StreamHandler() +consoleHandler.setFormatter(formatter) +logger.addHandler(consoleHandler) def create_haproxy_pipe(): + logger.debug("create_haproxy_pipe called") pipefd = os.pipe() os.set_inheritable(pipefd[0], True) os.set_inheritable(pipefd[1], True) + logger.debug("create_haproxy_pipe done") return pipefd def close_and_swallow(fd): + logger.debug("close_and_swallow called") try: os.close(fd) - except OSError: + logger.debug("close_and_swallow successful") + except OSError as e: # swallow + logger.debug("close_and_swallow swallow OSError: %s", e) pass def wait_on_haproxy_pipe(pipefd): + logger.debug("wait_on_haproxy_pipe called") try: ret = os.read(pipefd[0], 1) if len(ret) == 0: close_and_swallow(pipefd[0]) close_and_swallow(pipefd[1]) + logger.debug("wait_on_haproxy_pipe done (False)") return False except OSError as e: + logger.debug("wait_on_haproxy_pipe OSError: %s", e) if e.args[0] != errno.EINTR: close_and_swallow(pipefd[0]) close_and_swallow(pipefd[1]) + logger.debug("wait_on_haproxy_pipe done (False)") return False + logger.debug("wait_on_haproxy_pipe done (True)") return True diff --git a/marathon_lb.py b/marathon_lb.py index c72dbb93..d045ea47 100755 --- a/marathon_lb.py +++ b/marathon_lb.py @@ -615,7 +615,8 @@ def get_haproxy_pids(): "pidof haproxy", stderr=subprocess.STDOUT, shell=True).split())) - except subprocess.CalledProcessError: + except subprocess.CalledProcessError as ex: + logger.debug("Unable to get haproxy pids: %s", ex) return set() @@ -646,14 +647,46 @@ def reloadConfig(): logger.info("reloading using %s", " ".join(reloadCommand)) try: start_time = time.time() + checkpoint_time = start_time + # Retry or log the reload every 10 seconds + reload_frequency = args.reload_interval + reload_retries = args.max_reload_retries + enable_retries = True + infinite_retries = False + if reload_retries == 0: + enable_retries = False + elif reload_retries < 0: + infinite_retries = True old_pids = get_haproxy_pids() subprocess.check_call(reloadCommand, close_fds=True) + new_pids = get_haproxy_pids() + logger.debug("Waiting for new haproxy pid (old pids: [%s], " + + "new_pids: [%s])...", old_pids, new_pids) # Wait until the reload actually occurs and there's a new PID - while len(get_haproxy_pids() - old_pids) < 1: - logger.debug("Waiting for new haproxy pid...") + while True: + if len(new_pids - old_pids) >= 1: + logger.debug("new pids: [%s]", new_pids) + logger.debug("reload finished, took %s seconds", + time.time() - start_time) + break + timeSinceCheckpoint = time.time() - checkpoint_time + if (timeSinceCheckpoint >= reload_frequency): + logger.debug("Still waiting for new haproxy pid after " + + "%s seconds (old pids: [%s], " + + "new_pids: [%s]).", + time.time() - start_time, old_pids, new_pids) + checkpoint_time = time.time() + if enable_retries: + if not infinite_retries: + reload_retries -= 1 + if reload_retries == 0: + logger.debug("reload failed after %s seconds", + time.time() - start_time) + break + logger.debug("Attempting reload again...") + subprocess.check_call(reloadCommand, close_fds=True) time.sleep(0.1) - logger.debug("reload finished, took %s seconds", - time.time() - start_time) + new_pids = get_haproxy_pids() except OSError as ex: logger.error("unable to reload config using command %s", " ".join(reloadCommand)) @@ -1602,6 +1635,15 @@ def get_arg_parser(): parser.add_argument("--command", "-c", help="If set, run this command to reload haproxy.", default=None) + parser.add_argument("--max-reload-retries", + help="Max reload retries before failure. Reloads" + " happen every --reload-interval seconds. Set to" + " 0 to disable or -1 for infinite retries.", + type=int, default=10) + parser.add_argument("--reload-interval", + help="Wait this number of seconds between" + " reload retries.", + type=int, default=10) parser.add_argument("--strict-mode", help="If set, backends are only advertised if" " HAPROXY_{n}_ENABLED=true. Strict mode will be" diff --git a/run b/run index 8501f821..eaf0f52a 100755 --- a/run +++ b/run @@ -1,6 +1,24 @@ #!/bin/bash set -euo pipefail +LOG_PREFIX="$(pwd) $0" +log() { + logline="[$LOG_PREFIX] $1\n" + printf "$logline" >&1 +} +log_error() { + logline="[$LOG_PREFIX] $1\n" + printf "$logline" >&1 + printf "$logline" >&2 +} + +if [ -n "${HAPROXY_SYSLOGD-}" ]; then + SYSLOGD_SERVICE="/marathon-lb/service/syslogd" + mkdir -p $SYSLOGD_SERVICE + cp /marathon-lb/syslogd/run "$SYSLOGD_SERVICE/" +fi + +# Custom syslog socket for marathon-lb.py logging SYSLOG_SOCKET=${SYSLOG_SOCKET:-/dev/null} LB_SERVICE="/marathon-lb/service/lb" @@ -10,9 +28,9 @@ HAPROXY_SERVICE="/marathon-lb/service/haproxy" mkdir -p $HAPROXY_SERVICE/env if [ -n "${PORTS-}" ]; then - echo $PORTS > $HAPROXY_SERVICE/env/PORTS + log "$PORTS > $HAPROXY_SERVICE/env/PORTS" else - echo 'Define $PORTS with a comma-separated list of ports to which HAProxy binds' >&2 + log_error "Define $PORTS with a comma-separated list of ports to which HAProxy binds" exit 1 fi @@ -64,7 +82,7 @@ if [ -n "${MESOS_SANDBOX-}" ] && [ -d "$MESOS_SANDBOX/templates" ]; then fi if [ -n "${HAPROXY_SYSCTL_PARAMS-}" ]; then - echo "setting sysctl params to: ${HAPROXY_SYSCTL_PARAMS}" + log "setting sysctl params to: ${HAPROXY_SYSCTL_PARAMS}" if [ -n "${HAPROXY_SYSCTL_NONSTRICT-}" ]; then # ignore errors sysctl -w $HAPROXY_SYSCTL_PARAMS || true @@ -83,12 +101,12 @@ case "$MODE" in ARGS="--sse" ;; *) - echo "Unknown mode $MODE. Synopsis: $0 poll|sse [marathon_lb.py args]" >&2 + log_error "Unknown mode $MODE. Synopsis: $0 poll|sse [marathon_lb.py args]" exit 1 ;; esac -if [ -n "${VAULT_TOKEN-}" ] && [ -n "${VAULT_HOST-}" ] && [ -n "${VAULT_PORT-}" ] && [ -n "${VAULT_PATH-}" ]; then +if [ -n "${VAULT_TOKEN-}" ] && [ -n "${VAULT_HOST-}" ] && [ -n "${VAULT_PORT-}" ] && [ -n "${VAULT_PATH-}" ]; then MARATHON_LB_PASSWORD=$(curl -k -L -H "X-Vault-Token:$VAULT_TOKEN" "$VAULT_URL" -s| python -m json.tool | python -c 'import json,sys;obj=json.load(sys.stdin);print obj["data"]["pass"]') MARATHON_LB_USER=$(curl -k -L -H "X-Vault-Token:$VAULT_TOKEN" "$VAULT_URL" -s | python -m json.tool | python -c 'import json,sys;obj=json.load(sys.stdin);print obj["data"]["user"]') CREDENTIALS="$MARATHON_LB_USER:$MARATHON_LB_PASSWORD" @@ -113,6 +131,10 @@ exec /marathon-lb/marathon_lb.py \ EOF chmod 755 $LB_SERVICE/run +log "Created $LB_SERVICE/run with contents:" +LB_RUN=$(cat $LB_SERVICE/run) +log "$LB_RUN" + if [ "${MODE}" == "poll" ]; then cat > $LB_SERVICE/finish << EOF diff --git a/service/haproxy/run b/service/haproxy/run index 82b1772f..b902f24d 100755 --- a/service/haproxy/run +++ b/service/haproxy/run @@ -2,6 +2,17 @@ exec 2>&1 export PIDFILE="/tmp/haproxy.pid" +LOG_PREFIX="$(pwd) $0" +log() { + logline="[$LOG_PREFIX] $1\n" + printf "$logline" >&1 +} +log_error() { + logline="[$LOG_PREFIX] $1\n" + printf "$logline" >&1 + printf "$logline" >&2 +} + addFirewallRules() { IFS=',' read -ra ADDR <<< "$PORTS" for i in "${ADDR[@]}"; do @@ -17,33 +28,44 @@ removeFirewallRules() { } reload() { - echo "Reloading haproxy" + log "Reloading haproxy" ( flock 200 - # Begin to drop SYN packets with firewall rules + log "Dropping SYN packets with addFirewallRules" addFirewallRules # Wait to settle sleep 0.1 + log "addFirewallRules done" - # Save the current HAProxy state + log "Saving the current HAProxy state" socat /var/run/haproxy/socket - <<< "show servers state" > /var/state/haproxy/global + log "Done saving the current HAProxy state" # Trigger reload LATEST_HAPROXY_PID=$(cat $PIDFILE) - /marathon-lb/haproxy_wrapper.py `which haproxy` -D -p $PIDFILE -f /marathon-lb/haproxy.cfg -sf $LATEST_HAPROXY_PID 200>&- + log "LATEST_HAPROXY_PID: [$LATEST_HAPROXY_PID]" + + WHICH_HAPROXY=$(which haproxy) + + log "/marathon-lb/haproxy_wrapper.py $WHICH_HAPROXY -D -p $PIDFILE -f /marathon-lb/haproxy.cfg -sf $LATEST_HAPROXY_PID 200>&-" + /marathon-lb/haproxy_wrapper.py $WHICH_HAPROXY -D -p $PIDFILE -f /marathon-lb/haproxy.cfg -sf $LATEST_HAPROXY_PID 200>&- local exit_code=$? + log "exit code: $exit_code" if [ $exit_code -ne 0 ]; then - echo "HAProxy reload failed" 1>&2 + log_error "HAProxy reload failed" fi - # Remove the firewall rules + log "Removing firewall rules with removeFirewallRules" removeFirewallRules + log "removeFirewallRules done" # Need to wait 1s to prevent TCP SYN exponential backoff sleep 1 + + log "Reload finished" ) 200>/var/run/haproxy/lock } diff --git a/syslogd/run b/syslogd/run new file mode 100755 index 00000000..396960eb --- /dev/null +++ b/syslogd/run @@ -0,0 +1,9 @@ +#!/bin/bash +exec 2>&1 + +# A hack to redirect syslog to stdout. +# This is necessary because haproxy only logs to syslog. +ln -sf /proc/$$/fd/1 /var/log/container_stdout + +# Start syslog in foreground +/usr/sbin/syslogd -f "/marathon-lb/syslogd/syslog.conf" -n diff --git a/syslogd/syslog.conf b/syslogd/syslog.conf new file mode 100644 index 00000000..79d7bf98 --- /dev/null +++ b/syslogd/syslog.conf @@ -0,0 +1 @@ +*.*;auth,authpriv.none -/var/log/container_stdout