Skip to content

Commit

Permalink
Recover rsyslog from 4xx error
Browse files Browse the repository at this point in the history
Due to ansible#7560

The 'omhttp' module for rsyslog completely stops forwarding messages to the external log aggregator after receiving a 4xx error from that aggregator.

This PR is a "workaround" for this problem: it restarts rsyslogd after detecting that rsyslog received a 4xx error.
  • Loading branch information
TheRealHaoLiu committed Dec 11, 2023
1 parent 5e48bf0 commit 85e9e02
Show file tree
Hide file tree
Showing 4 changed files with 21 additions and 22 deletions.
2 changes: 1 addition & 1 deletion MANIFEST.in
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ recursive-exclude awx/settings local_settings.py*
include tools/scripts/request_tower_configuration.sh
include tools/scripts/request_tower_configuration.ps1
include tools/scripts/automation-controller-service
include tools/scripts/failure-event-handler
include tools/scripts/rsyslog-4xx-recovery
include tools/scripts/awx-python
include awx/playbooks/library/mkfifo.py
include tools/sosreport/*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ pidfile = /var/run/supervisor/supervisor.rsyslog.pid
[program:awx-rsyslogd]
command = rsyslogd -n -i /var/run/awx-rsyslog/rsyslog.pid -f /var/lib/awx/rsyslog/rsyslog.conf
autorestart = true
startsecs = 30
startsecs = 0
stopasgroup=true
killasgroup=true
stdout_logfile=/dev/stdout
Expand Down Expand Up @@ -59,6 +59,15 @@ stdout_logfile_maxbytes=0
stderr_logfile=/dev/stderr
stderr_logfile_maxbytes=0

[eventlistener:rsyslog-4xx-recovery]
command=/awx_devel/tools/scripts/rsyslog-4xx-recovery
buffer_size = 100
events=PROCESS_LOG_STDERR
priority=0
autorestart=true
stdout_events_enabled = true
stderr_events_enabled = true

[unix_http_server]
file=/var/run/supervisor/supervisor.rsyslog.sock

Expand Down
11 changes: 10 additions & 1 deletion tools/docker-compose/supervisor.conf
Original file line number Diff line number Diff line change
Expand Up @@ -86,9 +86,9 @@ stderr_events_enabled = true
[program:awx-rsyslogd]
command = rsyslogd -n -i /var/run/awx-rsyslog/rsyslog.pid -f /var/lib/awx/rsyslog/rsyslog.conf
autorestart = true
startsecs=0
stopasgroup=true
killasgroup=true
redirect_stderr=true
stdout_events_enabled = true
stderr_events_enabled = true

Expand Down Expand Up @@ -119,6 +119,15 @@ events=PROCESS_STATE_FATAL
autorestart = true
stderr_logfile=/dev/stdout

[eventlistener:rsyslog-4xx-recovery]
command=/awx_devel/tools/scripts/rsyslog-4xx-recovery
buffer_size = 100
events=PROCESS_LOG_STDERR
priority=0
autorestart=true
stdout_events_enabled = true
stderr_events_enabled = true

[unix_http_server]
file=/var/run/supervisor/supervisor.sock

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,10 @@ def write_stdout(s):
sys.stdout.write(s)
sys.stdout.flush()


def write_stderr(s):
sys.stderr.write(s)
sys.stderr.flush()


def main():
while 1:
write_stdout("READY\n")
Expand All @@ -31,23 +29,6 @@ def main():
except ValueError as e:
write_stderr(str(e))

# now decide what to do based on eventnames
if headers["eventname"] == "PROCESS_STATE_FATAL":
headers.update(
dict(
[x.split(":") for x in sys.stdin.read(int(headers["len"])).split()]
)
)

try:
# incoming event that produced PROCESS_STATE_FATAL will have a PID. SIGTERM it!
write_stderr(
f"{datetime.datetime.now(timezone.utc)} - sending SIGTERM to proc={headers} with data={headers}\n"
)
os.kill(headers["pid"], signal.SIGTERM)
except Exception as e:
write_stderr(str(e))

# awx-rsyslog PROCESS_LOG_STDERR handler
if headers["eventname"] == "PROCESS_LOG_STDERR":
# pertinent data about the process that produced PROCESS_LOG_STDERR is in the first line of the data payload, so let's extract it
Expand Down

0 comments on commit 85e9e02

Please sign in to comment.