Skip to content

Commit

Permalink
NAS-128704 / 13.3 / Fix truecommand issues on HA in CORE 13 (#13756)
Browse files Browse the repository at this point in the history
  • Loading branch information
sonicaj authored May 20, 2024
1 parent ad0feb7 commit 148e6e6
Show file tree
Hide file tree
Showing 4 changed files with 20 additions and 4 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -1845,6 +1845,7 @@ async def service_remote(middleware, service, verb, options):
'smartd',
'system_datasets',
'nfs',
'truecommand',
) or await middleware.call('failover.status') != 'MASTER':
return
# Nginx should never be stopped on standby node
Expand Down
7 changes: 6 additions & 1 deletion src/middlewared/middlewared/etc_files/local/nginx/nginx.conf
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,12 @@
ip6_list = [f'[{ip}]' for ip in ip6_list]
wg_config = middleware.call_sync('datastore.config', 'system.truecommand')
if middleware.call_sync('truecommand.connected')['connected'] and wg_config['wg_address']:
if middleware.call_sync('failover.is_single_master_node') and wg_config['api_key_state'] == 'CONNECTED' and wg_config['wg_address']:
# We use the API key state to determine whether we are connected because, when the nginx config is
# reloaded, the health of the wireguard connection has not necessarily been established yet, and
# another (otherwise redundant) reload of the nginx config would then be required.
# For example, when failover takes place the system knows it is master now, but wireguard health has not
# been established at this point, so we would miss out on adding the wireguard address to the listen directive.
ip4_list.append(ipaddress.ip_network(wg_config['wg_address'], False).network_address)

ip_list = ip4_list + ip6_list
Expand Down
8 changes: 7 additions & 1 deletion src/middlewared/middlewared/plugins/truecommand/portal.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,11 +44,17 @@ async def poll_api_for_status(self, job):
)
if status.get('tc_state') == 'running':
await self.middleware.call('truecommand.dismiss_alerts')
await self.middleware.call('truecommand.start_truecommand_service')
else:
await self.middleware.call('truecommand.dismiss_alerts', True)
await self.middleware.call('alert.oneshot_create', 'TruecommandContainerHealth', None)

await self.middleware.call('truecommand.start_truecommand_service')
asyncio.get_event_loop().call_later(
self.POLLING_GAP_MINUTES * 60,
lambda: self.middleware.create_task(
self.middleware.call('truecommand.start_truecommand_service')
),
)

break

Expand Down
8 changes: 6 additions & 2 deletions src/middlewared/middlewared/plugins/truecommand/wireguard.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import asyncio
import re
import subprocess
import time
Expand Down Expand Up @@ -128,8 +129,11 @@ async def start_truecommand_service(self):
config[k] for k in ('wg_private_key', 'remote_address', 'endpoint', 'tc_public_key', 'wg_address')
):
await self.middleware.call('service.start', 'truecommand')
await self.middleware.call('service.reload', 'http')
self.middleware.create_task(self.middleware.call('truecommand.health_check'))
await self.middleware.call('service.reload', 'http', {'ha_propagate': False})
asyncio.get_event_loop().call_later(
30, # 30 seconds is enough time to initiate a health check to see if the connection is alive
lambda: self.middleware.create_task(self.middleware.call('truecommand.health_check')),
)
else:
# start polling iX Portal to see what's up and why we don't have these values set
# This can happen in instances where system was polling and then was rebooted,
Expand Down

0 comments on commit 148e6e6

Please sign in to comment.