From e218f53e86618d2a004603d2d9187bad64ba9580 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ramiro=20Gonz=C3=A1lez=20Maciel?=
Date: Wed, 10 Aug 2022 09:23:21 -0300
Subject: [PATCH] Add handover config for api

---
 apps/server/lib/bootstrap.ts  |  18 +++--
 apps/server/lib/index.ts      | 133 +++++++++++++++++++++++++++++-----
 apps/server/package-lock.json |  26 ++++++-
 apps/server/package.json      |   3 +-
 docker-compose.yml            |   4 ++
 5 files changed, 158 insertions(+), 26 deletions(-)

diff --git a/apps/server/lib/bootstrap.ts b/apps/server/lib/bootstrap.ts
index 1864fe9da56..5f97d144e8f 100644
--- a/apps/server/lib/bootstrap.ts
+++ b/apps/server/lib/bootstrap.ts
@@ -81,9 +81,19 @@ export const bootstrap = async (logContext: LogContext, options: any) => {
 
 	const closeWorker = async () => {
 		await worker.stop();
-		await kernel.disconnect(logContext);
+		try {
+			await kernel.disconnect(logContext);
+		} catch (error) {
+			logger.warn(logContext, `Error when disconnecting the kernel`, { error });
+		}
 		if (cache) {
-			await cache.disconnect();
+			try {
+				await cache.disconnect();
+			} catch (error) {
+				logger.warn(logContext, `Error when disconnecting the cache`, {
+					error,
+				});
+			}
 		}
 	};
 
@@ -274,10 +284,6 @@ export const bootstrap = async (logContext: LogContext, options: any) => {
 			metricsServer.close();
 			await webServer.stop();
 			await closeWorker();
-			await kernel.disconnect(logContext);
-			if (cache) {
-				await cache.disconnect();
-			}
 		},
 	};
 };
diff --git a/apps/server/lib/index.ts b/apps/server/lib/index.ts
index 9b04ea331cd..37169a03894 100644
--- a/apps/server/lib/index.ts
+++ b/apps/server/lib/index.ts
@@ -1,11 +1,12 @@
 import { defaultEnvironment as environment } from '@balena/jellyfish-environment';
 import { getLogger } from '@balena/jellyfish-logger';
-import { v4 as uuidv4 } from 'uuid';
 import { bootstrap } from './bootstrap';
 import { getPlugins } from './plugins';
 import cluster from 'node:cluster';
-import { cpus } from 'node:os';
+import { cpus, networkInterfaces } from 'node:os';
 import process from 'node:process';
+import { HandoverPeer } from 'handover-lib';
+import _ from 'lodash';
 
 // Avoid including package.json in the build output!
 // tslint:disable-next-line: no-var-requires
@@ -20,8 +21,17 @@ if (MAX_WORKERS) {
 	numCPUs = Math.min(numCPUs, parseInt(MAX_WORKERS, 10));
 }
 
+const serverId = Math.round(Math.random() * 1000);
+let hostId = environment.pod.name; // on docker-compose this will be 'localhost'
+if (hostId === 'localhost') {
+	const localAddresses = networkInterfaces()?.eth0;
+	if (localAddresses && localAddresses[0] && localAddresses[0].address) {
+		hostId = localAddresses[0].address;
+	}
+}
+
 const DEFAULT_CONTEXT = {
-	id: `SERVER-ERROR-${environment.pod.name}-${packageJSON.version}`,
+	id: `SERVER-ERROR-${process.pid}-${environment.pod.name}-${packageJSON.version}`,
 };
 
 const onError = (error, message = 'Server error', ctx = DEFAULT_CONTEXT) => {
@@ -42,35 +52,114 @@ process.on('unhandledRejection', (error) => {
 });
 
 const startDate = new Date();
-
 const run = async () => {
 	if (cluster.isPrimary) {
+		const context = {
+			id: `SERVER-ID-${'[' + serverId + ']'}-PID-${process.pid}-${hostId}-${
+				packageJSON.version
+			}-primary`,
+		};
+		const handoverPeer = new HandoverPeer(startDate, context);
 		logger.info(
-			DEFAULT_CONTEXT,
+			context,
 			`Primary worker started, spawning ${numCPUs} workers`,
 			{
 				time: startDate.getTime(),
 			},
 		);
 
+		let activeWorkers = 0;
+		const maxWorkers = numCPUs;
+		// Wait until all workers are ready
+		const workersNeeded = maxWorkers;
 		// Fork workers.
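+		// Note: unexpected worker deaths are re-forked by the 'exit' handler
+		// below; deliberate handover shutdowns (exitedAfterDisconnect) are not.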
-		for (let i = 0; i < numCPUs; i++) {
-			const startWorker = () => {
-				cluster.fork().on('exit', startWorker);
-			};
-			startWorker();
+		for (let i = 0; i < maxWorkers; i++) {
+			cluster.fork();
 		}
 
 		cluster.on('exit', (worker, code, signal) => {
-			logger.warn(DEFAULT_CONTEXT, `worker ${worker.process.pid} died`, {
-				code,
-				signal,
-			});
+			if (worker.exitedAfterDisconnect === true) {
+				logger.info(
+					context,
+					`worker ${worker?.process?.pid} exited (${signal || code}).`,
+				);
+			} else {
+				logger.info(
+					context,
+					`worker ${worker?.process?.pid} died (${
+						signal || code
+					}). Forking again`,
+				);
+				cluster.fork();
+			}
 		});
+
+		cluster.on('online', (worker) => {
+			logger.debug(
+				context,
+				`Worker ${worker.id} responded after it was forked`,
+			);
+		});
+
+		logger.info(
+			context,
+			`PID: ${process.pid}. Waiting for ${workersNeeded} workers to start`,
+		);
+
+		cluster.on('message', (worker, message) => {
+			if (message?.msg === 'worker-started') {
+				activeWorkers++;
+				logger.info(
+					context,
+					`Worker ${worker.id} worker-started. activeWorkers ${activeWorkers}`,
+				);
+				if (activeWorkers === workersNeeded) {
+					logger.info(context, `All ${workersNeeded} needed workers started.`);
+					handoverPeer.startBroadcasting();
+				}
+			} else if (message?.msg === 'DONE') {
+				// Ignored; this is handled by the shutdown code
+			} else {
+				logger.warn(context, `Unknown message received from worker`, message);
+			}
+		});
+
+		// handoverPeer calls this function when we're shutting down, and `await`s its return as the signal that the handover is done.
+		// Reminder: the handover (the new instance taking over the hostname) is not performed until the old container is killed, so we have to balance a clean shutdown against almost-zero downtime.
+		// Here we go with "almost-zero downtime": we forward the message to all workers and return without waiting for their responses.
+		const shutdownCallback = async () => {
+			let exitedWorkers = 0;
+			const liveWorkers = Object.values(cluster.workers || {});
+			for (const worker of liveWorkers) {
+				worker?.on('message', (message) => {
+					if (message?.msg === 'DONE') {
+						exitedWorkers++;
+					}
+				});
+				worker?.send('SHUTDOWN');
+			}
+			// Keep logging while we're alive
+			setImmediate(async () => {
+				while (exitedWorkers < liveWorkers.length) {
+					logger.info(
+						context,
+						`exitedWorkers: ${exitedWorkers} of ${liveWorkers.length}`,
+					);
+					await new Promise((r) => setTimeout(r, 100));
+				}
+				logger.info(
+					context,
+					`All workers exited. ExitedWorkers: ${exitedWorkers} of ${liveWorkers.length}`,
+				);
+			});
+		};
+
+		handoverPeer.startListening(shutdownCallback);
 	} else {
-		const id = uuidv4();
 		const context = {
-			id: `SERVER-${packageJSON.version}-${environment.pod.name}-worker#${cluster.worker?.id}-${id}`,
+			id: `SERVER-ID-${'[' + serverId + ']'}-worker#${cluster.worker?.id}`,
 		};
 
 		logger.info(context, `Starting server with worker ${cluster.worker?.id}`, {
@@ -95,6 +184,16 @@ const run = async () => {
 
 		process.send!({ msg: 'worker-started' });
 
+		cluster.worker?.on('message', async (msg) => {
+			if (msg === 'SHUTDOWN') {
+				await server.close();
+				logger.info(context, `${cluster.worker?.id}: Server stopped`);
+				process.send!({ msg: 'DONE' });
+				// bye
+				setTimeout(() => cluster.worker?.kill(), 100);
+			}
+		});
+
 		if (timeToStart > 10000) {
 			logger.warn(context, 'Slow server startup time', {
 				time: timeToStart,
diff --git a/apps/server/package-lock.json b/apps/server/package-lock.json
index ffc66300d5f..a3eb6eccdf2 100644
--- a/apps/server/package-lock.json
+++ b/apps/server/package-lock.json
@@ -1,12 +1,12 @@
 {
   "name": "jellyfish-server",
-  "version": "0.0.1",
+  "version": "0.0.2",
   "lockfileVersion": 2,
   "requires": true,
   "packages": {
     "": {
       "name": "jellyfish-server",
-      "version": "0.0.1",
+      "version": "0.0.2",
       "license": "AGPL-3.0",
       "dependencies": {
         "@balena/jellyfish-assert": "^1.2.50",
@@ -31,6 +31,7 @@
         "errio": "^1.2.2",
         "express": "^4.18.1",
         "express-basic-auth": "^1.2.1",
+        "handover-lib": "^0.0.10",
         "is-uuid": "^1.0.2",
         "json-e": "^4.4.3",
         "jsonwebtoken": "^8.5.1",
@@ -5672,6 +5673,18 @@
         "node": ">=10"
       }
     },
+    "node_modules/handover-lib": {
+      "version": "0.0.10",
+      "resolved": "https://registry.npmjs.org/handover-lib/-/handover-lib-0.0.10.tgz",
+      "integrity": "sha512-2quUpCG9NUY1Rz2CpINsdgWFxAliMFLzZuWqcy58ZwQ/504D119g+3mx8KyregeiKAF8f/qYryA5yjfPtCmTmQ==",
+      "dependencies": {
+        "@balena/jellyfish-logger": "^5.1.7",
+        "uuid": "^8.3.2"
+      },
+      "engines": {
+        "node": ">=16.0.0"
+      }
+    },
     "node_modules/har-schema": {
       "version": "2.0.0",
       "license": "ISC",
@@ -16130,6 +16143,15 @@
         }
       }
     },
+    "handover-lib": {
+      "version": "0.0.10",
+      "resolved": "https://registry.npmjs.org/handover-lib/-/handover-lib-0.0.10.tgz",
+      "integrity": "sha512-2quUpCG9NUY1Rz2CpINsdgWFxAliMFLzZuWqcy58ZwQ/504D119g+3mx8KyregeiKAF8f/qYryA5yjfPtCmTmQ==",
+      "requires": {
+        "@balena/jellyfish-logger": "^5.1.7",
+        "uuid": "^8.3.2"
+      }
+    },
     "har-schema": {
       "version": "2.0.0"
     },
diff --git a/apps/server/package.json b/apps/server/package.json
index 19ac85d2b40..1c49d53778c 100644
--- a/apps/server/package.json
+++ b/apps/server/package.json
@@ -1,6 +1,6 @@
 {
   "name": "jellyfish-server",
-  "version": "0.0.1",
+  "version": "0.0.4",
   "codename": "snappy",
   "private": true,
   "description": "The Jellyfish Server App",
@@ -44,6 +44,7 @@
     "errio": "^1.2.2",
     "express": "^4.18.1",
     "express-basic-auth": "^1.2.1",
+    "handover-lib": "^0.0.10",
     "is-uuid": "^1.0.2",
     "json-e": "^4.4.3",
     "jsonwebtoken": "^8.5.1",
diff --git a/docker-compose.yml b/docker-compose.yml
index 3c84c8d0d6c..a30caa82621 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -85,6 +85,10 @@ services:
     labels:
       io.balena.features.supervisor-api: 1
       io.balena.features.balena-api: 1
+      io.balena.update.strategy: hand-over
+      # Note that this timer starts to run after the new instance is started, so it needs to
+      # allow enough time for the new service to be ready to handle requests.
+      io.balena.update.handover-timeout: 120000
     volumes:
       - resin-data:/balena
       - certs-data:/certs