From 9050681fbebebf97ce6a36a82b9594b95bbcf709 Mon Sep 17 00:00:00 2001 From: Mathieu Hofman <86499+mhofman@users.noreply.github.com> Date: Tue, 13 Dec 2022 17:44:18 -0800 Subject: [PATCH] fix(swingset): workaround XS garbage collection bugs (#6664) Workaround for https://github.com/Agoric/agoric-sdk/issues/6588 During transcript replay, handle divergent syscalls which retrieve stable Virtual Collection metadata. These can happen at different times than recorded during the transcript because of some bugs in the XS engine making GC sensitive to reload from heap snapshots. cherry picked from commit 12e97f1e7d384c11c76b61de67ee0de30de00fe6 Merging into master after further consideration --- .../src/kernel/vat-loader/manager-helper.js | 8 +- .../vat-loader/manager-subprocess-xsnap.js | 3 +- .../src/kernel/vat-loader/transcript.js | 120 +++++++++++++++++- 3 files changed, 121 insertions(+), 10 deletions(-) diff --git a/packages/SwingSet/src/kernel/vat-loader/manager-helper.js b/packages/SwingSet/src/kernel/vat-loader/manager-helper.js index 82b811eed899..c8314d81323d 100644 --- a/packages/SwingSet/src/kernel/vat-loader/manager-helper.js +++ b/packages/SwingSet/src/kernel/vat-loader/manager-helper.js @@ -101,7 +101,7 @@ import { makeTranscriptManager } from './transcript.js'; * @param {KernelSlog} kernelSlog * @param {(vso: VatSyscallObject) => VatSyscallResult} vatSyscallHandler * @param {boolean} workerCanBlock - * @param {(vatID: any, originalSyscall: any, newSyscall: any) => Error | undefined} [compareSyscalls] + * @param {(vatID: any, originalSyscall: any, newSyscall: any) => import('./transcript.js').CompareSyscallsResult} [compareSyscalls] * @param {boolean} [useTranscript] * @returns {ManagerKit} */ @@ -241,7 +241,9 @@ function makeManagerKit( // but if the puppy deviates one inch from previous twitches, explode kernelSlog.syscall(vatID, undefined, vso); const vres = transcriptManager.simulateSyscall(vso); - return vres; + if (vres) { + return vres; + } } const vres = vatSyscallHandler(vso); @@ -250,7 +252,7 @@ function makeManagerKit( if (successFlag === 'ok' && data && !workerCanBlock) { console.log(`warning: syscall returns data, but worker cannot get it`); } - if (transcriptManager) { + if (transcriptManager && !transcriptManager.inReplay()) { transcriptManager.addSyscall(vso, vres); } return vres; diff --git a/packages/SwingSet/src/kernel/vat-loader/manager-subprocess-xsnap.js b/packages/SwingSet/src/kernel/vat-loader/manager-subprocess-xsnap.js index b0f20aaa14f3..824112fc0f2e 100644 --- a/packages/SwingSet/src/kernel/vat-loader/manager-subprocess-xsnap.js +++ b/packages/SwingSet/src/kernel/vat-loader/manager-subprocess-xsnap.js @@ -1,6 +1,7 @@ import { assert, Fail, q } from '@agoric/assert'; import { ExitCode } from '@agoric/xsnap/api.js'; import { makeManagerKit } from './manager-helper.js'; +import { requireIdenticalExceptStableVCSyscalls } from './transcript.js'; import { insistVatSyscallObject, @@ -54,7 +55,7 @@ export function makeXsSubprocessFactory({ const { name: vatName, metered, - compareSyscalls, + compareSyscalls = requireIdenticalExceptStableVCSyscalls, useTranscript, sourcedConsole, } = managerOptions; diff --git a/packages/SwingSet/src/kernel/vat-loader/transcript.js b/packages/SwingSet/src/kernel/vat-loader/transcript.js index e90100fea3c4..de5f0c892c58 100644 --- a/packages/SwingSet/src/kernel/vat-loader/transcript.js +++ b/packages/SwingSet/src/kernel/vat-loader/transcript.js @@ -1,5 +1,21 @@ import djson from '../../lib/djson.js'; +// Indicate that a syscall is missing from the transcript but is safe to +// perform during replay +const missingSyscall = Symbol('missing transcript syscall'); + +// Indicate that a syscall is recorded in the transcript but can be safely +// ignored / skipped during replay. +const extraSyscall = Symbol('extra transcript syscall'); + +/** @typedef {typeof missingSyscall | typeof extraSyscall | Error | undefined} CompareSyscallsResult */ + +/** + * @param {any} vatID + * @param {object} originalSyscall + * @param {object} newSyscall + * @returns {CompareSyscallsResult} + */ export function requireIdentical(vatID, originalSyscall, newSyscall) { if (djson.stringify(originalSyscall) !== djson.stringify(newSyscall)) { console.error(`anachrophobia strikes vat ${vatID}`); @@ -10,6 +26,67 @@ export function requireIdentical(vatID, originalSyscall, newSyscall) { return undefined; } +const vcSyscallRE = /^vc\.\d+\.\|(?:schemata|label)$/; + +/** + * Liveslots currently has a deficiency which results in [virtual collections + * being sensitive to organic GC](https://github.com/Agoric/agoric-sdk/issues/6360). + * + * XS also has multiple issues causing memory to not be collected identically + * depending on the load from snapshot schedule. This results in organic GC + * triggering at different times based on which snapshot the worker was created + * from. + * + * Combined together, these bugs cause syscalls being emitted by liveslots at + * different times whether the execution occurred in a worker created from a + * more or less recent snapshot. With a strict check during transcript replay, + * this can cause [anachrophobia errors when restarting SwingSet](https://github.com/Agoric/agoric-sdk/issues/6588), + * or potentially when reloading a vat that was paged out. + * + * Thankfully the syscalls issued by liveslots for these virtual collection + * objects are both easily identifiable and stable over time. That means their + * response is always the same regardless when the syscall is made. + * + * This method enhances the basic identical check and returns sentinel values + * (unique symbols), indicating whether a syscall during replay requires to + * skip an entry from the transcript or perform the actual syscall because the + * entry is missing in the transcript. This works in conjunction with + * `simulateSyscall` which then performs the appropriate action. + * + * @param {any} vatID + * @param {object} originalSyscall + * @param {object} newSyscall + * @returns {CompareSyscallsResult} + */ +export function requireIdenticalExceptStableVCSyscalls( + vatID, + originalSyscall, + newSyscall, +) { + const error = requireIdentical(vatID, originalSyscall, newSyscall); + + if (error) { + if ( + originalSyscall[0] === 'vatstoreGet' && + vcSyscallRE.test(originalSyscall[1]) + ) { + // The syscall recorded in the transcript is for a virtual collection + // metadata get. It can be safely skipped. + console.warn(` mitigation: ignoring extra vc syscall`); + return extraSyscall; + } + + if (newSyscall[0] === 'vatstoreGet' && vcSyscallRE.test(newSyscall[1])) { + // The syscall performed by the vat is for a virtual collection metadata + // get. It can be safely performed during replay. + console.warn(` mitigation: falling through to syscall handler`); + return missingSyscall; + } + } + + return error; +} + export function makeTranscriptManager( vatKeeper, vatID, @@ -59,13 +136,44 @@ export function makeTranscriptManager( let replayError; function simulateSyscall(newSyscall) { - const s = playbackSyscalls.shift(); - const newReplayError = compareSyscalls(vatID, s.d, newSyscall); - if (newReplayError) { - replayError = newReplayError; - throw replayError; + while (playbackSyscalls.length) { + const compareError = compareSyscalls( + vatID, + playbackSyscalls[0].d, + newSyscall, + ); + + if (compareError === missingSyscall) { + // return `undefined` to indicate that this syscall cannot be simulated + // and needs to be performed (virtual collection metadata get) + return undefined; + } + + const s = playbackSyscalls.shift(); + + if (!compareError) { + return s.response; + } else if (compareError !== extraSyscall) { + replayError = compareError; + break; + } + + // Check the next transcript entry, skipping any extra syscalls recorded + // in the transcript (virtual collection metadata get) + } + + if (!replayError) { + // Error if the vat performs a syscall for which we don't have a + // corresponding entry in the transcript. + + // Note that if a vat performed an "allowed" vc metadata get syscall after + // we reach the end of the transcript, we would error instead of + // falling through and performing the syscall. However liveslots does not + // perform vc metadata get syscalls unless it needs to provide an entry + // to the program, which always results in subsequent syscalls. + replayError = new Error(`historical inaccuracy in replay of ${vatID}`); } - return s.response; + throw replayError; } function finishReplayDelivery(dnum) {