Skip to content

Commit

Permalink
fix(swingset): workaround XS garbage collection bugs
Browse files Browse the repository at this point in the history
DO NOT MERGE INTO MAIN BRANCH
Workaround for #6588

During transcript replay, handle divergent syscalls which retrieve
stable Virtual Collection metadata. These can happen at different times
than recorded during the transcript because of some bugs in the XS
engine making GC sensitive to reload from heap snapshots.
  • Loading branch information
mhofman committed Dec 13, 2022
1 parent 2c812d2 commit b2b336f
Show file tree
Hide file tree
Showing 3 changed files with 121 additions and 10 deletions.
8 changes: 5 additions & 3 deletions packages/SwingSet/src/kernel/vat-loader/manager-helper.js
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ import { makeTranscriptManager } from './transcript.js';
* @param {KernelSlog} kernelSlog
* @param {(vso: VatSyscallObject) => VatSyscallResult} vatSyscallHandler
* @param {boolean} workerCanBlock
* @param {(vatID: any, originalSyscall: any, newSyscall: any) => Error | undefined} [compareSyscalls]
* @param {(vatID: any, originalSyscall: any, newSyscall: any) => import('./transcript.js').CompareSyscallsResult} [compareSyscalls]
* @param {boolean} [useTranscript]
* @returns {ManagerKit}
*/
Expand Down Expand Up @@ -247,7 +247,9 @@ function makeManagerKit(
// but if the puppy deviates one inch from previous twitches, explode
kernelSlog.syscall(vatID, undefined, vso);
const vres = transcriptManager.simulateSyscall(vso);
return vres;
if (vres) {
return vres;
}
}

const vres = vatSyscallHandler(vso);
Expand All @@ -256,7 +258,7 @@ function makeManagerKit(
if (successFlag === 'ok' && data && !workerCanBlock) {
console.log(`warning: syscall returns data, but worker cannot get it`);
}
if (transcriptManager) {
if (transcriptManager && !transcriptManager.inReplay()) {
transcriptManager.addSyscall(vso, vres);
}
return vres;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import { assert, details as X, q } from '@agoric/assert';
import { ExitCode } from '@agoric/xsnap/api.js';
import { makeManagerKit } from './manager-helper.js';
import { requireIdenticalExceptStableVCSyscalls } from './transcript.js';

import {
insistVatSyscallObject,
Expand Down Expand Up @@ -55,7 +56,7 @@ export function makeXsSubprocessFactory({
const {
name: vatName,
metered,
compareSyscalls,
compareSyscalls = requireIdenticalExceptStableVCSyscalls,
useTranscript,
sourcedConsole,
} = managerOptions;
Expand Down
120 changes: 114 additions & 6 deletions packages/SwingSet/src/kernel/vat-loader/transcript.js
Original file line number Diff line number Diff line change
@@ -1,5 +1,21 @@
import djson from '../../lib/djson.js';

// Indicate that a syscall is missing from the transcript but is safe to
// perform during replay
const missingSyscall = Symbol('missing transcript syscall');

// Indicate that a syscall is recorded in the transcript but can be safely
// ignored / skipped during replay.
const extraSyscall = Symbol('extra transcript syscall');

/** @typedef {typeof missingSyscall | typeof extraSyscall | Error | undefined} CompareSyscallsResult */

/**
* @param {any} vatID
* @param {object} originalSyscall
* @param {object} newSyscall
* @returns {CompareSyscallsResult}
*/
export function requireIdentical(vatID, originalSyscall, newSyscall) {
if (djson.stringify(originalSyscall) !== djson.stringify(newSyscall)) {
console.error(`anachrophobia strikes vat ${vatID}`);
Expand All @@ -10,6 +26,67 @@ export function requireIdentical(vatID, originalSyscall, newSyscall) {
return undefined;
}

const vcSyscallRE = /^vc\.\d+\.\|(?:schemata|label)$/;

/**
* Liveslots currently has a deficiency which results in [virtual collections
* being sensitive to organic GC](https://github.com/Agoric/agoric-sdk/issues/6360).
*
* XS also has multiple issues causing memory to not be collected identically
* depending on the load from snapshot schedule. This results in organic GC
* triggering at different times based on which snapshot the worker was created
* from.
*
* Combined together, these bugs cause syscalls being emitted by liveslots at
* different times whether the execution occurred in a worker created from a
* more or less recent snapshot. With a strict check during transcript replay,
* this can cause [anachrophobia errors when restarting SwingSet](https://github.com/Agoric/agoric-sdk/issues/6588),
* or potentially when reloading a vat that was paged out.
*
* Thankfully the syscalls issued by liveslots for these virtual collection
* objects are both easily identifiable and stable over time. That means their
* response is always the same regardless when the syscall is made.
*
* This method enhances the basic identical check and returns sentinel values
* (unique symbols), indicating whether a syscall during replay requires to
* skip an entry from the transcript or perform the actual syscall because the
* entry is missing in the transcript. This works in conjunction with
* `simulateSyscall` which then performs the appropriate action.
*
* @param {any} vatID
* @param {object} originalSyscall
* @param {object} newSyscall
* @returns {CompareSyscallsResult}
*/
export function requireIdenticalExceptStableVCSyscalls(
vatID,
originalSyscall,
newSyscall,
) {
const error = requireIdentical(vatID, originalSyscall, newSyscall);

if (error) {
if (
originalSyscall[0] === 'vatstoreGet' &&
vcSyscallRE.test(originalSyscall[1])
) {
// The syscall recorded in the transcript is for a virtual collection
// metadata get. It can be safely skipped.
console.warn(` mitigation: ignoring extra vc syscall`);
return extraSyscall;
}

if (newSyscall[0] === 'vatstoreGet' && vcSyscallRE.test(newSyscall[1])) {
// The syscall performed by the vat is for a virtual collection metadata
// get. It can be safely performed during replay.
console.warn(` mitigation: falling through to syscall handler`);
return missingSyscall;
}
}

return error;
}

export function makeTranscriptManager(
vatKeeper,
vatID,
Expand Down Expand Up @@ -59,13 +136,44 @@ export function makeTranscriptManager(
let replayError;

function simulateSyscall(newSyscall) {
const s = playbackSyscalls.shift();
const newReplayError = compareSyscalls(vatID, s.d, newSyscall);
if (newReplayError) {
replayError = newReplayError;
throw replayError;
while (playbackSyscalls.length) {
const compareError = compareSyscalls(
vatID,
playbackSyscalls[0].d,
newSyscall,
);

if (compareError === missingSyscall) {
// return `undefined` to indicate that this syscall cannot be simulated
// and needs to be performed (virtual collection metadata get)
return undefined;
}

const s = playbackSyscalls.shift();

if (!compareError) {
return s.response;
} else if (compareError !== extraSyscall) {
replayError = compareError;
break;
}

// Check the next transcript entry, skipping any extra syscalls recorded
// in the transcript (virtual collection metadata get)
}

if (!replayError) {
// Error if the vat performs a syscall for which we don't have a
// corresponding entry in the transcript.

// Note that if a vat performed an "allowed" vc metadata get syscall after
// we reach the end of the transcript, we would error instead of
// falling through and performing the syscall. However liveslots does not
// perform vc metadata get syscalls unless it needs to provide an entry
// to the program, which always results in subsequent syscalls.
replayError = new Error(`historical inaccuracy in replay of ${vatID}`);
}
return s.response;
throw replayError;
}

function finishReplayDelivery(dnum) {
Expand Down

0 comments on commit b2b336f

Please sign in to comment.