Skip to content
This repository has been archived by the owner on Jun 6, 2024. It is now read-only.

make DBController tolerant to wrong framework request #5284

Merged
merged 6 commits into from
Feb 8, 2021
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 57 additions & 0 deletions src/database-controller/src/common/framework.js
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,45 @@ const mockFrameworkStatus = () => {
};
};

const mockFailedFrameworkStatus = () => {
// Mock a failed status for framework.
// We only use it for frameworks that never start.
const dateStr = (new Date()).toISOString();
return {
yqwang-ms marked this conversation as resolved.
Show resolved Hide resolved
yqwang-ms marked this conversation as resolved.
Show resolved Hide resolved
state: 'Completed',
startTime: dateStr,
runTime: dateStr,
transitionTime: dateStr,
completionTime: dateStr,
attemptStatus: {
completionStatus: {
id: 0,
yqwang-ms marked this conversation as resolved.
Show resolved Hide resolved
code: -1007,
diagnostics: 'Framework schema is submitted to database, but cannot be synchronized to API server due to unrecoverable error.',
phase: 'UnrecoverableSchemaSynchronizeError',
trigger: {
message: "Database controller cannot synchronize framework schema to API server.",
taskIndex: null,
taskRoleName: null,
},
yqwang-ms marked this conversation as resolved.
Show resolved Hide resolved
yqwang-ms marked this conversation as resolved.
Show resolved Hide resolved
type: {
attributes: ["Permanent"],
name: "Failed",
},
startTime: dateStr,
runTime: dateStr,
completionTime: dateStr,
},
taskRoleStatuses: [],
},
retryPolicyStatus: {
retryDelaySec: null,
totalRetriedCount: 0,
accountableRetriedCount: 0,
},
};
};

const convertFrameworkState = (state, exitCode) => {
switch (state) {
case 'AttemptCreationPending':
Expand Down Expand Up @@ -245,6 +284,10 @@ class Snapshot {
return this._snapshot.status.state;
}

getTotalRetriedCount() {
return _.get(this._snapshot, 'status.retryPolicyStatus.totalRetriedCount', 0);
}

getSnapshot() {
return _.cloneDeep(this._snapshot);
}
Expand Down Expand Up @@ -290,6 +333,20 @@ class Snapshot {
}
this._snapshot = jsonmergepatch.apply(this._snapshot, patchData);
}

setFailed() {
// Manually set the framework's status to failed.
// This is used when we cannot sync the framework request to api server,
// and we're sure that the error is unrecoverable.

// For now, we only use it for frameworks that never start.
console.log(this.getTotalRetriedCount(), this.getState())
if (!this.getTotalRetriedCount() === 0 || this.getState() !== 'AttemptCreationPending') {
throw new Error('setFailed() only works for framework that never start!')
yqwang-ms marked this conversation as resolved.
Show resolved Hide resolved
}

this._snapshot.status = mockFailedFrameworkStatus();
}
}

// Class Add-ons handles creation/patching/deletion of job add-ons.
Expand Down
1 change: 1 addition & 0 deletions src/database-controller/src/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
"openpaidbsdk": "file:../sdk",
"p-queue": "^6.6.1",
"statuses": "^2.0.0",
"uuid": "^8.3.2",
"winston": "2"
},
"_moduleAliases": {
Expand Down
40 changes: 37 additions & 3 deletions src/database-controller/src/poller/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,11 @@ const interval = require('interval-promise');
const config = require('@dbc/poller/config');
const fetch = require('node-fetch');
const { deleteFramework } = require('@dbc/common/k8s');
const _ = require('lodash');
const {
isUnrecoverableResponse,
generateClusterEventUpdate,
} = require('@dbc/poller/utils');
// Here, we use AsyncLock to control the concurrency of frameworks with the same name;
// e.g. If framework A has request1, request2, and request3, we use AsyncLock
// to ensure they will be processed in order.
Expand All @@ -33,8 +38,8 @@ const databaseModel = new DatabaseModel(
config.maxDatabaseConnection,
);

async function postMockedDeleteEvent(snapshot) {
await fetch(`${config.writeMergerUrl}/api/v1/watchEvents/DELETED`, {
async function postMockedEvent(snapshot, eventType) {
await fetch(`${config.writeMergerUrl}/api/v1/watchEvents/${eventType}`, {
method: 'POST',
body: snapshot.getString(),
headers: { 'Content-Type': 'application/json' },
Expand Down Expand Up @@ -65,7 +70,7 @@ function deleteHandler(snapshot, pollingTs) {
`Cannot find framework ${frameworkName} in API Server. Will mock a deletion to write merger. Error:`,
err,
);
await postMockedDeleteEvent(snapshot);
await postMockedEvent(snapshot, 'DELETED');
} else {
// for non-404 error
throw err;
Expand Down Expand Up @@ -95,6 +100,35 @@ function synchronizeHandler(snapshot, addOns, pollingTs) {
);
}),
)
.catch(err => {
// if we are sure the error is not recoverable, we will mock a failed framework, and record this in events.
if (_.has(err, 'response') && isUnrecoverableResponse(err.response)) {
logger.warn(
`An error happened when synchronize request for framework ${frameworkName} and pollingTs=${pollingTs}. We are sure the error is not recoverable. Will mock a failed framework.`,
err,
);
queue.add(async () => {
// For safety reason, we only consider the job that never starts.
// snapshot.setFailed() will raise an error if the framework is not in state AttemptCreationPending or has retry history.
snapshot.setFailed();
await postMockedEvent(snapshot, 'MODIFIED');
await databaseModel.FrameworkEvent.create(
generateClusterEventUpdate(
snapshot,
'Warning',
'UnrecoverableSynchronizeFailure',
yqwang-ms marked this conversation as resolved.
Show resolved Hide resolved
_.get(err, 'response.body.message', 'unknown'),
),
);
}).catch(err => logger.error(
`An error happened when mock failed event for framework ${frameworkName} and pollingTs=${pollingTs}:`,
err,
));
hzy46 marked this conversation as resolved.
Show resolved Hide resolved
} else {
// if the error is recoverable, or we are not sure, just throw it
throw err;
}
})
.catch(err => {
logger.error(
`An error happened when synchronize request for framework ${frameworkName} and pollingTs=${pollingTs}:`,
Expand Down
66 changes: 66 additions & 0 deletions src/database-controller/src/poller/utils.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.
require('module-alias/register');
const _ = require('lodash');
const { v4: uuidv4 } = require('uuid');

function isUnrecoverableResponse(response) {
// indentify if the response is recoverable
const statusCode = _.get(response, 'statusCode', 0);
const message = _.get(response, 'body.message', '');
yqwang-ms marked this conversation as resolved.
Show resolved Hide resolved
if (statusCode === 413) {
// Code 413 means Payload Too Large.
// It happens when we PATCH a framework, and the PATCH's payload is too large.
return true;
} else if (
statusCode === 500 &&
message.indexOf(
'code = ResourceExhausted desc = trying to send message larger than max',
) !== -1
) {
// If we POST a large framework, Kubernetes API server will raise this error.
return true;
} else if (statusCode === 422) {
// If the request is not valid, Kubernetes API server will return code 422.
return true;
}
return false;
}

function generateClusterEventUpdate(snapshot, type, reason, message) {
const frameworkName = snapshot.getName();
const date = new Date();
const mockedObj = {
kind: 'Event',
apiVersion: 'v1',
metadata: {
name: `db-controller-event-for-${frameworkName}`,
namespace: 'default',
uid: uuidv4(),
creationTimestamp: date,
},
involvedObject: {},
reason: reason,
message: message,
firstTimestamp: date,
lastTimestamp: date,
count: 1,
type: type,
};
return {
uid: mockedObj.metadata.uid,
frameworkName: frameworkName,
type: mockedObj.type,
reason: mockedObj.reason,
message: mockedObj.message,
firstTimestamp: mockedObj.firstTimestamp,
lastTimestamp: mockedObj.lastTimestamp,
count: mockedObj.count,
event: JSON.stringify(mockedObj),
};
}

module.exports = {
isUnrecoverableResponse,
generateClusterEventUpdate,
};
5 changes: 5 additions & 0 deletions src/database-controller/src/yarn.lock
Original file line number Diff line number Diff line change
Expand Up @@ -2921,6 +2921,11 @@ uuid@^3.3.2, uuid@^3.3.3:
resolved "https://registry.yarnpkg.com/uuid/-/uuid-3.4.0.tgz#b23e4358afa8a202fe7a100af1f5f883f02007ee"
integrity sha512-HjSDRw6gZE5JMggctHBcjVak08+KEVhSIiDzFnT9S9aegmp85S/bReBVTb4QTFaRNptJ9kuYaNhnbNEOkbKb/A==

uuid@^8.3.2:
version "8.3.2"
resolved "https://registry.yarnpkg.com/uuid/-/uuid-8.3.2.tgz#80d5b5ced271bb9af6c445f21a1a04c606cefbe2"
integrity sha512-+NYs2QeMWy+GWFOEm9xnn6HCDp0l7QBD7ml8zLUmJ+93Q5NF0NocErnwkTkXVFNiX3/fpC6afS8Dhb/gz7R7eg==

v8-compile-cache@^2.0.3:
version "2.1.1"
resolved "https://registry.yarnpkg.com/v8-compile-cache/-/v8-compile-cache-2.1.1.tgz#54bc3cdd43317bca91e35dcaf305b1a7237de745"
Expand Down
14 changes: 14 additions & 0 deletions src/k8s-job-exit-spec/config/k8s-job-exit-spec.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -482,6 +482,20 @@ spec:
- reasonRegex: '(?i)^UnexpectedPredicateFailureType$'
messageRegex: '(?ms).*'

- code: -1007
phrase: UnrecoverableSchemaSynchronizeError
yqwang-ms marked this conversation as resolved.
Show resolved Hide resolved
issuer: PAI_K8S
causer: PAI_K8S
type: PLATFORM_FAILURE
stage: LAUNCHING
behavior: PERMANENT
reaction: NEVER_RETRY
reason: "Framework schema is submitted to database, but cannot be synchronized to API server due to unrecoverable error"
repro:
- "Submit a job with a very large job schema"
solution:
- "Please refer to job events for detailed reasons"


###########################################################################
# Range: [-1399, -1200]
Expand Down