From 41afdc7ae8f1c11faa001e46580c01b9e9a87ff3 Mon Sep 17 00:00:00 2001 From: Yifan Xiong Date: Thu, 6 Aug 2020 10:43:53 +0800 Subject: [PATCH] [Rest Server] Update hived cell number calculation (#4757) Update hived cell number calculation and refine error messages. --- src/rest-server/src/middlewares/v2/hived.js | 37 +++++++++++++++------ 1 file changed, 26 insertions(+), 11 deletions(-) diff --git a/src/rest-server/src/middlewares/v2/hived.js b/src/rest-server/src/middlewares/v2/hived.js index e545c38d25..998e5d454c 100644 --- a/src/rest-server/src/middlewares/v2/hived.js +++ b/src/rest-server/src/middlewares/v2/hived.js @@ -175,10 +175,6 @@ const hivedValidate = async (protocolObj, username) => { let requestCellNumber = 0; const {cellQuota, cellUnits} = await getCellStatus(virtualCluster); for (let taskRole of Object.keys(protocolObj.taskRoles)) { - const {gpu = 0, cpu, memoryMB} = protocolObj.taskRoles[taskRole].resourcePerInstance; - const cellNumber = gpu === 0 ? cpu : gpu; - requestCellNumber += protocolObj.taskRoles[taskRole].instances * cellNumber; - const resourcePerCell = {}; for (const t of ['gpu', 'cpu', 'memory']) { resourcePerCell[t] = Math.min( @@ -190,7 +186,7 @@ const hivedValidate = async (protocolObj, username) => { priority: convertPriority(hivedConfig ? hivedConfig.jobPriorityClass : undefined), gpuType: null, pinnedCellId: null, - gpuNumber: cellNumber, + gpuNumber: 0, affinityGroup: null, }; if (hivedConfig && hivedConfig.taskRoles && taskRole in hivedConfig.taskRoles) { @@ -216,17 +212,36 @@ const hivedValidate = async (protocolObj, username) => { }; } - if (gpu > resourcePerCell.gpu * cellNumber || - cpu > resourcePerCell.cpu * cellNumber || - memoryMB > resourcePerCell.memory * cellNumber) { + const {gpu = 0, cpu, memoryMB} = protocolObj.taskRoles[taskRole].resourcePerInstance; + let requestedResource = ''; + let emptyResource = ''; + if (resourcePerCell.gpu === 0 && gpu > 0) { + requestedResource = resourcePerCell.gpu; + emptyResource = 'GPU'; + } else if (resourcePerCell.cpu === 0 && cpu > 0) { + requestedResource = resourcePerCell.cpu; + emptyResource = 'CPU'; + } else if (resourcePerCell.memory === 0 && memoryMB > 0) { + requestedResource = resourcePerCell.memory; + emptyResource = 'memory'; + } + if (emptyResource !== '') { throw createError( 'Bad Request', 'InvalidProtocolError', - `Taskrole ${taskRole} requests ${gpu} GPU, ${cpu} CPU, ${memoryMB}MB memory; ` + - `sku allows ${resourcePerCell.gpu} GPU, ${resourcePerCell.cpu} CPU, ${resourcePerCell.memory}MB memory per cell.` + `Taskrole ${taskRole} requests ${requestedResource} ${emptyResource}, but SKU does not ` + + `configure ${emptyResource}. Please contact admin if the taskrole needs ${emptyResource} resources.` ); } + const cellNumber = Math.max( + Math.ceil(gpu / resourcePerCell.gpu), + Math.ceil(cpu / resourcePerCell.cpu), + Math.ceil(memoryMB / resourcePerCell.memory), + ); + podSpec.gpuNumber = cellNumber; + requestCellNumber += protocolObj.taskRoles[taskRole].instances * cellNumber; + protocolObj.taskRoles[taskRole].hivedPodSpec = podSpec; } @@ -234,7 +249,7 @@ const hivedValidate = async (protocolObj, username) => { throw createError( 'Bad Request', 'InvalidProtocolError', - `Exceed ${cellQuota} GPU quota in ${virtualCluster} VC.` + `Job requests ${requestCellNumber} SKUs, exceeds maximum ${cellQuota} SKUs in VC ${virtualCluster}.` ); }