Skip to content

Commit

Permalink
feat(dispatch-service): add limit for retries of status for jobs
Browse files Browse the repository at this point in the history
Jobs with an unknown status now have a limit on the number of status-check retries before being marked as failed,
to prevent never-ending jobs caused by a status response that stays unknown.
  • Loading branch information
bilalshaikh42 committed Oct 24, 2021
1 parent 1bea042 commit f187b03
Show file tree
Hide file tree
Showing 3 changed files with 22 additions and 4 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ export class DispatchProcessor {
slurmJobId: slurmjobId.toString(),
simId: data.simId,
isPublic: data.isPublic,
retryCount: 0,
};

this.monitorQueue.add(monitorData);
Expand Down
24 changes: 20 additions & 4 deletions apps/dispatch-service/src/app/submission/monitor.processor.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ import { Logger } from '@nestjs/common';
import { Job, Queue } from 'bull';
import { HpcService } from '../services/hpc/hpc.service';
import { SimulationStatusService } from '../services/simulationStatus.service';

const MAX_MONITOR_RETRY = 20;
@Processor(JobQueue.monitor)
export class MonitorProcessor {
private readonly logger = new Logger(MonitorProcessor.name);
Expand All @@ -24,11 +24,12 @@ export class MonitorProcessor {
) {}

@Process()
private async handleMonitoring(job: Job): Promise<void> {
private async handleMonitoring(job: Job<MonitorJob>): Promise<void> {
const data = job.data;
const slurmJobId = data.slurmJobId;
const isPublic = data.isPublic;
const simId = data.simId;
let retryCount = data.retryCount;
const DELAY = 5000;
const jobStatus: SimulationRunStatus | null =
await this.hpcService.getJobStatus(slurmJobId);
Expand All @@ -48,12 +49,27 @@ export class MonitorProcessor {
jobStatus == SimulationRunStatus.QUEUED ||
jobStatus == SimulationRunStatus.RUNNING
) {
this.monitorQueue.add({ slurmJobId, simId, isPublic }, { delay: DELAY });
this.monitorQueue.add(
{ slurmJobId, simId, isPublic, retryCount },
{ delay: DELAY },
);
} else {
this.logger.warn(
`${simId} skipped update, due to unknown status of ${jobStatus}`,
);
this.monitorQueue.add({ slurmJobId, simId, isPublic }, { delay: DELAY });
// If we keep getting some unknown status that does not resolve, fail the job after some limit of retries
if (retryCount < MAX_MONITOR_RETRY) {
retryCount = retryCount + 1;
this.monitorQueue.add(
{ slurmJobId, simId, isPublic, retryCount },
{ delay: DELAY },
);
} else {
this.logger.error(
`${simId} failed due to exceeded retry limit of status ${jobStatus}`,
);
this.failQueue.add({ simId, reason: message });
}
}
}
}
1 change: 1 addition & 0 deletions libs/messages/messages/src/lib/queues.ts
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ export class MonitorJob {
slurmJobId!: string;
simId!: string;
isPublic!: boolean;
retryCount!: number;
}

export class DispatchJob {
Expand Down

0 comments on commit f187b03

Please sign in to comment.