From f6c7528ea6bbd01c70767d14bd59da1b05e4bfa4 Mon Sep 17 00:00:00 2001 From: DavidGOrtega Date: Tue, 8 Jun 2021 15:47:26 +0000 Subject: [PATCH 01/12] Restart uncompleted and runner exits --- bin/cml-runner.js | 78 +++++++++++++---------- src/cml.js | 30 +++++---- src/drivers/bitbucket_cloud.js | 6 +- src/drivers/github.js | 109 +++++++++++++++++++++++++++++++-- src/drivers/gitlab.js | 24 ++++++++ 5 files changed, 196 insertions(+), 51 deletions(-) diff --git a/bin/cml-runner.js b/bin/cml-runner.js index 89edae82c..38b7928b5 100755 --- a/bin/cml-runner.js +++ b/bin/cml-runner.js @@ -17,7 +17,7 @@ const { RUNNER_PATH = `${WORKDIR_BASE}/${NAME}`, RUNNER_IDLE_TIMEOUT = 5 * 60, - RUNNER_DESTROY_DELAY = 30, + RUNNER_DESTROY_DELAY = 10, RUNNER_LABELS = 'cml', RUNNER_NAME = NAME, RUNNER_SINGLE = false, @@ -28,10 +28,10 @@ const { } = process.env; let cml; -let RUNNER_LAUNCHED = false; +let RUNNER; let RUNNER_TIMEOUT_TIMER = 0; let RUNNER_SHUTTING_DOWN = false; -const RUNNER_JOBS_RUNNING = []; +let RUNNER_JOBS_RUNNING = []; const shutdown = async (opts) => { if (RUNNER_SHUTTING_DOWN) return; @@ -39,7 +39,7 @@ const shutdown = async (opts) => { RUNNER_SHUTTING_DOWN = true; let { error, cloud } = opts; - const { name, workdir = '' } = opts; + const { name, workdir = '', tfResource } = opts; const tfPath = workdir; console.log( @@ -49,7 +49,7 @@ const shutdown = async (opts) => { const unregisterRunner = async () => { try { - console.log('Unregistering runner...'); + console.log(`Unregistering runner ${name}...`); await cml.unregisterRunner({ name }); console.log('\tSuccess'); } catch (err) { @@ -61,7 +61,7 @@ const shutdown = async (opts) => { const shutdownDockerMachine = async () => { console.log('docker-machine destroy...'); console.log( - 'Docker machine is deprecated and this will be removed!! Check how to deploy using our tf provider.' + 'Docker machine is deprecated and will be removed!! Check how to deploy using our tf provider.' ); try { await exec(`echo y | docker-machine rm ${DOCKER_MACHINE}`); @@ -71,22 +71,6 @@ const shutdown = async (opts) => { } }; - const shutdownTf = async () => { - const { tfResource } = opts; - - if (!tfResource) { - console.log(`\tNo TF resource found`); - return; - } - - try { - await tf.destroy({ dir: tfPath }); - } catch (err) { - console.error(`\tFailed Terraform destroy: ${err.message}`); - error = err; - } - }; - const destroyTerraform = async () => { try { console.log(await tf.destroy({ dir: tfPath })); @@ -99,17 +83,33 @@ const shutdown = async (opts) => { if (cloud) { await destroyTerraform(); } else { - RUNNER_LAUNCHED && (await unregisterRunner()); - - console.log( - `\tDestroy scheduled: ${RUNNER_DESTROY_DELAY} seconds remaining.` - ); await sleep(RUNNER_DESTROY_DELAY); + try { + console.log('RUNNER_JOBS_RUNNING', RUNNER_JOBS_RUNNING); + if (RUNNER_JOBS_RUNNING.length) { + await Promise.all( + RUNNER_JOBS_RUNNING.map( + async (jobId) => await cml.pipelineRestart({ jobId }) + ) + ); + } + } catch (err) { + console.log(err); + } + + RUNNER && (await unregisterRunner()); + + if (!tfResource) { + console.log(`\tNo TF resource found`); + } else { + await destroyTerraform(); + } + DOCKER_MACHINE && (await shutdownDockerMachine()); - await shutdownTf(); } + RUNNER && RUNNER.kill('SIGINT'); process.exit(error ? 1 : 0); }; @@ -214,17 +214,29 @@ const runLocal = async (opts) => { idleTimeout }); - const dataHandler = (data) => { - const log = cml.parseRunnerLog({ data }); + const dataHandler = async (data) => { + const log = await cml.parseRunnerLog({ data }); log && console.log(JSON.stringify(log)); if (log && log.status === 'job_started') { - RUNNER_JOBS_RUNNING.push(1); + RUNNER_JOBS_RUNNING.push(log.job); RUNNER_TIMEOUT_TIMER = 0; } else if (log && log.status === 'job_ended') { - RUNNER_JOBS_RUNNING.pop(); + const { job } = log; + if (!RUNNER_SHUTTING_DOWN) { + const jobs = job + ? [job] + : (await cml.pipelineJobs({ ids: RUNNER_JOBS_RUNNING })) + .filter((job) => job.status === 'completed') + .map((job) => job.id); + + RUNNER_JOBS_RUNNING = RUNNER_JOBS_RUNNING.filter( + (id) => !jobs.includes(id) + ); + } } }; + proc.stderr.on('data', dataHandler); proc.stdout.on('data', dataHandler); proc.on('uncaughtException', () => shutdown(opts)); @@ -242,7 +254,7 @@ const runLocal = async (opts) => { }, 1000); } - RUNNER_LAUNCHED = true; + RUNNER = proc; }; const run = async (opts) => { diff --git a/src/cml.js b/src/cml.js index b7df63add..230aa2ef9 100644 --- a/src/cml.js +++ b/src/cml.js @@ -143,59 +143,57 @@ class CML { return await getDriver(this).runnerToken(); } - parseRunnerLog(opts = {}) { + async parseRunnerLog(opts = {}) { let { data } = opts; if (!data) return; + const date = new Date(); + try { data = data.toString('utf8'); let log = { level: 'info', - time: new Date().toISOString(), + date: date.toISOString(), repo: this.repo }; if (this.driver === GITHUB) { if (data.includes('Running job')) { - log.job = ''; + const { id } = await getDriver(this).job({ time: date.getTime() }); + log.job = id; log.status = 'job_started'; - return log; } else if ( data.includes('Job') && data.includes('completed with result') ) { log.job = ''; log.status = 'job_ended'; - log.success = data.endsWith('Succeeded'); + log.success = data.includes('Succeeded'); log.level = log.success ? 'info' : 'error'; - return log; } else if (data.includes('Listening for Jobs')) { log.status = 'ready'; - return log; } + return log; } if (this.driver === GITLAB) { const { msg, job } = JSON.parse(data); + log = { ...log, job }; if (msg.endsWith('received')) { - log = { ...log, job }; log.status = 'job_started'; - return log; } else if ( msg.startsWith('Job failed') || msg.startsWith('Job succeeded') ) { - log = { ...log, job }; log.status = 'job_ended'; log.success = !msg.startsWith('Job failed'); log.level = log.success ? 'info' : 'error'; - return log; } else if (msg.includes('Starting runner for')) { log.status = 'ready'; - return log; } + return log; } } catch (err) { console.log(`Failed parsing log: ${err.message}`); @@ -321,6 +319,14 @@ Automated commits for ${this.repo}/commit/${sha} created by CML. return renderPr(url); } + async pipelineRestart(opts) { + return await getDriver(this).pipelineRestart(opts); + } + + async pipelineJobs(opts) { + return await getDriver(this).pipelineJobs(opts); + } + logError(e) { console.error(e.message); } diff --git a/src/drivers/bitbucket_cloud.js b/src/drivers/bitbucket_cloud.js index 1f6c097da..29353a62a 100644 --- a/src/drivers/bitbucket_cloud.js +++ b/src/drivers/bitbucket_cloud.js @@ -86,7 +86,7 @@ class BitBucketCloud { } async runnersByLabels(opts = {}) { - throw new Error('BitBucket Cloud does not support runner_by_labels!'); + throw new Error('BitBucket Cloud does not support runnerByLabels!'); } async prCreate(opts = {}) { @@ -148,6 +148,10 @@ class BitBucketCloud { }); } + async pipelineRestart(opts = {}) { + throw new Error('BitBucket Cloud does not support workflowRestart!'); + } + async request(opts = {}) { const { token, api } = this; const { endpoint, method = 'GET', body } = opts; diff --git a/src/drivers/github.js b/src/drivers/github.js index 140d52614..425bf77ec 100644 --- a/src/drivers/github.js +++ b/src/drivers/github.js @@ -144,18 +144,18 @@ class Github { const { name } = opts; const { owner, repo } = ownerRepo({ uri: this.repo }); const { actions } = octokit(this.token, this.repo); - const { id: runnerId } = await this.runnerByName({ name }); + const { id: runner_id } = await this.runnerByName({ name }); if (typeof repo !== 'undefined') { await actions.deleteSelfHostedRunnerFromRepo({ owner, repo, - runnerId + runner_id }); } else { await actions.deleteSelfHostedRunnerFromOrg({ org: owner, - runnerId + runner_id }); } } @@ -170,7 +170,7 @@ class Github { await fs.unlink(runnerCfg); } catch (e) { const arch = process.platform === 'darwin' ? 'osx-x64' : 'linux-x64'; - const ver = '2.274.2'; + const ver = '2.278.0'; const destination = resolve(workdir, 'actions-runner.tar.gz'); const url = `https://github.com/actions/runner/releases/download/v${ver}/actions-runner-${arch}-${ver}.tar.gz`; await download({ url, path: destination }); @@ -226,7 +226,7 @@ class Github { async runnerByName(opts = {}) { const { name } = opts; const runners = await this.getRunners(opts); - const runner = runners.filter((runner) => runner.name === name)[0]; + const runner = runners.find((runner) => runner.name === name); if (runner) return { id: runner.id, name: runner.name }; } @@ -288,6 +288,105 @@ class Github { }); } + async pipelineJobs(opts = {}) { + const { ids } = opts; + const { owner, repo } = ownerRepo({ uri: this.repo }); + const { actions } = octokit(this.token, this.repo); + + const jobs = await Promise.all( + ids.map(async (id) => { + const { data } = await actions.getJobForWorkflowRun({ + owner, + repo, + job_id: id + }); + + return data; + }) + ); + + return jobs.map((job) => { + const { id, started_at: date, run_id: runId, status } = job; + return { id, date, runId, status }; + }); + } + + async job(opts = {}) { + const { time, status = 'queued' } = opts; + const { owner, repo } = ownerRepo({ uri: this.repo }); + const { actions } = octokit(this.token, this.repo); + + const { + data: { workflow_runs: workflowRuns } + } = await actions.listWorkflowRunsForRepo({ + owner, + repo, + status + }); + + let runJobs = await Promise.all( + workflowRuns.map(async (run) => { + const { + data: { jobs } + } = await actions.listJobsForWorkflowRun({ + owner, + repo, + run_id: run.id, + status + }); + + return jobs; + }) + ); + + runJobs = [].concat.apply([], runJobs).map((job) => { + const { id, started_at: date, run_id: runId } = job; + return { id, date, runId }; + }); + + const job = runJobs.reduce((prev, curr) => { + const diffTime = (job) => Math.abs(new Date(job.date).getTime() - time); + return diffTime(curr) < diffTime(prev) ? curr : prev; + }); + + return job; + } + + async pipelineRestart(opts = {}) { + const { jobId: job_id } = opts; + const { owner, repo } = ownerRepo({ uri: this.repo }); + const { actions } = octokit(this.token, this.repo); + + const { + data: { run_id } + } = await actions.getJobForWorkflowRun({ + owner, + repo, + job_id + }); + + try { + await actions.cancelWorkflowRun({ + owner, + repo, + run_id + }); + } catch (err) { + console.log(err); + // HANDLES: Cannot cancel a workflow run that is completed. + } + + try { + await actions.reRunWorkflow({ + owner, + repo, + run_id + }); + } catch (err) { + console.log(err); + } + } + get sha() { if (GITHUB_EVENT_NAME === 'pull_request') return github.context.payload.pull_request.head.sha; diff --git a/src/drivers/gitlab.js b/src/drivers/gitlab.js index 85a9dd677..dfaa3826b 100644 --- a/src/drivers/gitlab.js +++ b/src/drivers/gitlab.js @@ -230,6 +230,30 @@ class Gitlab { }); } + async pipelineRestart(opts = {}) { + const projectPath = await this.projectPath(); + const { jobId } = opts; + + const { + pipeline: { id } + } = await this.request({ + endpoint: `/projects/${projectPath}/jobs/${jobId}` + }); + + let status; + while (!status || status === 'running') + ({ status } = await this.request({ + endpoint: `/projects/${projectPath}/pipelines/${id}/cancel`, + method: 'POST' + })); + + while (status !== 'running') + ({ status } = await this.request({ + endpoint: `/projects/${projectPath}/pipelines/${id}/retry`, + method: 'POST' + })); + } + async request(opts = {}) { const { token } = this; const { endpoint, method = 'GET', body, raw } = opts; From b70acb4c59218f21d82f2957547e9c7c6cb445b9 Mon Sep 17 00:00:00 2001 From: DavidGOrtega Date: Tue, 8 Jun 2021 15:52:00 +0000 Subject: [PATCH 02/12] no log --- src/drivers/github.js | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/drivers/github.js b/src/drivers/github.js index 425bf77ec..67baf05c4 100644 --- a/src/drivers/github.js +++ b/src/drivers/github.js @@ -372,7 +372,6 @@ class Github { run_id }); } catch (err) { - console.log(err); // HANDLES: Cannot cancel a workflow run that is completed. } @@ -382,9 +381,7 @@ class Github { repo, run_id }); - } catch (err) { - console.log(err); - } + } catch (err) {} } get sha() { From 150178ef18c5bfa8292b1929b6204725f726c934 Mon Sep 17 00:00:00 2001 From: DavidGOrtega Date: Tue, 8 Jun 2021 16:09:26 +0000 Subject: [PATCH 03/12] remove log --- bin/cml-runner.js | 1 - 1 file changed, 1 deletion(-) diff --git a/bin/cml-runner.js b/bin/cml-runner.js index 38b7928b5..74739456c 100755 --- a/bin/cml-runner.js +++ b/bin/cml-runner.js @@ -86,7 +86,6 @@ const shutdown = async (opts) => { await sleep(RUNNER_DESTROY_DELAY); try { - console.log('RUNNER_JOBS_RUNNING', RUNNER_JOBS_RUNNING); if (RUNNER_JOBS_RUNNING.length) { await Promise.all( RUNNER_JOBS_RUNNING.map( From 951762bfa0655e3ba5c0250d7db20e7cc3e73566 Mon Sep 17 00:00:00 2001 From: DavidGOrtega Date: Tue, 8 Jun 2021 16:32:37 +0000 Subject: [PATCH 04/12] overcome 72 hours limit --- bin/cml-runner.js | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/bin/cml-runner.js b/bin/cml-runner.js index 74739456c..55b42e793 100755 --- a/bin/cml-runner.js +++ b/bin/cml-runner.js @@ -89,7 +89,7 @@ const shutdown = async (opts) => { if (RUNNER_JOBS_RUNNING.length) { await Promise.all( RUNNER_JOBS_RUNNING.map( - async (jobId) => await cml.pipelineRestart({ jobId }) + async (job) => await cml.pipelineRestart({ jobId: job.id }) ) ); } @@ -218,7 +218,7 @@ const runLocal = async (opts) => { log && console.log(JSON.stringify(log)); if (log && log.status === 'job_started') { - RUNNER_JOBS_RUNNING.push(log.job); + RUNNER_JOBS_RUNNING.push({ id: log.job, date: log.date }); RUNNER_TIMEOUT_TIMER = 0; } else if (log && log.status === 'job_ended') { const { job } = log; @@ -230,7 +230,7 @@ const runLocal = async (opts) => { .map((job) => job.id); RUNNER_JOBS_RUNNING = RUNNER_JOBS_RUNNING.filter( - (id) => !jobs.includes(id) + (job) => !jobs.includes(job.id) ); } } @@ -253,6 +253,19 @@ const runLocal = async (opts) => { }, 1000); } + if (cml.driver === 'github') { + const watcher = setInterval(() => { + RUNNER_JOBS_RUNNING.forEach((job) => { + const seventyTwoMinusFive = 72 * 60 * 60 * 1000 - 5 * 60 * 1000; + if ( + new Date().getTime() - new Date(job.date).getTime() > + seventyTwoMinusFive + ) + shutdown(opts) && clearInterval(watcher); + }); + }, 60 * 1000); + } + RUNNER = proc; }; From b93ce6f3e132abc146929ca32a28f077e08474ea Mon Sep 17 00:00:00 2001 From: DavidGOrtega Date: Tue, 8 Jun 2021 17:00:48 +0000 Subject: [PATCH 05/12] retry param --- bin/cml-runner.js | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/bin/cml-runner.js b/bin/cml-runner.js index 55b42e793..f6aebe6f8 100755 --- a/bin/cml-runner.js +++ b/bin/cml-runner.js @@ -22,6 +22,7 @@ const { RUNNER_NAME = NAME, RUNNER_SINGLE = false, RUNNER_REUSE = false, + RUNNER_RETRY = false, RUNNER_DRIVER, RUNNER_REPO, REPO_TOKEN @@ -39,7 +40,7 @@ const shutdown = async (opts) => { RUNNER_SHUTTING_DOWN = true; let { error, cloud } = opts; - const { name, workdir = '', tfResource } = opts; + const { name, workdir = '', tfResource, retry } = opts; const tfPath = workdir; console.log( @@ -86,7 +87,7 @@ const shutdown = async (opts) => { await sleep(RUNNER_DESTROY_DELAY); try { - if (RUNNER_JOBS_RUNNING.length) { + if (retry && RUNNER_JOBS_RUNNING.length) { await Promise.all( RUNNER_JOBS_RUNNING.map( async (job) => await cml.pipelineRestart({ jobId: job.id }) @@ -203,7 +204,7 @@ const runCloud = async (opts) => { const runLocal = async (opts) => { console.log(`Launching ${cml.driver} runner`); - const { workdir, name, labels, single, idleTimeout } = opts; + const { workdir, name, labels, single, idleTimeout, retry } = opts; const proc = await cml.startRunner({ workdir, @@ -253,7 +254,7 @@ const runLocal = async (opts) => { }, 1000); } - if (cml.driver === 'github') { + if (retry && cml.driver === 'github') { const watcher = setInterval(() => { RUNNER_JOBS_RUNNING.forEach((job) => { const seventyTwoMinusFive = 72 * 60 * 60 * 1000 - 5 * 60 * 1000; @@ -351,6 +352,12 @@ const opts = yargs .default('name', RUNNER_NAME) .describe('name', 'Name displayed in the repository once registered') + .boolean('retry') + .default('retry', RUNNER_RETRY) + .describe( + 'single', + 'Automatically retries a run which jobs did not finish due to runner disposal or reached 72 hours in Github' + ) .boolean('single') .default('single', RUNNER_SINGLE) .describe('single', 'Exit after running a single job') From cbffb848f63266ca1bbf345d709d7c8fefd052d9 Mon Sep 17 00:00:00 2001 From: DavidGOrtega Date: Tue, 8 Jun 2021 17:06:08 +0000 Subject: [PATCH 06/12] docs --- README.md | 12 ++++++++---- bin/cml-runner.js | 2 +- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 85dbe0867..595c2233e 100644 --- a/README.md +++ b/README.md @@ -455,7 +455,10 @@ Options: jobs before shutting down. Setting it to 0 disables automatic shutdown [default: 300] --name Name displayed in the repository once registered - [default: "cml-4wdd123kha"] + [default: "cml-byujx1p0jz"] + --retry Automatically retries a run which jobs did not + finish due to runner disposal or reached 72 hours + in Github [boolean] [default: false] --single Exit after running a single job [boolean] [default: false] --reuse Don't launch a new runner if an existing one has @@ -471,14 +474,15 @@ Options: runner on the repository. If not specified, it will be inferred from the environment --cloud Cloud to deploy the runner - [choices: "aws", "azure"] + [choices: "aws", "azure", "kubernetes"] --cloud-region Region where the instance is deployed. Choices: [us-east, us-west, eu-west, eu-north]. Also accepts native cloud regions [default: "us-west"] --cloud-type Instance type. Choices: [m, l, xl]. Also supports native types like i.e. t2.micro - --cloud-gpu GPU type. [choices: "nogpu", "k80", "tesla"] - --cloud-hdd-size HDD size in GB. + --cloud-gpu GPU type. + [choices: "nogpu", "k80", "v100", "tesla"] + --cloud-hdd-size HDD size in GB --cloud-ssh-private Custom private RSA SSH key. If not provided an automatically generated throwaway key will be used [default: ""] diff --git a/bin/cml-runner.js b/bin/cml-runner.js index f6aebe6f8..8396b398f 100755 --- a/bin/cml-runner.js +++ b/bin/cml-runner.js @@ -355,7 +355,7 @@ const opts = yargs .boolean('retry') .default('retry', RUNNER_RETRY) .describe( - 'single', + 'retry', 'Automatically retries a run which jobs did not finish due to runner disposal or reached 72 hours in Github' ) .boolean('single') From 29e741d3a2ce9f3c65dc0aead5fd6b2b30c2d8e6 Mon Sep 17 00:00:00 2001 From: davidgortega Date: Fri, 18 Jun 2021 13:34:20 +0200 Subject: [PATCH 07/12] Update help and set GH_5_MIN_TIMEOUT --- README.md | 9 +++++---- bin/cml-runner.js | 12 ++++++------ 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index b855fbc66..e098a499a 100644 --- a/README.md +++ b/README.md @@ -457,10 +457,10 @@ Options: jobs before shutting down. Setting it to 0 disables automatic shutdown [default: 300] --name Name displayed in the repository once registered - [default: "cml-byujx1p0jz"] - --retry Automatically retries a run which jobs did not - finish due to runner disposal or reached 72 hours - in Github [boolean] [default: false] + cml-{ID} + --retry Automatically retry jobs terminated due to runner + disposal or timeout (72 hours on Github) + [boolean] [default: false] --single Exit after running a single job [boolean] [default: false] --reuse Don't launch a new runner if an existing one has @@ -498,6 +498,7 @@ Options: --cloud-startup-script Run the provided Base64-encoded Linux shell script during the instance initialization [default: ""] + --cloud-aws-security-group Specifies the security group in AWS [default: ""] -h Show help [boolean] ``` diff --git a/bin/cml-runner.js b/bin/cml-runner.js index acff4e86d..fe5c57821 100755 --- a/bin/cml-runner.js +++ b/bin/cml-runner.js @@ -33,6 +33,7 @@ let RUNNER; let RUNNER_TIMEOUT_TIMER = 0; let RUNNER_SHUTTING_DOWN = false; let RUNNER_JOBS_RUNNING = []; +const GH_5_MIN_TIMEOUT = (72 * 60 - 5) * 60 * 1000; const shutdown = async (opts) => { if (RUNNER_SHUTTING_DOWN) return; @@ -259,10 +260,9 @@ const runLocal = async (opts) => { if (retry && cml.driver === 'github') { const watcher = setInterval(() => { RUNNER_JOBS_RUNNING.forEach((job) => { - const seventyTwoMinusFive = 72 * 60 * 60 * 1000 - 5 * 60 * 1000; if ( new Date().getTime() - new Date(job.date).getTime() > - seventyTwoMinusFive + GH_5_MIN_TIMEOUT ) shutdown(opts) && clearInterval(watcher); }); @@ -351,14 +351,14 @@ const opts = yargs 'idle-timeout', 'Time in seconds for the runner to be waiting for jobs before shutting down. Setting it to 0 disables automatic shutdown' ) - .default('name', RUNNER_NAME) - .describe('name', 'Name displayed in the repository once registered') - + .default('name') + .describe('name', 'Name displayed in the repository once registered cml-{ID}') + .coerce('name', (val) => val || RUNNER_NAME) .boolean('retry') .default('retry', RUNNER_RETRY) .describe( 'retry', - 'Automatically retries a run which jobs did not finish due to runner disposal or reached 72 hours in Github' + 'Automatically retry jobs terminated due to runner disposal or timeout (72 hours on Github)' ) .boolean('single') .default('single', RUNNER_SINGLE) From 22679ea8ab118433571e2f4d368eadcd839f7b37 Mon Sep 17 00:00:00 2001 From: davidgortega Date: Fri, 18 Jun 2021 17:04:19 +0200 Subject: [PATCH 08/12] avoid enqueue runs --- src/drivers/github.js | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/src/drivers/github.js b/src/drivers/github.js index 8906cecaa..08c904ea9 100644 --- a/src/drivers/github.js +++ b/src/drivers/github.js @@ -375,13 +375,23 @@ class Github { // HANDLES: Cannot cancel a workflow run that is completed. } - try { - await actions.reRunWorkflow({ - owner, - repo, - run_id - }); - } catch (err) {} + const { + data: { status } + } = await actions.getWorkflowRun({ + owner, + repo, + run_id + }); + + if (status !== 'queued') { + try { + await actions.reRunWorkflow({ + owner, + repo, + run_id + }); + } catch (err) {} + } } get sha() { From 176da2836824fa01d145f9e27601e2b0c5b38b18 Mon Sep 17 00:00:00 2001 From: DavidGOrtega Date: Tue, 29 Jun 2021 12:14:57 +0000 Subject: [PATCH 09/12] fix job vs id --- bin/cml-runner.js | 3 ++- src/drivers/github.js | 18 +++++++++--------- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/bin/cml-runner.js b/bin/cml-runner.js index fe5c57821..f8d0443c6 100755 --- a/bin/cml-runner.js +++ b/bin/cml-runner.js @@ -226,10 +226,11 @@ const runLocal = async (opts) => { RUNNER_TIMEOUT_TIMER = 0; } else if (log && log.status === 'job_ended') { const { job } = log; + if (!RUNNER_SHUTTING_DOWN) { const jobs = job ? [job] - : (await cml.pipelineJobs({ ids: RUNNER_JOBS_RUNNING })) + : (await cml.pipelineJobs({ jobs: RUNNER_JOBS_RUNNING })) .filter((job) => job.status === 'completed') .map((job) => job.id); diff --git a/src/drivers/github.js b/src/drivers/github.js index 08c904ea9..c4d8f6cbd 100644 --- a/src/drivers/github.js +++ b/src/drivers/github.js @@ -289,16 +289,16 @@ class Github { } async pipelineJobs(opts = {}) { - const { ids } = opts; + const { jobs: runnerJobs } = opts; const { owner, repo } = ownerRepo({ uri: this.repo }); const { actions } = octokit(this.token, this.repo); const jobs = await Promise.all( - ids.map(async (id) => { + runnerJobs.map(async (job) => { const { data } = await actions.getJobForWorkflowRun({ owner, repo, - job_id: id + job_id: job.id }); return data; @@ -353,23 +353,23 @@ class Github { } async pipelineRestart(opts = {}) { - const { jobId: job_id } = opts; + const { jobId } = opts; const { owner, repo } = ownerRepo({ uri: this.repo }); const { actions } = octokit(this.token, this.repo); const { - data: { run_id } + data: { run_id: runId } } = await actions.getJobForWorkflowRun({ owner, repo, - job_id + job_id: jobId }); try { await actions.cancelWorkflowRun({ owner, repo, - run_id + run_id: runId }); } catch (err) { // HANDLES: Cannot cancel a workflow run that is completed. @@ -380,7 +380,7 @@ class Github { } = await actions.getWorkflowRun({ owner, repo, - run_id + run_id: runId }); if (status !== 'queued') { @@ -388,7 +388,7 @@ class Github { await actions.reRunWorkflow({ owner, repo, - run_id + run_id: runId }); } catch (err) {} } From c83679fef3639fcd97f2e54b049ea83c01991c31 Mon Sep 17 00:00:00 2001 From: davidgortega Date: Sun, 4 Jul 2021 23:33:58 +0200 Subject: [PATCH 10/12] no-rety --- bin/cml-runner.js | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/bin/cml-runner.js b/bin/cml-runner.js index f8d0443c6..32a81a25c 100755 --- a/bin/cml-runner.js +++ b/bin/cml-runner.js @@ -22,7 +22,7 @@ const { RUNNER_NAME = NAME, RUNNER_SINGLE = false, RUNNER_REUSE = false, - RUNNER_RETRY = false, + RUNNER_NO_RETRY = false, RUNNER_DRIVER, RUNNER_REPO, REPO_TOKEN @@ -41,7 +41,7 @@ const shutdown = async (opts) => { RUNNER_SHUTTING_DOWN = true; let { error, cloud } = opts; - const { name, workdir = '', tfResource, retry } = opts; + const { name, workdir = '', tfResource, noRetry } = opts; const tfPath = workdir; console.log( @@ -88,7 +88,7 @@ const shutdown = async (opts) => { await sleep(RUNNER_DESTROY_DELAY); try { - if (retry && RUNNER_JOBS_RUNNING.length) { + if (!noRetry && RUNNER_JOBS_RUNNING.length) { await Promise.all( RUNNER_JOBS_RUNNING.map( async (job) => await cml.pipelineRestart({ jobId: job.id }) @@ -207,7 +207,7 @@ const runCloud = async (opts) => { const runLocal = async (opts) => { console.log(`Launching ${cml.driver} runner`); - const { workdir, name, labels, single, idleTimeout, retry } = opts; + const { workdir, name, labels, single, idleTimeout, noRetry } = opts; const proc = await cml.startRunner({ workdir, @@ -258,7 +258,7 @@ const runLocal = async (opts) => { }, 1000); } - if (retry && cml.driver === 'github') { + if (!noRetry && cml.driver === 'github') { const watcher = setInterval(() => { RUNNER_JOBS_RUNNING.forEach((job) => { if ( @@ -355,11 +355,11 @@ const opts = yargs .default('name') .describe('name', 'Name displayed in the repository once registered cml-{ID}') .coerce('name', (val) => val || RUNNER_NAME) - .boolean('retry') - .default('retry', RUNNER_RETRY) + .boolean('no-retry') + .default('no-retry', RUNNER_NO_RETRY) .describe( - 'retry', - 'Automatically retry jobs terminated due to runner disposal or timeout (72 hours on Github)' + 'no-retry', + 'Do not restart workflow terminated due to instance disposal or GitHub Actions timeout' ) .boolean('single') .default('single', RUNNER_SINGLE) From 5e948ef04970c75fe116fa26767fa2d506205c74 Mon Sep 17 00:00:00 2001 From: davidgortega Date: Mon, 5 Jul 2021 02:50:22 +0200 Subject: [PATCH 11/12] remove unused error --- bin/cml-runner.js | 27 +++++++++++---------------- 1 file changed, 11 insertions(+), 16 deletions(-) diff --git a/bin/cml-runner.js b/bin/cml-runner.js index ac280f4ad..9addc7cce 100755 --- a/bin/cml-runner.js +++ b/bin/cml-runner.js @@ -17,7 +17,7 @@ const { RUNNER_PATH = `${WORKDIR_BASE}/${NAME}`, RUNNER_IDLE_TIMEOUT = 5 * 60, - RUNNER_DESTROY_DELAY = 10, + RUNNER_DESTROY_DELAY = 15, RUNNER_LABELS = 'cml', RUNNER_NAME = NAME, RUNNER_SINGLE = false, @@ -37,28 +37,22 @@ const GH_5_MIN_TIMEOUT = (72 * 60 - 5) * 60 * 1000; const shutdown = async (opts) => { if (RUNNER_SHUTTING_DOWN) return; - RUNNER_SHUTTING_DOWN = true; - let { error, cloud } = opts; + const { error, cloud } = opts; const { name, workdir = '', tfResource, noRetry } = opts; const tfPath = workdir; - console.log( - JSON.stringify({ level: error ? 'error' : 'info', status: 'terminated' }) - ); - if (error) console.error(error); - const unregisterRunner = async () => { if (!RUNNER) return; try { console.log(`Unregistering runner ${name}...`); + RUNNER && RUNNER.kill('SIGINT'); await cml.unregisterRunner({ name }); console.log('\tSuccess'); } catch (err) { - console.error('\tFailed'); - error = err; + console.error(`\tFailed: ${err.message}`); } }; @@ -72,7 +66,7 @@ const shutdown = async (opts) => { ); } } catch (err) { - console.log(err); + console.error(err); } }; @@ -87,7 +81,6 @@ const shutdown = async (opts) => { await exec(`echo y | docker-machine rm ${DOCKER_MACHINE}`); } catch (err) { console.error(`\tFailed shutting down docker machine: ${err.message}`); - error = err; } }; @@ -98,21 +91,23 @@ const shutdown = async (opts) => { console.log(await tf.destroy({ dir: tfPath })); } catch (err) { console.error(`\tFailed destroying terraform: ${err.message}`); - error = err; } }; + console.log( + JSON.stringify({ level: error ? 'error' : 'info', status: 'terminated' }) + ); + if (error) console.error(error); + await sleep(RUNNER_DESTROY_DELAY); + if (cloud) { await destroyTerraform(); } else { await unregisterRunner(); await retryWorkflows(); - if (DOCKER_MACHINE || tfResource) await sleep(RUNNER_DESTROY_DELAY); await destroyDockerMachine(); await destroyTerraform(); - - RUNNER && RUNNER.kill('SIGINT'); } process.exit(error ? 1 : 0); From 96743a4df929da6fef32161f0c09643bde998b89 Mon Sep 17 00:00:00 2001 From: DavidGOrtega Date: Mon, 5 Jul 2021 00:56:21 +0000 Subject: [PATCH 12/12] docs --- README.md | 86 ++++++++++++++++++++++------------------------- bin/cml-runner.js | 2 +- 2 files changed, 42 insertions(+), 46 deletions(-) diff --git a/README.md b/README.md index 5aed6ed7e..21dae0277 100644 --- a/README.md +++ b/README.md @@ -475,59 +475,55 @@ For example, `docker://dvcorg/cml:0-dvc2-base1-gpu`, or The `cml-runner` function accepts the following arguments: ``` -Usage: cml-runner.js +Usage: cml-runner Options: - --version Show version number [boolean] - --labels One or more user-defined labels for this runner - (delimited with commas) [default: "cml"] - --idle-timeout Time in seconds for the runner to be waiting for - jobs before shutting down. Setting it to 0 - disables automatic shutdown [default: 300] - --name Name displayed in the repository once registered - cml-{ID} - --retry Automatically retry jobs terminated due to runner - disposal or timeout (72 hours on Github) + --version Show version number [boolean] + --labels One or more user-defined labels for this runner + (delimited with commas) [default: "cml"] + --idle-timeout Time in seconds for the runner to be waiting for + jobs before shutting down. Setting it to 0 + disables automatic shutdown [default: 300] + --name Name displayed in the repository once registered + cml-{ID} + --no-retry Do not restart workflow terminated due to instance + disposal or GitHub Actions timeout [boolean] [default: false] - --single Exit after running a single job + --single Exit after running a single job [boolean] [default: false] - --reuse Don't launch a new runner if an existing one has - the same name or overlapping labels + --reuse Don't launch a new runner if an existing one has + the same name or overlapping labels [boolean] [default: false] - --driver Platform where the repository is hosted. If not - specified, it will be inferred from the - environment [choices: "github", "gitlab"] - --repo Repository to be used for registering the runner. - If not specified, it will be inferred from the - environment - --token Personal access token to register a self-hosted - runner on the repository. If not specified, it - will be inferred from the environment - --cloud Cloud to deploy the runner + --driver Platform where the repository is hosted. If not + specified, it will be inferred from the + environment [choices: "github", "gitlab"] + --repo Repository to be used for registering the runner. + If not specified, it will be inferred from the + environment + --token Personal access token to register a self-hosted + runner on the repository. If not specified, it + will be inferred from the environment + --cloud Cloud to deploy the runner [choices: "aws", "azure", "kubernetes"] - --cloud-region Region where the instance is deployed. Choices: - [us-east, us-west, eu-west, eu-north]. Also - accepts native cloud regions [default: "us-west"] - --cloud-type Instance type. Choices: [m, l, xl]. Also supports - native types like i.e. t2.micro - --cloud-gpu GPU type. + --cloud-region Region where the instance is deployed. Choices: + [us-east, us-west, eu-west, eu-north]. Also + accepts native cloud regions [default: "us-west"] + --cloud-type Instance type. Choices: [m, l, xl]. Also supports + native types like i.e. t2.micro + --cloud-gpu GPU type. [choices: "nogpu", "k80", "v100", "tesla"] - --cloud-hdd-size HDD size in GB - --cloud-ssh-private Custom private RSA SSH key. If not provided an - automatically generated throwaway key will be - used [default: ""] - --cloud-ssh-private-visible Show the private SSH key in the output with the - rest of the instance properties (not recommended) - [boolean] - --cloud-spot Request a spot instance [boolean] - --cloud-spot-price Maximum spot instance bidding price in USD. - Defaults to the current spot bidding price - [default: "-1"] - --cloud-startup-script Run the provided Base64-encoded Linux shell - script during the instance initialization + --cloud-hdd-size HDD size in GB + --cloud-ssh-private Custom private RSA SSH key. If not provided an + automatically generated throwaway key will be used [default: ""] - --cloud-aws-security-group Specifies the security group in AWS [default: ""] - -h Show help [boolean] + --cloud-spot Request a spot instance [boolean] + --cloud-spot-price Maximum spot instance bidding price in USD. + Defaults to the current spot bidding price + [default: "-1"] + --cloud-startup-script Run the provided Base64-encoded Linux shell script + during the instance initialization [default: ""] + --cloud-aws-security-group Specifies the security group in AWS [default: ""] + -h Show help [boolean] ``` #### Environment Variables diff --git a/bin/cml-runner.js b/bin/cml-runner.js index 9addc7cce..2978605c6 100755 --- a/bin/cml-runner.js +++ b/bin/cml-runner.js @@ -17,7 +17,7 @@ const { RUNNER_PATH = `${WORKDIR_BASE}/${NAME}`, RUNNER_IDLE_TIMEOUT = 5 * 60, - RUNNER_DESTROY_DELAY = 15, + RUNNER_DESTROY_DELAY = 20, RUNNER_LABELS = 'cml', RUNNER_NAME = NAME, RUNNER_SINGLE = false,