Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/567 metrics to identify stuck jobs #582

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -30,13 +30,13 @@ public List<MetricFamilySamples> collect() {

List<MetricCollector<LoadStatistics.LoadStatisticsSnapshot, ? extends Collector>> collectors = new ArrayList<>();

collectors.add(factory.createExecutorCollector(CollectorType.EXECUTORS_AVAILABLE_GAUGE, labelNameArray, prefix));
collectors.add(factory.createExecutorCollector(CollectorType.EXECUTORS_BUSY_GAUGE, labelNameArray, prefix));
collectors.add(factory.createExecutorCollector(CollectorType.EXECUTORS_CONNECTING_GAUGE, labelNameArray, prefix));
collectors.add(factory.createExecutorCollector(CollectorType.EXECUTORS_DEFINED_GAUGE, labelNameArray, prefix));
collectors.add(factory.createExecutorCollector(CollectorType.EXECUTORS_IDLE_GAUGE, labelNameArray, prefix));
collectors.add(factory.createExecutorCollector(CollectorType.EXECUTORS_ONLINE_GAUGE, labelNameArray, prefix));
collectors.add(factory.createExecutorCollector(CollectorType.EXECUTORS_QUEUE_LENGTH_GAUGE, labelNameArray, prefix));
collectors.add(factory.createLoadStatisticsCollector(CollectorType.EXECUTORS_AVAILABLE_GAUGE, labelNameArray, prefix));
collectors.add(factory.createLoadStatisticsCollector(CollectorType.EXECUTORS_BUSY_GAUGE, labelNameArray, prefix));
collectors.add(factory.createLoadStatisticsCollector(CollectorType.EXECUTORS_CONNECTING_GAUGE, labelNameArray, prefix));
collectors.add(factory.createLoadStatisticsCollector(CollectorType.EXECUTORS_DEFINED_GAUGE, labelNameArray, prefix));
collectors.add(factory.createLoadStatisticsCollector(CollectorType.EXECUTORS_IDLE_GAUGE, labelNameArray, prefix));
collectors.add(factory.createLoadStatisticsCollector(CollectorType.EXECUTORS_ONLINE_GAUGE, labelNameArray, prefix));
collectors.add(factory.createLoadStatisticsCollector(CollectorType.EXECUTORS_QUEUE_LENGTH_GAUGE, labelNameArray, prefix));

LOGGER.debug("getting load statistics for Executors");

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
private MetricCollector<Job<?, ?>, ? extends Collector> nbBuildsGauge;
private MetricCollector<Job<?, ?>, ? extends Collector> buildDiscardGauge;
private MetricCollector<Job<?, ?>, ? extends Collector> currentRunDurationGauge;
private MetricCollector<Job<?,?>, ? extends Collector> logUpdatedGauge;

private static class BuildMetrics {

Expand Down Expand Up @@ -129,6 +130,8 @@

currentRunDurationGauge = factory.createJobCollector(CollectorType.CURRENT_RUN_DURATION_GAUGE, labelBaseNameArray);

logUpdatedGauge = factory.createJobCollector(CollectorType.JOB_LOG_UPDATED_GAUGE, labelBaseNameArray);

if (PrometheusConfiguration.get().isPerBuildMetrics()) {
labelNameArray = Arrays.copyOf(labelNameArray, labelNameArray.length + 1);
labelNameArray[labelNameArray.length - 1] = "number";
Expand Down Expand Up @@ -164,6 +167,7 @@
addSamples(samples, nbBuildsGauge.collect(), "Adding [{}] samples from gauge ({})");
addSamples(samples, buildDiscardGauge.collect(), "Adding [{}] samples from gauge ({})");
addSamples(samples, currentRunDurationGauge.collect(), "Adding [{}] samples from gauge ({})");
addSamples(samples, logUpdatedGauge.collect(), "Adding [{}] samples from gauge ({})");
addSamples(samples, lastBuildMetrics);
if (PrometheusConfiguration.get().isPerBuildMetrics()) {
addSamples(samples, perBuildMetrics);
Expand Down Expand Up @@ -217,6 +221,8 @@
jobHealthScoreGauge.calculateMetric(job, baseLabelValueArray);
buildDiscardGauge.calculateMetric(job, baseLabelValueArray);
currentRunDurationGauge.calculateMetric(job, baseLabelValueArray);
logUpdatedGauge.calculateMetric(job, baseLabelValueArray);

Check warning on line 224 in src/main/java/org/jenkinsci/plugins/prometheus/JobCollector.java

View check run for this annotation

ci.jenkins.io / Code Coverage

Not covered line

Line 224 is not covered by tests

processRun(job, lastBuild, baseLabelValueArray, lastBuildMetrics);

Run<?, ?> run = lastBuild;
Expand Down
54 changes: 54 additions & 0 deletions src/main/java/org/jenkinsci/plugins/prometheus/NodeCollector.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
package org.jenkinsci.plugins.prometheus;

import hudson.model.Computer;
import hudson.model.Executor;
import hudson.model.Node;
import io.prometheus.client.Collector;
import jenkins.model.Jenkins;
import org.jenkinsci.plugins.prometheus.collectors.CollectorFactory;
import org.jenkinsci.plugins.prometheus.collectors.CollectorType;
import org.jenkinsci.plugins.prometheus.collectors.MetricCollector;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.Collection;
import java.util.List;
import java.util.Objects;
import java.util.stream.Collectors;

public class NodeCollector extends Collector {

private static final Logger LOGGER = LoggerFactory.getLogger(NodeCollector.class);

@Override
public List<MetricFamilySamples> collect() {
LOGGER.debug("Collecting node metrics for prometheus");
String[] labelNameArray = {"computerName"};

CollectorFactory factory = new CollectorFactory();

MetricCollector<Executor, ? extends Collector> likelyStuckCollector = factory.createExecutorStatisticsCollector(CollectorType.EXECUTOR_LIKELY_STUCK_GAUGE, labelNameArray);

List<? extends MetricCollector<Executor, ? extends Collector>> collectors = List.of(likelyStuckCollector);

List<Computer> computers = Jenkins.get().getNodes().parallelStream()
.map(Node::toComputer)
.filter(Objects::nonNull)
.collect(Collectors.toList());

for (Computer computer : computers) {

Check warning on line 39 in src/main/java/org/jenkinsci/plugins/prometheus/NodeCollector.java

View check run for this annotation

ci.jenkins.io / Code Coverage

Partially covered line

Line 39 is only partially covered, one branch is missing
String computerName = computer.getName();
List<Executor> executors = computer.getExecutors();
if (executors != null) {
for (Executor ex : executors) {
likelyStuckCollector.calculateMetric(ex, new String[]{computerName});
}
}
}

Check warning on line 47 in src/main/java/org/jenkinsci/plugins/prometheus/NodeCollector.java

View check run for this annotation

ci.jenkins.io / Code Coverage

Not covered lines

Lines 40-47 are not covered by tests

return collectors.stream()
.map(MetricCollector::collect)
.flatMap(Collection::stream)
.collect(Collectors.toList());
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import com.cloudbees.simplediskusage.DiskItem;
import com.cloudbees.simplediskusage.JobDiskItem;
import hudson.model.Executor;
import hudson.model.Job;
import hudson.model.LoadStatistics;
import hudson.model.Run;
Expand All @@ -13,7 +14,7 @@
import org.jenkinsci.plugins.prometheus.collectors.executors.ExecutorCollectorFactory;
import org.jenkinsci.plugins.prometheus.collectors.jenkins.JenkinsCollectorFactory;
import org.jenkinsci.plugins.prometheus.collectors.jobs.JobCollectorFactory;
import org.json.Cookie;
import org.jenkinsci.plugins.prometheus.collectors.nodes.NodeCollectorFactory;

import java.nio.file.FileStore;

Expand All @@ -27,13 +28,16 @@ public class CollectorFactory {

private final CoverageCollectorFactory coverageCollectorFactory;

private final NodeCollectorFactory nodeCollectorFactory;

/**
 * Creates the facade and instantiates one delegate factory per metric domain
 * (builds, jobs, Jenkins, executors, disk, coverage and nodes).
 */
public CollectorFactory() {
    this.buildCollectorFactory = new BuildCollectorFactory();
    this.jobCollectorFactory = new JobCollectorFactory();
    this.jenkinsCollectorFactory = new JenkinsCollectorFactory();
    this.executorCollectorFactory = new ExecutorCollectorFactory();
    this.diskCollectorFactory = new DiskCollectorFactory();
    this.coverageCollectorFactory = new CoverageCollectorFactory();
    this.nodeCollectorFactory = new NodeCollectorFactory();
}

public MetricCollector<Run<?,?>, ? extends Collector> createCoverageRunCollector(CollectorType type, String[] labelNames) {
Expand All @@ -48,11 +52,15 @@ public CollectorFactory() {
return jobCollectorFactory.createCollector(type, labelNames);
}

/**
 * Creates a per-executor metric collector by delegating to the node collector factory.
 *
 * @param type       the kind of collector to create
 * @param labelNames the label names the resulting metric is declared with
 * @return the collector returned by the node collector factory for {@code type}
 */
public MetricCollector<Executor, ? extends Collector> createExecutorStatisticsCollector(CollectorType type, String[] labelNames) {
    final MetricCollector<Executor, ? extends Collector> executorCollector =
            nodeCollectorFactory.createExecutorCollector(type, labelNames);
    return executorCollector;
}

/**
 * Creates a Jenkins-instance-level metric collector by delegating to the Jenkins collector factory.
 *
 * @param type       the kind of collector to create
 * @param labelNames the label names the resulting metric is declared with
 * @return the collector returned by the Jenkins collector factory for {@code type}
 */
public MetricCollector<Jenkins, ? extends Collector> createJenkinsCollector(CollectorType type, String[] labelNames) {
    final MetricCollector<Jenkins, ? extends Collector> jenkinsCollector =
            jenkinsCollectorFactory.createCollector(type, labelNames);
    return jenkinsCollector;
}

public MetricCollector<LoadStatistics.LoadStatisticsSnapshot, ? extends Collector> createExecutorCollector(CollectorType type, String[] labelNames, String prefix) {
public MetricCollector<LoadStatistics.LoadStatisticsSnapshot, ? extends Collector> createLoadStatisticsCollector(CollectorType type, String[] labelNames, String prefix) {
return executorCollectorFactory.createCollector(type, labelNames, prefix);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -52,9 +52,8 @@ public enum CollectorType {
COVERAGE_FILE_MISSED("coverage_file_missed"),
COVERAGE_FILE_TOTAL("coverage_file_total"),



;
JOB_LOG_UPDATED_GAUGE("job_log_updated"),
EXECUTOR_LIKELY_STUCK_GAUGE("likely_stuck");

private final String name;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@ public JobCollectorFactory() {
return saveBuildCollector(new BuildDiscardGauge(labelNames, namespace, subsystem));
case CURRENT_RUN_DURATION_GAUGE:
return saveBuildCollector(new CurrentRunDurationGauge(labelNames, namespace, subsystem));
case JOB_LOG_UPDATED_GAUGE:
return saveBuildCollector(new LogUpdatedGauge(labelNames, namespace, subsystem));
default:
return new NoOpMetricCollector<>();
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
package org.jenkinsci.plugins.prometheus.collectors.jobs;

import hudson.model.Job;
import io.prometheus.client.Gauge;
import io.prometheus.client.SimpleCollector;
import org.jenkinsci.plugins.prometheus.collectors.CollectorType;
import org.jenkinsci.plugins.prometheus.collectors.builds.BuildsMetricCollector;

public class LogUpdatedGauge extends BuildsMetricCollector<Job<?, ?>, Gauge> {

protected LogUpdatedGauge(String[] labelNames, String namespace, String subsystem) {
super(labelNames, namespace, subsystem);
}

@Override
protected CollectorType getCollectorType() {
return CollectorType.JOB_LOG_UPDATED_GAUGE;
}

@Override
protected String getHelpText() {
return "Provides a hint if a job is still logging. Maybe not 100% accurate - but a good hint.";
}

@Override
protected SimpleCollector.Builder<?, Gauge> getCollectorBuilder() {
return Gauge.build();
}

@Override
public void calculateMetric(Job<?, ?> jenkinsObject, String[] labelValues) {

if (jenkinsObject != null) {

Check warning on line 33 in src/main/java/org/jenkinsci/plugins/prometheus/collectors/jobs/LogUpdatedGauge.java

View check run for this annotation

ci.jenkins.io / Code Coverage

Partially covered line

Line 33 is only partially covered, one branch is missing
boolean logUpdated = jenkinsObject.isLogUpdated();
this.collector.labels(labelValues).set(logUpdated ? 1.0 : 0.0);
}
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
package org.jenkinsci.plugins.prometheus.collectors.nodes;

import hudson.model.Executor;
import io.prometheus.client.Gauge;
import io.prometheus.client.SimpleCollector;
import org.jenkinsci.plugins.prometheus.collectors.BaseMetricCollector;
import org.jenkinsci.plugins.prometheus.collectors.CollectorType;

public class ExecutorLikelyStuckGauge extends BaseMetricCollector<Executor, Gauge> {

protected ExecutorLikelyStuckGauge(String[] labelNames, String namespace, String subsystem) {
super(labelNames, namespace, subsystem);
}

@Override
protected CollectorType getCollectorType() {
return CollectorType.EXECUTOR_LIKELY_STUCK_GAUGE;
}

@Override
protected String getHelpText() {
return "Returns an indication if an executor of a node is likely stuck";
}

@Override
protected SimpleCollector.Builder<?, Gauge> getCollectorBuilder() {
return Gauge.build();
}

@Override
public void calculateMetric(Executor executor, String[] labelValues) {
if (executor == null) {
return;
}
boolean likelyStuck = executor.isLikelyStuck();
collector.labels(labelValues).set(likelyStuck ? 1.0 : 0.0);
}

Check warning on line 37 in src/main/java/org/jenkinsci/plugins/prometheus/collectors/nodes/ExecutorLikelyStuckGauge.java

View check run for this annotation

ci.jenkins.io / Code Coverage

Not covered lines

Lines 32-37 are not covered by tests

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
package org.jenkinsci.plugins.prometheus.collectors.nodes;

import hudson.model.Executor;
import org.jenkinsci.plugins.prometheus.collectors.BaseCollectorFactory;
import org.jenkinsci.plugins.prometheus.collectors.CollectorType;
import org.jenkinsci.plugins.prometheus.collectors.MetricCollector;
import org.jenkinsci.plugins.prometheus.collectors.NoOpMetricCollector;

import java.util.stream.Collector;

public class NodeCollectorFactory extends BaseCollectorFactory {

public NodeCollectorFactory(){super();}

Check warning

Code scanning / Pmd (reported by Codacy)

Avoid unnecessary constructors - the compiler will generate these for you Warning

Avoid unnecessary constructors - the compiler will generate these for you

public MetricCollector<Executor, ? extends Collector> createExecutorCollector(CollectorType type, String[] labelNames) {
switch (type) {

Check warning on line 16 in src/main/java/org/jenkinsci/plugins/prometheus/collectors/nodes/NodeCollectorFactory.java

View check run for this annotation

ci.jenkins.io / Code Coverage

Partially covered line

Line 16 is only partially covered, one branch is missing
case EXECUTOR_LIKELY_STUCK_GAUGE:
return saveBuildCollector(new ExecutorLikelyStuckGauge(labelNames, namespace, subsystem));
default:
return new NoOpMetricCollector<>();

Check warning on line 20 in src/main/java/org/jenkinsci/plugins/prometheus/collectors/nodes/NodeCollectorFactory.java

View check run for this annotation

ci.jenkins.io / Code Coverage

Not covered line

Line 20 is not covered by tests
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ public DefaultPrometheusMetrics() {
collectorRegistry.register(new DropwizardExports(Metrics.metricRegistry(), new JenkinsNodeBuildsSampleBuilder()));
collectorRegistry.register(new DiskUsageCollector());
collectorRegistry.register(new ExecutorCollector());
collectorRegistry.register(new NodeCollector());
collectorRegistry.register(new CodeCoverageCollector());

// other collectors from other plugins
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

public class BuildDiscardGaugeTest extends JobCollectorTest {

@Override
@Test
void testCollectResult() {
when(job.getBuildDiscarder()).thenReturn(null);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@ public class CurrentRunDurationGaugeTest extends JobCollectorTest {
@Mock
Run currentRun;

@Override
@Test
public void testCollectResult() {
when(currentRun.isBuilding()).thenReturn(true);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@

public class HealthScoreGaugeTest extends JobCollectorTest {

@Override
@Test
public void testCollectResult() {

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,4 @@ public abstract class JobCollectorTest extends CollectorTest {
protected Job job;


abstract void testCollectResult();


}
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
package org.jenkinsci.plugins.prometheus.collectors.jobs;

import io.prometheus.client.Collector;
import org.junit.Test;

import java.util.List;

import static org.mockito.Mockito.when;

public class LogUpdatedGaugeTest extends JobCollectorTest {


@Test
public void testBasicAttributes() {

Check notice

Code scanning / Pmd (reported by Codacy)

JUnit tests should include assert() or fail() Note test

JUnit tests should include assert() or fail()
when(job.isLogUpdated()).thenReturn(true);

LogUpdatedGauge sut = new LogUpdatedGauge(new String[]{"jenkins_job", "repo"}, "default", "jenkins");

sut.calculateMetric(job, new String[]{"job1", "NA"});
List<Collector.MetricFamilySamples> collect = sut.collect();

validateMetricFamilySampleListSize(collect, 1);

Collector.MetricFamilySamples samples = collect.get(0);
validateNames(samples, new String[]{"default_jenkins_builds_job_log_updated"});
validateMetricFamilySampleSize(samples, 1);

}

@Test
public void testLogIsUpdatedReturnsOne() {

Check notice

Code scanning / Pmd (reported by Codacy)

JUnit tests should include assert() or fail() Note test

JUnit tests should include assert() or fail()

when(job.isLogUpdated()).thenReturn(true);

LogUpdatedGauge sut = new LogUpdatedGauge(new String[]{"jenkins_job", "repo"}, "default", "jenkins");

sut.calculateMetric(job, new String[]{"job1", "NA"});
List<Collector.MetricFamilySamples> collect = sut.collect();
Collector.MetricFamilySamples samples = collect.get(0);
validateValue(samples.samples.get(0), 1.0);
}

@Test
public void testLogIsNotUpdatedReturnsZero() {

Check notice

Code scanning / Pmd (reported by Codacy)

JUnit tests should include assert() or fail() Note test

JUnit tests should include assert() or fail()

when(job.isLogUpdated()).thenReturn(false);

LogUpdatedGauge sut = new LogUpdatedGauge(new String[]{"jenkins_job", "repo"}, "default", "jenkins");

sut.calculateMetric(job, new String[]{"job1", "NA"});
List<Collector.MetricFamilySamples> collect = sut.collect();
Collector.MetricFamilySamples samples = collect.get(0);
validateValue(samples.samples.get(0), 0.0);
}
}
Loading