summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorBjørn Meland <bjormel@users.noreply.github.com>2020-07-21 11:08:37 +0200
committerGitHub <noreply@github.com>2020-07-21 11:08:37 +0200
commitf4640a992be45eceba4b7eebf1b1eeb6d03fe39d (patch)
treee7f4b6d7b3051900f52b9ee5e0827ccb65fc869a
parentcb81d2e33535db9e45c461630319c41dbf79ff0d (diff)
parentaa5768c42fd854c9466baf06d70867bec4531298 (diff)
Merge pull request #13928 from vespa-engine/mpolden/measure-consecutive-failures
Measure consecutive maintenance failures
-rw-r--r--configserver/src/main/java/com/yahoo/vespa/config/server/maintenance/ConfigServerMaintainer.java10
-rw-r--r--controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ControllerMaintainer.java10
-rw-r--r--controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/ControllerMaintainerTest.java48
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintainer.java12
-rw-r--r--vespajlib/src/main/java/com/yahoo/concurrent/maintenance/JobMetrics.java24
-rw-r--r--vespajlib/src/main/java/com/yahoo/concurrent/maintenance/Maintainer.java1
-rw-r--r--vespajlib/src/test/java/com/yahoo/concurrent/maintenance/MaintainerTest.java30
-rw-r--r--vespajlib/src/test/java/com/yahoo/concurrent/maintenance/TestMaintainer.java3
8 files changed, 72 insertions, 66 deletions
diff --git a/configserver/src/main/java/com/yahoo/vespa/config/server/maintenance/ConfigServerMaintainer.java b/configserver/src/main/java/com/yahoo/vespa/config/server/maintenance/ConfigServerMaintainer.java
index 007ca8dcf53..5854b1d85da 100644
--- a/configserver/src/main/java/com/yahoo/vespa/config/server/maintenance/ConfigServerMaintainer.java
+++ b/configserver/src/main/java/com/yahoo/vespa/config/server/maintenance/ConfigServerMaintainer.java
@@ -14,7 +14,6 @@ import com.yahoo.vespa.flags.FlagSource;
import com.yahoo.vespa.flags.Flags;
import com.yahoo.vespa.flags.ListFlag;
-import java.time.Clock;
import java.time.Duration;
import java.util.Map;
import java.util.Set;
@@ -31,14 +30,13 @@ public abstract class ConfigServerMaintainer extends Maintainer {
ConfigServerMaintainer(ApplicationRepository applicationRepository, Curator curator, FlagSource flagSource,
Duration initialDelay, Duration interval) {
super(null, interval, initialDelay, new JobControl(new JobControlFlags(curator, flagSource)),
- jobMetrics(applicationRepository.clock(), applicationRepository.metric()));
+ jobMetrics(applicationRepository.metric()));
this.applicationRepository = applicationRepository;
}
- private static JobMetrics jobMetrics(Clock clock, Metric metric) {
- return new JobMetrics(clock, (job, instant) -> {
- Duration sinceSuccess = Duration.between(instant, clock.instant());
- metric.set("maintenance.secondsSinceSuccess", sinceSuccess.getSeconds(), metric.createContext(Map.of("job", job)));
+ private static JobMetrics jobMetrics(Metric metric) {
+ return new JobMetrics((job, consecutiveFailures) -> {
+ metric.set("maintenance.consecutiveFailures", consecutiveFailures, metric.createContext(Map.of("job", job)));
});
}
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ControllerMaintainer.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ControllerMaintainer.java
index 76003a873fe..9bf6352813a 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ControllerMaintainer.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ControllerMaintainer.java
@@ -7,7 +7,6 @@ import com.yahoo.config.provision.SystemName;
import com.yahoo.jdisc.Metric;
import com.yahoo.vespa.hosted.controller.Controller;
-import java.time.Clock;
import java.time.Duration;
import java.util.EnumSet;
import java.util.Map;
@@ -35,7 +34,7 @@ public abstract class ControllerMaintainer extends Maintainer {
public ControllerMaintainer(Controller controller, Duration interval, String name, Set<SystemName> activeSystems) {
super(name, interval, controller.clock().instant(), controller.jobControl(),
- jobMetrics(controller.clock(), controller.metric()), controller.curator().cluster());
+ jobMetrics(controller.metric()), controller.curator().cluster());
this.controller = controller;
this.activeSystems = Set.copyOf(Objects.requireNonNull(activeSystems));
}
@@ -48,10 +47,9 @@ public abstract class ControllerMaintainer extends Maintainer {
super.run();
}
- private static JobMetrics jobMetrics(Clock clock, Metric metric) {
- return new JobMetrics(clock, (job, instant) -> {
- Duration sinceSuccess = Duration.between(instant, clock.instant());
- metric.set("maintenance.secondsSinceSuccess", sinceSuccess.getSeconds(), metric.createContext(Map.of("job", job)));
+ private static JobMetrics jobMetrics(Metric metric) {
+ return new JobMetrics((job, consecutiveFailures) -> {
+ metric.set("maintenance.consecutiveFailures", consecutiveFailures, metric.createContext(Map.of("job", job)));
});
}
diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/ControllerMaintainerTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/ControllerMaintainerTest.java
index 4218e66703f..6a2feba1d47 100644
--- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/ControllerMaintainerTest.java
+++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/ControllerMaintainerTest.java
@@ -2,6 +2,7 @@
package com.yahoo.vespa.hosted.controller.maintenance;
import com.yahoo.config.provision.SystemName;
+import com.yahoo.vespa.hosted.controller.Controller;
import com.yahoo.vespa.hosted.controller.ControllerTester;
import com.yahoo.vespa.hosted.controller.integration.MetricsMock;
import org.junit.Before;
@@ -28,28 +29,47 @@ public class ControllerMaintainerTest {
@Test
public void only_runs_in_permitted_systems() {
AtomicInteger executions = new AtomicInteger();
- maintainerIn(SystemName.cd, executions).run();
- maintainerIn(SystemName.main, executions).run();
+ new TestControllerMaintainer(tester.controller(), SystemName.cd, executions).run();
+ new TestControllerMaintainer(tester.controller(), SystemName.main, executions).run();
assertEquals(1, executions.get());
}
@Test
public void records_metric() {
- maintainerIn(SystemName.main, new AtomicInteger()).run();
+ TestControllerMaintainer maintainer = new TestControllerMaintainer(tester.controller(), SystemName.main, new AtomicInteger());
+ maintainer.run();
+ assertEquals(0L, consecutiveFailuresMetric());
+ maintainer.success = false;
+ maintainer.run();
+ maintainer.run();
+ assertEquals(2L, consecutiveFailuresMetric());
+ maintainer.success = true;
+ maintainer.run();;
+ assertEquals(0, consecutiveFailuresMetric());
+ }
+
+ private long consecutiveFailuresMetric() {
MetricsMock metrics = (MetricsMock) tester.controller().metric();
- assertEquals(0L, metrics.getMetric((context) -> "MockMaintainer".equals(context.get("job")),
- "maintenance.secondsSinceSuccess").get());
+ return metrics.getMetric((context) -> "TestControllerMaintainer".equals(context.get("job")),
+ "maintenance.consecutiveFailures").get().longValue();
}
- private ControllerMaintainer maintainerIn(SystemName system, AtomicInteger executions) {
- return new ControllerMaintainer(tester.controller(), Duration.ofDays(1),
- "MockMaintainer", EnumSet.of(system)) {
- @Override
- protected boolean maintain() {
- executions.incrementAndGet();
- return true;
- }
- };
+ private static class TestControllerMaintainer extends ControllerMaintainer {
+
+ private final AtomicInteger executions;
+ private boolean success = true;
+
+ public TestControllerMaintainer(Controller controller, SystemName system, AtomicInteger executions) {
+ super(controller, Duration.ofDays(1), null, EnumSet.of(system));
+ this.executions = executions;
+ }
+
+ @Override
+ protected boolean maintain() {
+ executions.incrementAndGet();
+ return success;
+ }
+
}
}
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintainer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintainer.java
index 85477dad729..5f87cf9fd9b 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintainer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintainer.java
@@ -9,7 +9,6 @@ import com.yahoo.jdisc.Metric;
import com.yahoo.vespa.hosted.provision.Node;
import com.yahoo.vespa.hosted.provision.NodeRepository;
-import java.time.Clock;
import java.time.Duration;
import java.util.List;
import java.util.Map;
@@ -25,8 +24,8 @@ public abstract class NodeRepositoryMaintainer extends Maintainer {
private final NodeRepository nodeRepository;
public NodeRepositoryMaintainer(NodeRepository nodeRepository, Duration interval, Metric metric) {
- super(null, interval, nodeRepository.clock().instant(), nodeRepository.jobControl(),
- jobMetrics(nodeRepository.clock(), metric), nodeRepository.database().cluster());
+ super(null, interval, nodeRepository.clock().instant(), nodeRepository.jobControl(), jobMetrics(metric),
+ nodeRepository.database().cluster());
this.nodeRepository = nodeRepository;
}
@@ -45,10 +44,9 @@ public abstract class NodeRepositoryMaintainer extends Maintainer {
.collect(Collectors.groupingBy(node -> node.allocation().get().owner()));
}
- private static JobMetrics jobMetrics(Clock clock, Metric metric) {
- return new JobMetrics(clock, (job, instant) -> {
- Duration sinceSuccess = Duration.between(instant, clock.instant());
- metric.set("maintenance.secondsSinceSuccess", sinceSuccess.getSeconds(), metric.createContext(Map.of("job", job)));
+ private static JobMetrics jobMetrics(Metric metric) {
+ return new JobMetrics((job, consecutiveFailures) -> {
+ metric.set("maintenance.consecutiveFailures", consecutiveFailures, metric.createContext(Map.of("job", job)));
});
}
diff --git a/vespajlib/src/main/java/com/yahoo/concurrent/maintenance/JobMetrics.java b/vespajlib/src/main/java/com/yahoo/concurrent/maintenance/JobMetrics.java
index 4c05d46d782..a43e2156025 100644
--- a/vespajlib/src/main/java/com/yahoo/concurrent/maintenance/JobMetrics.java
+++ b/vespajlib/src/main/java/com/yahoo/concurrent/maintenance/JobMetrics.java
@@ -1,10 +1,7 @@
// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.concurrent.maintenance;
-import java.time.Clock;
-import java.time.Instant;
import java.util.Map;
-import java.util.Objects;
import java.util.concurrent.ConcurrentHashMap;
import java.util.function.BiConsumer;
@@ -15,26 +12,29 @@ import java.util.function.BiConsumer;
*/
public class JobMetrics {
- private final Clock clock;
- private final BiConsumer<String, Instant> metricConsumer;
+ private final BiConsumer<String, Long> metricConsumer;
- private final Map<String, Instant> successfulRuns = new ConcurrentHashMap<>();
+ private final Map<String, Long> incompleteRuns = new ConcurrentHashMap<>();
- public JobMetrics(Clock clock, BiConsumer<String, Instant> metricConsumer) {
- this.clock = Objects.requireNonNull(clock);
+ public JobMetrics(BiConsumer<String, Long> metricConsumer) {
this.metricConsumer = metricConsumer;
}
+ /** Record a run for given job */
+ public void recordRunOf(String job) {
+ incompleteRuns.compute(job, (ignored, run) -> run == null ? 1 : ++run);
+ }
+
/** Record successful run of given job */
public void recordSuccessOf(String job) {
- successfulRuns.put(job, clock.instant());
+ incompleteRuns.put(job, 0L);
}
/** Forward metrics for given job to metric consumer */
public void forward(String job) {
- Instant lastSuccess = successfulRuns.get(job);
- if (lastSuccess != null) {
- metricConsumer.accept(job, lastSuccess);
+ Long incompleteRuns = this.incompleteRuns.get(job);
+ if (incompleteRuns != null) {
+ metricConsumer.accept(job, incompleteRuns);
}
}
diff --git a/vespajlib/src/main/java/com/yahoo/concurrent/maintenance/Maintainer.java b/vespajlib/src/main/java/com/yahoo/concurrent/maintenance/Maintainer.java
index 0385c27536d..eb9b91c812c 100644
--- a/vespajlib/src/main/java/com/yahoo/concurrent/maintenance/Maintainer.java
+++ b/vespajlib/src/main/java/com/yahoo/concurrent/maintenance/Maintainer.java
@@ -85,6 +85,7 @@ public abstract class Maintainer implements Runnable, AutoCloseable {
public final void lockAndMaintain() {
try (var lock = jobControl.lockJob(name())) {
try {
+ jobMetrics.recordRunOf(name());
if (maintain()) jobMetrics.recordSuccessOf(name());
} finally {
// Always forward metrics
diff --git a/vespajlib/src/test/java/com/yahoo/concurrent/maintenance/MaintainerTest.java b/vespajlib/src/test/java/com/yahoo/concurrent/maintenance/MaintainerTest.java
index 47ed010e95e..2bfaad894a5 100644
--- a/vespajlib/src/test/java/com/yahoo/concurrent/maintenance/MaintainerTest.java
+++ b/vespajlib/src/test/java/com/yahoo/concurrent/maintenance/MaintainerTest.java
@@ -1,16 +1,14 @@
// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.concurrent.maintenance;
-import com.yahoo.test.ManualClock;
import org.junit.Test;
import java.time.Duration;
import java.time.Instant;
import java.util.List;
-import java.util.concurrent.atomic.AtomicReference;
+import java.util.concurrent.atomic.AtomicLong;
import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertNull;
/**
* @author freva
@@ -41,37 +39,31 @@ public class MaintainerTest {
@Test
public void success_metric() {
- ManualClock clock = new ManualClock();
- AtomicReference<Instant> lastSuccess = new AtomicReference<>();
- JobMetrics jobMetrics = new JobMetrics(clock, (job, instant) -> lastSuccess.set(instant));
+ AtomicLong consecutiveFailures = new AtomicLong();
+ JobMetrics jobMetrics = new JobMetrics((job, count) -> consecutiveFailures.set(count));
TestMaintainer maintainer = new TestMaintainer(jobMetrics);
- // Maintainer not successful yet
+ // Maintainer fails twice in a row
maintainer.successOnNextRun(false).run();
- assertNull(lastSuccess.get());
+ assertEquals(1, consecutiveFailures.get());
+ maintainer.successOnNextRun(false).run();
+ assertEquals(2, consecutiveFailures.get());
// Maintainer runs successfully
- clock.advance(Duration.ofHours(1));
- Instant lastSuccess0 = clock.instant();
maintainer.successOnNextRun(true).run();
- assertEquals(lastSuccess0, lastSuccess.get());
+ assertEquals(0, consecutiveFailures.get());
// Maintainer runs successfully again
- clock.advance(Duration.ofHours(2));
- Instant lastSuccess1 = clock.instant();
maintainer.run();
- assertEquals(lastSuccess1, lastSuccess.get());
+ assertEquals(0, consecutiveFailures.get());
// Maintainer throws
- clock.advance(Duration.ofHours(5));
maintainer.throwOnNextRun(true).run();
- assertEquals("Time of successful run is unchanged", lastSuccess1, lastSuccess.get());
+ assertEquals(1, consecutiveFailures.get());
// Maintainer recovers
- clock.advance(Duration.ofHours(3));
- Instant lastSuccess2 = clock.instant();
maintainer.throwOnNextRun(false).run();
- assertEquals(lastSuccess2, lastSuccess.get());
+ assertEquals(0, consecutiveFailures.get());
}
}
diff --git a/vespajlib/src/test/java/com/yahoo/concurrent/maintenance/TestMaintainer.java b/vespajlib/src/test/java/com/yahoo/concurrent/maintenance/TestMaintainer.java
index 0ea24fb6c2b..5eae643fe40 100644
--- a/vespajlib/src/test/java/com/yahoo/concurrent/maintenance/TestMaintainer.java
+++ b/vespajlib/src/test/java/com/yahoo/concurrent/maintenance/TestMaintainer.java
@@ -1,7 +1,6 @@
// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.concurrent.maintenance;
-import java.time.Clock;
import java.time.Duration;
/**
@@ -22,7 +21,7 @@ class TestMaintainer extends Maintainer {
}
public TestMaintainer(String name, JobControl jobControl) {
- this(name, jobControl, new JobMetrics(Clock.systemUTC(), (job, instant) -> {}));
+ this(name, jobControl, new JobMetrics((job, instant) -> {}));
}
public int totalRuns() {