diff options
author | Bjørn Meland <bjormel@users.noreply.github.com> | 2020-07-21 11:08:37 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2020-07-21 11:08:37 +0200 |
commit | f4640a992be45eceba4b7eebf1b1eeb6d03fe39d (patch) | |
tree | e7f4b6d7b3051900f52b9ee5e0827ccb65fc869a | |
parent | cb81d2e33535db9e45c461630319c41dbf79ff0d (diff) | |
parent | aa5768c42fd854c9466baf06d70867bec4531298 (diff) |
Merge pull request #13928 from vespa-engine/mpolden/measure-consecutive-failures
Measure consecutive maintenance failures
8 files changed, 72 insertions, 66 deletions
diff --git a/configserver/src/main/java/com/yahoo/vespa/config/server/maintenance/ConfigServerMaintainer.java b/configserver/src/main/java/com/yahoo/vespa/config/server/maintenance/ConfigServerMaintainer.java index 007ca8dcf53..5854b1d85da 100644 --- a/configserver/src/main/java/com/yahoo/vespa/config/server/maintenance/ConfigServerMaintainer.java +++ b/configserver/src/main/java/com/yahoo/vespa/config/server/maintenance/ConfigServerMaintainer.java @@ -14,7 +14,6 @@ import com.yahoo.vespa.flags.FlagSource; import com.yahoo.vespa.flags.Flags; import com.yahoo.vespa.flags.ListFlag; -import java.time.Clock; import java.time.Duration; import java.util.Map; import java.util.Set; @@ -31,14 +30,13 @@ public abstract class ConfigServerMaintainer extends Maintainer { ConfigServerMaintainer(ApplicationRepository applicationRepository, Curator curator, FlagSource flagSource, Duration initialDelay, Duration interval) { super(null, interval, initialDelay, new JobControl(new JobControlFlags(curator, flagSource)), - jobMetrics(applicationRepository.clock(), applicationRepository.metric())); + jobMetrics(applicationRepository.metric())); this.applicationRepository = applicationRepository; } - private static JobMetrics jobMetrics(Clock clock, Metric metric) { - return new JobMetrics(clock, (job, instant) -> { - Duration sinceSuccess = Duration.between(instant, clock.instant()); - metric.set("maintenance.secondsSinceSuccess", sinceSuccess.getSeconds(), metric.createContext(Map.of("job", job))); + private static JobMetrics jobMetrics(Metric metric) { + return new JobMetrics((job, consecutiveFailures) -> { + metric.set("maintenance.consecutiveFailures", consecutiveFailures, metric.createContext(Map.of("job", job))); }); } diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ControllerMaintainer.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ControllerMaintainer.java index 76003a873fe..9bf6352813a 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ControllerMaintainer.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ControllerMaintainer.java @@ -7,7 +7,6 @@ import com.yahoo.config.provision.SystemName; import com.yahoo.jdisc.Metric; import com.yahoo.vespa.hosted.controller.Controller; -import java.time.Clock; import java.time.Duration; import java.util.EnumSet; import java.util.Map; @@ -35,7 +34,7 @@ public abstract class ControllerMaintainer extends Maintainer { public ControllerMaintainer(Controller controller, Duration interval, String name, Set<SystemName> activeSystems) { super(name, interval, controller.clock().instant(), controller.jobControl(), - jobMetrics(controller.clock(), controller.metric()), controller.curator().cluster()); + jobMetrics(controller.metric()), controller.curator().cluster()); this.controller = controller; this.activeSystems = Set.copyOf(Objects.requireNonNull(activeSystems)); } @@ -48,10 +47,9 @@ public abstract class ControllerMaintainer extends Maintainer { super.run(); } - private static JobMetrics jobMetrics(Clock clock, Metric metric) { - return new JobMetrics(clock, (job, instant) -> { - Duration sinceSuccess = Duration.between(instant, clock.instant()); - metric.set("maintenance.secondsSinceSuccess", sinceSuccess.getSeconds(), metric.createContext(Map.of("job", job))); + private static JobMetrics jobMetrics(Metric metric) { + return new JobMetrics((job, consecutiveFailures) -> { + metric.set("maintenance.consecutiveFailures", consecutiveFailures, metric.createContext(Map.of("job", job))); }); } diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/ControllerMaintainerTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/ControllerMaintainerTest.java index 4218e66703f..6a2feba1d47 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/ControllerMaintainerTest.java +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/ControllerMaintainerTest.java @@ -2,6 +2,7 @@ package com.yahoo.vespa.hosted.controller.maintenance; import com.yahoo.config.provision.SystemName; +import com.yahoo.vespa.hosted.controller.Controller; import com.yahoo.vespa.hosted.controller.ControllerTester; import com.yahoo.vespa.hosted.controller.integration.MetricsMock; import org.junit.Before; @@ -28,28 +29,47 @@ public class ControllerMaintainerTest { @Test public void only_runs_in_permitted_systems() { AtomicInteger executions = new AtomicInteger(); - maintainerIn(SystemName.cd, executions).run(); - maintainerIn(SystemName.main, executions).run(); + new TestControllerMaintainer(tester.controller(), SystemName.cd, executions).run(); + new TestControllerMaintainer(tester.controller(), SystemName.main, executions).run(); assertEquals(1, executions.get()); } @Test public void records_metric() { - maintainerIn(SystemName.main, new AtomicInteger()).run(); + TestControllerMaintainer maintainer = new TestControllerMaintainer(tester.controller(), SystemName.main, new AtomicInteger()); + maintainer.run(); + assertEquals(0L, consecutiveFailuresMetric()); + maintainer.success = false; + maintainer.run(); + maintainer.run(); + assertEquals(2L, consecutiveFailuresMetric()); + maintainer.success = true; + maintainer.run();; + assertEquals(0, consecutiveFailuresMetric()); + } + + private long consecutiveFailuresMetric() { MetricsMock metrics = (MetricsMock) tester.controller().metric(); - assertEquals(0L, metrics.getMetric((context) -> "MockMaintainer".equals(context.get("job")), - "maintenance.secondsSinceSuccess").get()); + return metrics.getMetric((context) -> "TestControllerMaintainer".equals(context.get("job")), + "maintenance.consecutiveFailures").get().longValue(); } - private ControllerMaintainer maintainerIn(SystemName system, AtomicInteger executions) { - return new ControllerMaintainer(tester.controller(), Duration.ofDays(1), - "MockMaintainer", EnumSet.of(system)) { - @Override - protected boolean maintain() { - executions.incrementAndGet(); - return true; - } - }; + private static class TestControllerMaintainer extends ControllerMaintainer { + + private final AtomicInteger executions; + private boolean success = true; + + public TestControllerMaintainer(Controller controller, SystemName system, AtomicInteger executions) { + super(controller, Duration.ofDays(1), null, EnumSet.of(system)); + this.executions = executions; + } + + @Override + protected boolean maintain() { + executions.incrementAndGet(); + return success; + } + } } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintainer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintainer.java index 85477dad729..5f87cf9fd9b 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintainer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintainer.java @@ -9,7 +9,6 @@ import com.yahoo.jdisc.Metric; import com.yahoo.vespa.hosted.provision.Node; import com.yahoo.vespa.hosted.provision.NodeRepository; -import java.time.Clock; import java.time.Duration; import java.util.List; import java.util.Map; @@ -25,8 +24,8 @@ public abstract class NodeRepositoryMaintainer extends Maintainer { private final NodeRepository nodeRepository; public NodeRepositoryMaintainer(NodeRepository nodeRepository, Duration interval, Metric metric) { - super(null, interval, nodeRepository.clock().instant(), nodeRepository.jobControl(), - jobMetrics(nodeRepository.clock(), metric), nodeRepository.database().cluster()); + super(null, interval, nodeRepository.clock().instant(), nodeRepository.jobControl(), jobMetrics(metric), + nodeRepository.database().cluster()); this.nodeRepository = nodeRepository; } @@ -45,10 +44,9 @@ public abstract class NodeRepositoryMaintainer extends Maintainer { .collect(Collectors.groupingBy(node -> node.allocation().get().owner())); } - private static JobMetrics jobMetrics(Clock clock, Metric metric) { - return new JobMetrics(clock, (job, instant) -> { - Duration sinceSuccess = Duration.between(instant, clock.instant()); - metric.set("maintenance.secondsSinceSuccess", sinceSuccess.getSeconds(), metric.createContext(Map.of("job", job))); + private static JobMetrics jobMetrics(Metric metric) { + return new JobMetrics((job, consecutiveFailures) -> { + metric.set("maintenance.consecutiveFailures", consecutiveFailures, metric.createContext(Map.of("job", job))); }); } diff --git a/vespajlib/src/main/java/com/yahoo/concurrent/maintenance/JobMetrics.java b/vespajlib/src/main/java/com/yahoo/concurrent/maintenance/JobMetrics.java index 4c05d46d782..a43e2156025 100644 --- a/vespajlib/src/main/java/com/yahoo/concurrent/maintenance/JobMetrics.java +++ b/vespajlib/src/main/java/com/yahoo/concurrent/maintenance/JobMetrics.java @@ -1,10 +1,7 @@ // Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.concurrent.maintenance; -import java.time.Clock; -import java.time.Instant; import java.util.Map; -import java.util.Objects; import java.util.concurrent.ConcurrentHashMap; import java.util.function.BiConsumer; @@ -15,26 +12,29 @@ import java.util.function.BiConsumer; */ public class JobMetrics { - private final Clock clock; - private final BiConsumer<String, Instant> metricConsumer; + private final BiConsumer<String, Long> metricConsumer; - private final Map<String, Instant> successfulRuns = new ConcurrentHashMap<>(); + private final Map<String, Long> incompleteRuns = new ConcurrentHashMap<>(); - public JobMetrics(Clock clock, BiConsumer<String, Instant> metricConsumer) { - this.clock = Objects.requireNonNull(clock); + public JobMetrics(BiConsumer<String, Long> metricConsumer) { this.metricConsumer = metricConsumer; } + /** Record a run for given job */ + public void recordRunOf(String job) { + incompleteRuns.compute(job, (ignored, run) -> run == null ? 1 : ++run); + } + /** Record successful run of given job */ public void recordSuccessOf(String job) { - successfulRuns.put(job, clock.instant()); + incompleteRuns.put(job, 0L); } /** Forward metrics for given job to metric consumer */ public void forward(String job) { - Instant lastSuccess = successfulRuns.get(job); - if (lastSuccess != null) { - metricConsumer.accept(job, lastSuccess); + Long incompleteRuns = this.incompleteRuns.get(job); + if (incompleteRuns != null) { + metricConsumer.accept(job, incompleteRuns); } } diff --git a/vespajlib/src/main/java/com/yahoo/concurrent/maintenance/Maintainer.java b/vespajlib/src/main/java/com/yahoo/concurrent/maintenance/Maintainer.java index 0385c27536d..eb9b91c812c 100644 --- a/vespajlib/src/main/java/com/yahoo/concurrent/maintenance/Maintainer.java +++ b/vespajlib/src/main/java/com/yahoo/concurrent/maintenance/Maintainer.java @@ -85,6 +85,7 @@ public abstract class Maintainer implements Runnable, AutoCloseable { public final void lockAndMaintain() { try (var lock = jobControl.lockJob(name())) { try { + jobMetrics.recordRunOf(name()); if (maintain()) jobMetrics.recordSuccessOf(name()); } finally { // Always forward metrics diff --git a/vespajlib/src/test/java/com/yahoo/concurrent/maintenance/MaintainerTest.java b/vespajlib/src/test/java/com/yahoo/concurrent/maintenance/MaintainerTest.java index 47ed010e95e..2bfaad894a5 100644 --- a/vespajlib/src/test/java/com/yahoo/concurrent/maintenance/MaintainerTest.java +++ b/vespajlib/src/test/java/com/yahoo/concurrent/maintenance/MaintainerTest.java @@ -1,16 +1,14 @@ // Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.concurrent.maintenance; -import com.yahoo.test.ManualClock; import org.junit.Test; import java.time.Duration; import java.time.Instant; import java.util.List; -import java.util.concurrent.atomic.AtomicReference; +import java.util.concurrent.atomic.AtomicLong; import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertNull; /** * @author freva @@ -41,37 +39,31 @@ public class MaintainerTest { @Test public void success_metric() { - ManualClock clock = new ManualClock(); - AtomicReference<Instant> lastSuccess = new AtomicReference<>(); - JobMetrics jobMetrics = new JobMetrics(clock, (job, instant) -> lastSuccess.set(instant)); + AtomicLong consecutiveFailures = new AtomicLong(); + JobMetrics jobMetrics = new JobMetrics((job, count) -> consecutiveFailures.set(count)); TestMaintainer maintainer = new TestMaintainer(jobMetrics); - // Maintainer not successful yet + // Maintainer fails twice in a row maintainer.successOnNextRun(false).run(); - assertNull(lastSuccess.get()); + assertEquals(1, consecutiveFailures.get()); + maintainer.successOnNextRun(false).run(); + assertEquals(2, consecutiveFailures.get()); // Maintainer runs successfully - clock.advance(Duration.ofHours(1)); - Instant lastSuccess0 = clock.instant(); maintainer.successOnNextRun(true).run(); - assertEquals(lastSuccess0, lastSuccess.get()); + assertEquals(0, consecutiveFailures.get()); // Maintainer runs successfully again - clock.advance(Duration.ofHours(2)); - Instant lastSuccess1 = clock.instant(); maintainer.run(); - assertEquals(lastSuccess1, lastSuccess.get()); + assertEquals(0, consecutiveFailures.get()); // Maintainer throws - clock.advance(Duration.ofHours(5)); maintainer.throwOnNextRun(true).run(); - assertEquals("Time of successful run is unchanged", lastSuccess1, lastSuccess.get()); + assertEquals(1, consecutiveFailures.get()); // Maintainer recovers - clock.advance(Duration.ofHours(3)); - Instant lastSuccess2 = clock.instant(); maintainer.throwOnNextRun(false).run(); - assertEquals(lastSuccess2, lastSuccess.get()); + assertEquals(0, consecutiveFailures.get()); } } diff --git a/vespajlib/src/test/java/com/yahoo/concurrent/maintenance/TestMaintainer.java b/vespajlib/src/test/java/com/yahoo/concurrent/maintenance/TestMaintainer.java index 0ea24fb6c2b..5eae643fe40 100644 --- a/vespajlib/src/test/java/com/yahoo/concurrent/maintenance/TestMaintainer.java +++ b/vespajlib/src/test/java/com/yahoo/concurrent/maintenance/TestMaintainer.java @@ -1,7 +1,6 @@ // Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.concurrent.maintenance; -import java.time.Clock; import java.time.Duration; /** @@ -22,7 +21,7 @@ class TestMaintainer extends Maintainer { } public TestMaintainer(String name, JobControl jobControl) { - this(name, jobControl, new JobMetrics(Clock.systemUTC(), (job, instant) -> {})); + this(name, jobControl, new JobMetrics((job, instant) -> {})); } public int totalRuns() { |