diff options
author | Martin Polden <mpolden@mpolden.no> | 2020-07-20 16:48:42 +0200 |
---|---|---|
committer | Martin Polden <mpolden@mpolden.no> | 2020-07-21 10:13:52 +0200 |
commit | aa5768c42fd854c9466baf06d70867bec4531298 (patch) | |
tree | 7afc13388bfa7b9d0a91924895c04ecd124df09f /controller-server/src/main | |
parent | bea398a2638d7b1071a2889da771d9fb72ad91d4 (diff) |
Measure consecutive maintenance failures
Measuring time since last success results in a wide range of acceptable values,
due to maintenance intervals varying from seconds to as long as half a day.
Measure consecutive failures instead, to simplify alerting thresholds.
Diffstat (limited to 'controller-server/src/main')
-rw-r--r-- | controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ControllerMaintainer.java | 10 |
1 file changed, 4 insertions, 6 deletions
```diff
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ControllerMaintainer.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ControllerMaintainer.java
index 76003a873fe..9bf6352813a 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ControllerMaintainer.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ControllerMaintainer.java
@@ -7,7 +7,6 @@ import com.yahoo.config.provision.SystemName;
 import com.yahoo.jdisc.Metric;
 import com.yahoo.vespa.hosted.controller.Controller;
 
-import java.time.Clock;
 import java.time.Duration;
 import java.util.EnumSet;
 import java.util.Map;
@@ -35,7 +34,7 @@ public abstract class ControllerMaintainer extends Maintainer {
 
     public ControllerMaintainer(Controller controller, Duration interval, String name, Set<SystemName> activeSystems) {
         super(name, interval, controller.clock().instant(), controller.jobControl(),
-              jobMetrics(controller.clock(), controller.metric()), controller.curator().cluster());
+              jobMetrics(controller.metric()), controller.curator().cluster());
         this.controller = controller;
         this.activeSystems = Set.copyOf(Objects.requireNonNull(activeSystems));
     }
@@ -48,10 +47,9 @@ public abstract class ControllerMaintainer extends Maintainer {
         super.run();
     }
 
-    private static JobMetrics jobMetrics(Clock clock, Metric metric) {
-        return new JobMetrics(clock, (job, instant) -> {
-            Duration sinceSuccess = Duration.between(instant, clock.instant());
-            metric.set("maintenance.secondsSinceSuccess", sinceSuccess.getSeconds(), metric.createContext(Map.of("job", job)));
+    private static JobMetrics jobMetrics(Metric metric) {
+        return new JobMetrics((job, consecutiveFailures) -> {
+            metric.set("maintenance.consecutiveFailures", consecutiveFailures, metric.createContext(Map.of("job", job)));
         });
     }
 
```