summary | refs | log | tree | commit | diff | stats
path: root/node-repository
diff options
context:
space:
mode:
author: Martin Polden <mpolden@mpolden.no> 2020-07-20 16:48:42 +0200
committer: Martin Polden <mpolden@mpolden.no> 2020-07-21 10:13:52 +0200
commit: aa5768c42fd854c9466baf06d70867bec4531298 (patch)
tree: 7afc13388bfa7b9d0a91924895c04ecd124df09f /node-repository
parent: bea398a2638d7b1071a2889da771d9fb72ad91d4 (diff)
Measure consecutive maintenance failures
Measuring time since last success results in a wide range of acceptable values, due to maintenance intervals varying from seconds to as long as half a day. Measure consecutive failures instead, to simplify alerting thresholds.
Diffstat (limited to 'node-repository')
-rw-r--r-- node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintainer.java | 12
1 file changed, 5 insertions, 7 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintainer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintainer.java
index 85477dad729..5f87cf9fd9b 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintainer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintainer.java
@@ -9,7 +9,6 @@ import com.yahoo.jdisc.Metric;
import com.yahoo.vespa.hosted.provision.Node;
import com.yahoo.vespa.hosted.provision.NodeRepository;
-import java.time.Clock;
import java.time.Duration;
import java.util.List;
import java.util.Map;
@@ -25,8 +24,8 @@ public abstract class NodeRepositoryMaintainer extends Maintainer {
private final NodeRepository nodeRepository;
public NodeRepositoryMaintainer(NodeRepository nodeRepository, Duration interval, Metric metric) {
- super(null, interval, nodeRepository.clock().instant(), nodeRepository.jobControl(),
- jobMetrics(nodeRepository.clock(), metric), nodeRepository.database().cluster());
+ super(null, interval, nodeRepository.clock().instant(), nodeRepository.jobControl(), jobMetrics(metric),
+ nodeRepository.database().cluster());
this.nodeRepository = nodeRepository;
}
@@ -45,10 +44,9 @@ public abstract class NodeRepositoryMaintainer extends Maintainer {
.collect(Collectors.groupingBy(node -> node.allocation().get().owner()));
}
- private static JobMetrics jobMetrics(Clock clock, Metric metric) {
- return new JobMetrics(clock, (job, instant) -> {
- Duration sinceSuccess = Duration.between(instant, clock.instant());
- metric.set("maintenance.secondsSinceSuccess", sinceSuccess.getSeconds(), metric.createContext(Map.of("job", job)));
+ private static JobMetrics jobMetrics(Metric metric) {
+ return new JobMetrics((job, consecutiveFailures) -> {
+ metric.set("maintenance.consecutiveFailures", consecutiveFailures, metric.createContext(Map.of("job", job)));
});
}