summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Jon Marius Venstad <jonmv@users.noreply.github.com> 2017-10-17 16:51:56 +0200
committer: GitHub <noreply@github.com> 2017-10-17 16:51:56 +0200
commit: 8d475f58081a43f27dc349135a53e54b2c479921 (patch)
tree: 7e5a44a216ba312ddf5ebef70213d499b89df12a
parent: e630d2258e15118cd5884ab5525d242e3a0c8f84 (diff)
parent: 080389c6153107c7debeaa1ad197795afb9c7635 (diff)
Merge pull request #3790 from vespa-engine/jvenstad/catch-metrics-fetching-errors-and-reduce-locking
Ignore timeouts (retried later anyway) and reduce locking time
-rw-r--r-- controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentMetricsMaintainer.java 33
1 files changed, 25 insertions, 8 deletions
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentMetricsMaintainer.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentMetricsMaintainer.java
index d9ef451ffb7..7a771464957 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentMetricsMaintainer.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentMetricsMaintainer.java
@@ -7,7 +7,9 @@ import com.yahoo.vespa.hosted.controller.api.integration.MetricsService;
import com.yahoo.vespa.hosted.controller.application.Deployment;
import com.yahoo.vespa.hosted.controller.application.DeploymentMetrics;
+import java.io.UncheckedIOException;
import java.time.Duration;
+import java.util.logging.Logger;
/**
* Retrieve deployment metrics like qps and document count from the metric service and
@@ -17,6 +19,8 @@ import java.time.Duration;
*/
public class DeploymentMetricsMaintainer extends Maintainer {
+ private static final Logger log = Logger.getLogger(DeploymentMetricsMaintainer.class.getName());
+
DeploymentMetricsMaintainer(Controller controller, Duration duration, JobControl jobControl) {
super(controller, duration, jobControl);
}
@@ -25,19 +29,32 @@ public class DeploymentMetricsMaintainer extends Maintainer {
protected void maintain() {
for (Application application : controller().applications().asList()) {
- try (Lock lock = controller().applications().lock(application.id())) {
- for (Deployment deployment : application.deployments().values()) {
-
+ for (Deployment deployment : application.deployments().values()) {
+ try {
MetricsService.DeploymentMetrics metrics = controller().metricsService()
.getDeploymentMetrics(application.id(), deployment.zone());
-
DeploymentMetrics appMetrics = new DeploymentMetrics(metrics.queriesPerSecond(), metrics.writesPerSecond(),
- metrics.documentCount(), metrics.queryLatencyMillis(), metrics.writeLatencyMillis());
-
- Application app = application.with(deployment.withMetrics(appMetrics));
- controller().applications().store(app, lock);
+ metrics.documentCount(), metrics.queryLatencyMillis(), metrics.writeLatencyMillis());
+
+ // Avoid locking for a long time, due to slow YAMAS.
+ try (Lock lock = controller().applications().lock(application.id())) {
+ // Deployment (or application) may have changed (or be gone) now:
+ controller().applications().get(application.id()).ifPresent(freshApplication -> {
+ Deployment freshDeployment = freshApplication.deployments().get(deployment.zone());
+ if (freshDeployment != null)
+ controller().applications().store(freshApplication.with(freshDeployment.withMetrics(appMetrics)), lock);
+ });
+ }
+ }
+ catch (UncheckedIOException e) {
+ log.warning("Timed out talking to YAMAS; retrying in " + maintenanceInterval() + ":\n" + e);
}
+
}
+
}
+
}
+
}
+