diff options
author | Jon Marius Venstad <jonmv@users.noreply.github.com> | 2017-10-17 16:51:56 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2017-10-17 16:51:56 +0200 |
commit | 8d475f58081a43f27dc349135a53e54b2c479921 (patch) | |
tree | 7e5a44a216ba312ddf5ebef70213d499b89df12a /controller-server | |
parent | e630d2258e15118cd5884ab5525d242e3a0c8f84 (diff) | |
parent | 080389c6153107c7debeaa1ad197795afb9c7635 (diff) |
Merge pull request #3790 from vespa-engine/jvenstad/catch-metrics-fetching-errors-and-reduce-locking
Ignore timeouts (retried later anyway) and reduce locking time
Diffstat (limited to 'controller-server')
-rw-r--r-- | controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentMetricsMaintainer.java | 33 |
1 file changed, 25 insertions, 8 deletions
```diff
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentMetricsMaintainer.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentMetricsMaintainer.java
index d9ef451ffb7..7a771464957 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentMetricsMaintainer.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentMetricsMaintainer.java
@@ -7,7 +7,9 @@ import com.yahoo.vespa.hosted.controller.api.integration.MetricsService;
 import com.yahoo.vespa.hosted.controller.application.Deployment;
 import com.yahoo.vespa.hosted.controller.application.DeploymentMetrics;
 
+import java.io.UncheckedIOException;
 import java.time.Duration;
+import java.util.logging.Logger;
 
 /**
  * Retrieve deployment metrics like qps and document count from the metric service and
@@ -17,6 +19,8 @@ import java.time.Duration;
  */
 public class DeploymentMetricsMaintainer extends Maintainer {
 
+    private static final Logger log = Logger.getLogger(DeploymentMetricsMaintainer.class.getName());
+
     DeploymentMetricsMaintainer(Controller controller, Duration duration, JobControl jobControl) {
         super(controller, duration, jobControl);
     }
@@ -25,19 +29,32 @@ public class DeploymentMetricsMaintainer extends Maintainer {
     protected void maintain() {
 
         for (Application application : controller().applications().asList()) {
-            try (Lock lock = controller().applications().lock(application.id())) {
-                for (Deployment deployment : application.deployments().values()) {
-
+            for (Deployment deployment : application.deployments().values()) {
+                try {
                     MetricsService.DeploymentMetrics metrics = controller().metricsService()
                             .getDeploymentMetrics(application.id(), deployment.zone());
-
                     DeploymentMetrics appMetrics = new DeploymentMetrics(metrics.queriesPerSecond(), metrics.writesPerSecond(),
-                            metrics.documentCount(), metrics.queryLatencyMillis(), metrics.writeLatencyMillis());
-
-                    Application app = application.with(deployment.withMetrics(appMetrics));
-                    controller().applications().store(app, lock);
+                            metrics.documentCount(), metrics.queryLatencyMillis(), metrics.writeLatencyMillis());
+
+                    // Avoid locking for a long time, due to slow YAMAS.
+                    try (Lock lock = controller().applications().lock(application.id())) {
+                        // Deployment (or application) may have changed (or be gone) now:
+                        controller().applications().get(application.id()).ifPresent(freshApplication -> {
+                            Deployment freshDeployment = freshApplication.deployments().get(deployment.zone());
+                            if (freshDeployment != null)
+                                controller().applications().store(freshApplication.with(freshDeployment.withMetrics(appMetrics)), lock);
+                        });
+                    }
+                }
+                catch (UncheckedIOException e) {
+                    log.warning("Timed out talking to YAMAS; retrying in " + maintenanceInterval() + ":\n" + e);
                 }
+
             }
+
         }
+
     }
+
 }
+
```