diff options
author | Jon Bratseth <bratseth@oath.com> | 2021-03-15 11:11:51 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2021-03-15 11:11:51 +0100 |
commit | d17f07156b04e8a96784abcbf6ef0a1628935d03 (patch) | |
tree | c476dc4c65e26ad5f7c05542b0761ea5be4b0e3c | |
parent | 49412d97fcf8e05972d3961e51d00b83580a7ea1 (diff) | |
parent | 0756cef71ab86012504dd60eb776b8221785c15b (diff) |
Merge pull request #16945 from vespa-engine/bratseth/traffic-share-updater-robustness
Continue on failures
2 files changed, 22 insertions, 8 deletions
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/TrafficShareUpdater.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/TrafficShareUpdater.java index 5938925e9bc..fbe9faa9754 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/TrafficShareUpdater.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/TrafficShareUpdater.java @@ -9,6 +9,7 @@ import com.yahoo.vespa.hosted.controller.api.integration.configserver.NodeReposi import com.yahoo.vespa.hosted.controller.application.Deployment; import java.time.Duration; +import java.util.logging.Level; /** * This computes, for every application deployment @@ -36,18 +37,29 @@ public class TrafficShareUpdater extends ControllerMaintainer { @Override protected boolean maintain() { + boolean success = false; + Exception lastException = null; for (var application : applications.asList()) { for (var instance : application.instances().values()) { for (var deployment : instance.deployments().values()) { if ( ! deployment.zone().environment().isProduction()) continue; - updateTrafficFraction(instance, deployment); + + try { + success |= updateTrafficFraction(instance, deployment); + } + catch (Exception e) { + // Some failures due to locked applications are expected and benign + lastException = e; + } } } } - return true; + if ( ! success && lastException != null) // log on complete failure + log.log(Level.WARNING, "Could not update traffic share on any applications", lastException); + return success; } - private void updateTrafficFraction(Instance instance, Deployment deployment) { + private boolean updateTrafficFraction(Instance instance, Deployment deployment) { double qpsInZone = deployment.metrics().queriesPerSecond(); double totalQps = instance.deployments().values().stream() .filter(i -> i.zone().environment().isProduction()) @@ -61,6 +73,7 @@ public class TrafficShareUpdater extends ControllerMaintainer { maxReadShare = currentReadShare; // distribution can be incorrect nodeRepository.patchApplication(deployment.zone(), instance.id(), currentReadShare, maxReadShare); + return true; } } diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/TrafficShareUpdaterTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/TrafficShareUpdaterTest.java index 2674e155b98..2f24d3e6eee 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/TrafficShareUpdaterTest.java +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/TrafficShareUpdaterTest.java @@ -15,6 +15,7 @@ import org.junit.Test; import java.time.Duration; import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; /** * Tests the traffic fraction updater. This also tests its dependency on DeploymentMetricsMaintainer. @@ -37,7 +38,7 @@ public class TrafficShareUpdaterTest { // Single zone setQpsMetric(50.0, application.application().id().defaultInstance(), prod1, tester); deploymentMetricsMaintainer.maintain(); - updater.maintain(); + assertTrue(updater.maintain()); assertTrafficFraction(1.0, 1.0, application.instanceId(), prod1, tester); // Two zones @@ -46,14 +47,14 @@ public class TrafficShareUpdaterTest { setQpsMetric(50.0, application.application().id().defaultInstance(), prod1, tester); setQpsMetric(0.0, application.application().id().defaultInstance(), prod2, tester); deploymentMetricsMaintainer.maintain(); - updater.maintain(); + assertTrue(updater.maintain()); assertTrafficFraction(1.0, 1.0, application.instanceId(), prod1, tester); assertTrafficFraction(0.0, 1.0, application.instanceId(), prod2, tester); // - both hot setQpsMetric(53.0, application.application().id().defaultInstance(), prod1, tester); setQpsMetric(47.0, application.application().id().defaultInstance(), prod2, tester); deploymentMetricsMaintainer.maintain(); - updater.maintain(); + assertTrue(updater.maintain()); assertTrafficFraction(0.53, 1.0, application.instanceId(), prod1, tester); assertTrafficFraction(0.47, 1.0, application.instanceId(), prod2, tester); @@ -64,7 +65,7 @@ public class TrafficShareUpdaterTest { setQpsMetric(47.0, application.application().id().defaultInstance(), prod2, tester); setQpsMetric(0.0, application.application().id().defaultInstance(), prod3, tester); deploymentMetricsMaintainer.maintain(); - updater.maintain(); + assertTrue(updater.maintain()); assertTrafficFraction(0.53, 0.53, application.instanceId(), prod1, tester); assertTrafficFraction(0.47, 0.50, application.instanceId(), prod2, tester); assertTrafficFraction(0.00, 0.50, application.instanceId(), prod3, tester); @@ -73,7 +74,7 @@ public class TrafficShareUpdaterTest { setQpsMetric(25.0, application.application().id().defaultInstance(), prod2, tester); setQpsMetric(25.0, application.application().id().defaultInstance(), prod3, tester); deploymentMetricsMaintainer.maintain(); - updater.maintain(); + assertTrue(updater.maintain()); assertTrafficFraction(0.50, 0.5, application.instanceId(), prod1, tester); assertTrafficFraction(0.25, 0.5, application.instanceId(), prod2, tester); assertTrafficFraction(0.25, 0.5, application.instanceId(), prod3, tester); |