diff options
author | Jon Bratseth <bratseth@gmail.com> | 2023-02-07 11:30:35 +0100 |
---|---|---|
committer | Jon Bratseth <bratseth@gmail.com> | 2023-02-07 11:45:12 +0100 |
commit | 4e77b912b2c85ef9a8d8fc3ba7e849699ce3a801 (patch) | |
tree | 4c1288f029eb8ef0d84a3b1a8b62d8c3ffae9904 /controller-server | |
parent | 7499956ca4dcc7c7ae6a003a73e921b4b7b4fae1 (diff) |
Support configuring BCP structure
Diffstat (limited to 'controller-server')
3 files changed, 263 insertions, 53 deletions
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/TrafficShareUpdater.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/TrafficShareUpdater.java index 832dbb6b921..aea01ae36d3 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/TrafficShareUpdater.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/TrafficShareUpdater.java @@ -1,6 +1,10 @@ // Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.vespa.hosted.controller.maintenance; +import com.yahoo.config.application.api.Bcp; +import com.yahoo.config.application.api.DeploymentSpec; +import com.yahoo.config.provision.InstanceName; +import com.yahoo.config.provision.RegionName; import com.yahoo.vespa.hosted.controller.ApplicationController; import com.yahoo.vespa.hosted.controller.Controller; import com.yahoo.vespa.hosted.controller.Instance; @@ -8,7 +12,11 @@ import com.yahoo.vespa.hosted.controller.api.integration.configserver.NodeReposi import com.yahoo.vespa.hosted.controller.application.Deployment; import java.time.Duration; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; import java.util.logging.Level; +import java.util.stream.Collectors; /** * This computes, for every application deployment @@ -41,12 +49,11 @@ public class TrafficShareUpdater extends ControllerMaintainer { int failures = 0; for (var application : applications.asList()) { for (var instance : application.instances().values()) { - for (var deployment : instance.deployments().values()) { - if ( ! deployment.zone().environment().isProduction()) continue; + for (var deployment : instance.productionDeployments().values()) { if (shuttingDown()) return 1.0; try { attempts++; - updateTrafficFraction(instance, deployment); + updateTrafficFraction(instance, deployment, application.deploymentSpec()); } catch (Exception e) { // Some failures due to locked applications are expected and benign @@ -62,20 +69,94 @@ public class TrafficShareUpdater extends ControllerMaintainer { return successFactor; } - private void updateTrafficFraction(Instance instance, Deployment deployment) { - double qpsInZone = deployment.metrics().queriesPerSecond(); - double totalQps = instance.deployments().values().stream() - .filter(i -> i.zone().environment().isProduction()) - .mapToDouble(i -> i.metrics().queriesPerSecond()).sum(); - long prodRegions = instance.deployments().values().stream() - .filter(i -> i.zone().environment().isProduction()) - .count(); - double currentReadShare = totalQps == 0 ? 0 : qpsInZone / totalQps; - double maxReadShare = prodRegions < 2 ? 1.0 : 1.0 / ( prodRegions - 1.0); - if (currentReadShare > maxReadShare) // This can happen because the assumption of equal traffic - maxReadShare = currentReadShare; // distribution can be incorrect + private void updateTrafficFraction(Instance instance, Deployment deployment, DeploymentSpec deploymentSpec) { + // maxReadShare / currentReadShare = how much additional traffic must the zone be able to handle + double currentReadShare = 0; // How much of the total traffic of the group(s) this is a member of does this deployment receive + double maxReadShare = 0; // How much of the total traffic of the group(s) this is a member of might this deployment receive if a member of the group fails + for (BcpGroup group : BcpGroup.groupsFrom(instance, deploymentSpec)) { + if ( ! group.contains(deployment.zone().region())) continue; + double deploymentQps = deployment.metrics().queriesPerSecond(); + double groupQps = group.totalQps(); + double fraction = group.fraction(deployment.zone().region()); + currentReadShare += groupQps == 0 ? 0 : fraction * deploymentQps / groupQps; + maxReadShare += group.size() == 1 + ? currentReadShare + : fraction * ( deploymentQps + group.maxQpsExcluding(deployment.zone().region()) / (group.size() - 1) ) / groupQps; + } nodeRepository.patchApplication(deployment.zone(), instance.id(), currentReadShare, maxReadShare); } + /** + * A set of regions which will take over traffic from each other if one of them fails. + * Each region will take an equal share (modulated by fraction) of the failing region's traffic. + * + * A regions membership in a group may be partial, represented by a fraction [0, 1], + * in which case the other regions will collectively only take that fraction of the failing regions traffic, + * and symmetrically, the region will only take its fraction of its share of traffic of any other failing region. + */ + private static class BcpGroup { + + /** The instance which has this group. */ + private final Instance instance; + + /** Regions in this group, with their fractions. */ + private final Map<RegionName, Double> regions; + + /** Creates a group of a subset of the deployments in this instance. */ + private BcpGroup(Instance instance, Map<RegionName, Double> regions) { + this.instance = instance; + this.regions = regions; + } + + /** Returns the sum of the fractional memberships of this. */ + double size() { + return regions.values().stream().mapToDouble(f -> f).sum(); + } + + double fraction(RegionName region) { + return regions.getOrDefault(region, 0.0); + } + + boolean contains(RegionName region) { + return regions.containsKey(region); + } + + double totalQps() { + return instance.productionDeployments().values().stream() + .mapToDouble(i -> i.metrics().queriesPerSecond()).sum(); + } + + double maxQpsExcluding(RegionName region) { + return instance.productionDeployments().values().stream() + .filter(d -> ! d.zone().region().equals(region)) + .mapToDouble(d -> d.metrics().queriesPerSecond() * fraction(d.zone().region())) + .max() + .orElse(0); + } + private static Bcp bcpOf(InstanceName instanceName, DeploymentSpec deploymentSpec) { + var instanceSpec = deploymentSpec.instance(instanceName); + if (instanceSpec.isEmpty()) return deploymentSpec.bcp(); + return instanceSpec.get().bcp().orElse(deploymentSpec.bcp()); + } + + private static Map<RegionName, Double> regionsFrom(Instance instance) { + return instance.productionDeployments().values().stream() + .collect(Collectors.toMap(deployment -> deployment.zone().region(), __ -> 1.0)); + } + + private static Map<RegionName, Double> regionsFrom(Bcp.Group groupSpec) { + return groupSpec.members().stream() + .collect(Collectors.toMap(member -> member.region(), member -> member.fraction())); + } + + static List<BcpGroup> groupsFrom(Instance instance, DeploymentSpec deploymentSpec) { + Bcp bcp = bcpOf(instance.name(), deploymentSpec); + if (bcp.isEmpty()) + return List.of(new BcpGroup(instance, regionsFrom(instance))); + return bcp.groups().stream().map(groupSpec -> new BcpGroup(instance, regionsFrom(groupSpec))).toList(); + } + + } + } diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentContext.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentContext.java index 14835a822e6..b712746b663 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentContext.java +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentContext.java @@ -15,6 +15,7 @@ import com.yahoo.security.KeyUtils; import com.yahoo.security.SignatureAlgorithm; import com.yahoo.security.X509CertificateBuilder; import com.yahoo.vespa.hosted.controller.Application; +import com.yahoo.vespa.hosted.controller.Controller; import com.yahoo.vespa.hosted.controller.ControllerTester; import com.yahoo.vespa.hosted.controller.Instance; import com.yahoo.vespa.hosted.controller.api.identifiers.DeploymentId; @@ -52,6 +53,7 @@ import java.util.List; import java.util.Map; import java.util.Map.Entry; import java.util.Optional; +import java.util.OptionalLong; import java.util.Set; import java.util.concurrent.atomic.AtomicLong; import java.util.function.Supplier; @@ -196,8 +198,9 @@ public class DeploymentContext { Application application = application(); assertTrue(application.revisions().last().isPresent(), "Application package submitted"); assertFalse(application.instances().values().stream() - .anyMatch(instance -> instance.deployments().values().stream() - .anyMatch(deployment -> deployment.revision().equals(lastSubmission))), "Submission is not already deployed"); + .anyMatch(instance -> instance.deployments().values().stream() + .anyMatch(deployment -> deployment.revision().equals(lastSubmission))), + "Submission is not already deployed"); completeRollout(application.deploymentSpec().instances().size() > 1); for (var instance : application().instances().values()) { assertFalse(instance.change().hasTargets()); diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/TrafficShareUpdaterTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/TrafficShareUpdaterTest.java index fad85ef9b48..5c26e270846 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/TrafficShareUpdaterTest.java +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/TrafficShareUpdaterTest.java @@ -2,8 +2,10 @@ package com.yahoo.vespa.hosted.controller.maintenance; import com.yahoo.component.Version; +import com.yahoo.config.application.api.xml.DeploymentSpecXmlReader; import com.yahoo.config.provision.ApplicationId; import com.yahoo.config.provision.zone.ZoneId; +import com.yahoo.vespa.hosted.controller.Application; import com.yahoo.vespa.hosted.controller.api.application.v4.model.ClusterMetrics; import com.yahoo.vespa.hosted.controller.api.identifiers.DeploymentId; import com.yahoo.vespa.hosted.controller.application.pkg.ApplicationPackage; @@ -14,6 +16,7 @@ import org.junit.jupiter.api.Test; import java.time.Duration; import java.util.Map; +import java.util.OptionalLong; import static org.junit.jupiter.api.Assertions.assertEquals; @@ -25,61 +28,184 @@ import static org.junit.jupiter.api.Assertions.assertEquals; public class TrafficShareUpdaterTest { @Test - void testTrafficUpdater() { + void testTrafficUpdaterImplicitBcp() { DeploymentTester tester = new DeploymentTester(); Version version = Version.fromString("7.1"); - tester.controllerTester().upgradeSystem(version); - var application = tester.newDeploymentContext(); + tester.controllerTester().upgradeSystem(Version.fromString("7.1")); + var context = tester.newDeploymentContext(); var deploymentMetricsMaintainer = new DeploymentMetricsMaintainer(tester.controller(), Duration.ofDays(1)); var updater = new TrafficShareUpdater(tester.controller(), Duration.ofDays(1)); ZoneId prod1 = ZoneId.from("prod", "ap-northeast-1"); ZoneId prod2 = ZoneId.from("prod", "us-east-3"); ZoneId prod3 = ZoneId.from("prod", "us-west-1"); - application.runJob(DeploymentContext.productionApNortheast1, new ApplicationPackage(new byte[0]), version); + context.runJob(DeploymentContext.perfUsEast3, new ApplicationPackage(new byte[0]), version); // Ignored + context.runJob(DeploymentContext.productionApNortheast1, new ApplicationPackage(new byte[0]), version); - // Single zone - setQpsMetric(50.0, application.application().id().defaultInstance(), prod1, tester); + // One zone + context.runJob(DeploymentContext.productionApNortheast1, new ApplicationPackage(new byte[0]), version); + setQpsMetric(50.0, context.application().id().defaultInstance(), prod1, tester); deploymentMetricsMaintainer.maintain(); assertEquals(1.0, updater.maintain(), 0.0000001); - assertTrafficFraction(1.0, 1.0, application.instanceId(), prod1, tester); + assertTrafficFraction(1.0, 1.0, context.instanceId(), prod1, tester); // Two zones - application.runJob(DeploymentContext.productionUsEast3, new ApplicationPackage(new byte[0]), version); - // - one cold - setQpsMetric(50.0, application.application().id().defaultInstance(), prod1, tester); - setQpsMetric(0.0, application.application().id().defaultInstance(), prod2, tester); + context.runJob(DeploymentContext.productionUsEast3, new ApplicationPackage(new byte[0]), version); + setQpsMetric(60.0, context.application().id().defaultInstance(), prod1, tester); + setQpsMetric(20.0, context.application().id().defaultInstance(), prod2, tester); deploymentMetricsMaintainer.maintain(); assertEquals(1.0, updater.maintain(), 0.0000001); - assertTrafficFraction(1.0, 1.0, application.instanceId(), prod1, tester); - assertTrafficFraction(0.0, 1.0, application.instanceId(), prod2, tester); - // - both hot - setQpsMetric(53.0, application.application().id().defaultInstance(), prod1, tester); - setQpsMetric(47.0, application.application().id().defaultInstance(), prod2, tester); + assertTrafficFraction(0.75, 1.0, context.instanceId(), prod1, tester); + assertTrafficFraction(0.25, 1.0, context.instanceId(), prod2, tester); + + // Three zones + context.runJob(DeploymentContext.productionUsWest1, new ApplicationPackage(new byte[0]), version); + setQpsMetric(53.0, context.application().id().defaultInstance(), prod1, tester); + setQpsMetric(45.0, context.application().id().defaultInstance(), prod2, tester); + setQpsMetric(02.0, context.application().id().defaultInstance(), prod3, tester); deploymentMetricsMaintainer.maintain(); assertEquals(1.0, updater.maintain(), 0.0000001); - assertTrafficFraction(0.53, 1.0, application.instanceId(), prod1, tester); - assertTrafficFraction(0.47, 1.0, application.instanceId(), prod2, tester); + assertTrafficFraction(0.53, 0.53 + (double)45/2 / 100, context.instanceId(), prod1, tester); + assertTrafficFraction(0.45, 0.45 + (double)53/2 / 100, context.instanceId(), prod2, tester); + assertTrafficFraction(0.02, 0.02 + (double)53/2 / 100, context.instanceId(), prod3, tester); + } + + @Test + void testTrafficUpdaterHotCold() { + var spec = """ + <deployment version="1.0"> + <staging/> + <prod> + <region>ap-northeast-1</region> + <region>ap-southeast-1</region> + <region>us-east-3</region> + <region>us-central-1</region> + <region>eu-west-1</region> + </prod> + <bcp> + <group> + <region>ap-northeast-1</region> + <region>ap-southeast-1</region> + </group> + <group> + <region>us-east-3</region> + <region>us-central-1</region> + </group> + <group> + <region>eu-west-1</region> + </group> + </bcp> + </deployment> + """; + + DeploymentTester tester = new DeploymentTester(); + Version version = Version.fromString("7.1"); + tester.controllerTester().upgradeSystem(Version.fromString("7.1")); + var context = tester.newDeploymentContext(); + var deploymentSpec = new DeploymentSpecXmlReader(true).read(spec); + tester.controller().applications() + .lockApplicationOrThrow(context.application().id(), + locked -> tester.controller().applications().store(locked.with(deploymentSpec))); + + var deploymentMetricsMaintainer = new DeploymentMetricsMaintainer(tester.controller(), Duration.ofDays(1)); + var updater = new TrafficShareUpdater(tester.controller(), Duration.ofDays(1)); + + ZoneId ap1 = ZoneId.from("prod", "ap-northeast-1"); + ZoneId ap2 = ZoneId.from("prod", "ap-southeast-1"); + ZoneId us1 = ZoneId.from("prod", "us-east-3"); + ZoneId us2 = ZoneId.from("prod", "us-central-1"); + ZoneId eu1 = ZoneId.from("prod", "eu-west-1"); + + context.runJob(DeploymentContext.productionApNortheast1, new ApplicationPackage(new byte[0]), version); + context.runJob(DeploymentContext.productionApSoutheast1, new ApplicationPackage(new byte[0]), version); + context.runJob(DeploymentContext.productionUsEast3, new ApplicationPackage(new byte[0]), version); + context.runJob(DeploymentContext.productionUsCentral1, new ApplicationPackage(new byte[0]), version); + context.runJob(DeploymentContext.productionEuWest1, new ApplicationPackage(new byte[0]), version); + + setQpsMetric(50.0, context.application().id().defaultInstance(), ap1, tester); + setQpsMetric(00.0, context.application().id().defaultInstance(), ap2, tester); + setQpsMetric(10.0, context.application().id().defaultInstance(), us1, tester); + setQpsMetric(00.0, context.application().id().defaultInstance(), us2, tester); + setQpsMetric(40.0, context.application().id().defaultInstance(), eu1, tester); - // Three zones - application.runJob(DeploymentContext.productionUsWest1, new ApplicationPackage(new byte[0]), version); - // - one cold - setQpsMetric(53.0, application.application().id().defaultInstance(), prod1, tester); - setQpsMetric(47.0, application.application().id().defaultInstance(), prod2, tester); - setQpsMetric(0.0, application.application().id().defaultInstance(), prod3, tester); deploymentMetricsMaintainer.maintain(); assertEquals(1.0, updater.maintain(), 0.0000001); - assertTrafficFraction(0.53, 0.53, application.instanceId(), prod1, tester); - assertTrafficFraction(0.47, 0.50, application.instanceId(), prod2, tester); - assertTrafficFraction(0.00, 0.50, application.instanceId(), prod3, tester); - // - all hot - setQpsMetric(50.0, application.application().id().defaultInstance(), prod1, tester); - setQpsMetric(25.0, application.application().id().defaultInstance(), prod2, tester); - setQpsMetric(25.0, application.application().id().defaultInstance(), prod3, tester); + assertTrafficFraction(0.5, 0.5, context.instanceId(), ap1, tester); + assertTrafficFraction(0.0, 0.5, context.instanceId(), ap2, tester); + assertTrafficFraction(0.1, 0.1, context.instanceId(), us1, tester); + assertTrafficFraction(0.0, 0.1, context.instanceId(), us2, tester); + assertTrafficFraction(0.4, 0.4, context.instanceId(), eu1, tester); + } + + @Test + void testTrafficUpdaterOverlappingGroups() { + var spec = """ + <deployment version="1.0"> + <staging/> + <prod> + <region>ap-northeast-1</region> + <region>ap-southeast-1</region> + <region>us-east-3</region> + <region>us-central-1</region> + <region>us-west-1</region> + <region>eu-west-1</region> + </prod> + <bcp> + <group> + <region>ap-northeast-1</region> + <region>ap-southeast-1</region> + <region fraction="0.5">eu-west-1</region> + </group> + <group> + <region>us-east-3</region> + <region>us-central-1</region> + <region>us-west-1</region> + <region fraction="0.5">eu-west-1</region> + </group> + </bcp> + </deployment> + """; + + DeploymentTester tester = new DeploymentTester(); + Version version = Version.fromString("7.1"); + tester.controllerTester().upgradeSystem(Version.fromString("7.1")); + var context = tester.newDeploymentContext(); + var deploymentSpec = new DeploymentSpecXmlReader(true).read(spec); + tester.controller().applications() + .lockApplicationOrThrow(context.application().id(), + locked -> tester.controller().applications().store(locked.with(deploymentSpec))); + + var deploymentMetricsMaintainer = new DeploymentMetricsMaintainer(tester.controller(), Duration.ofDays(1)); + var updater = new TrafficShareUpdater(tester.controller(), Duration.ofDays(1)); + + ZoneId ap1 = ZoneId.from("prod", "ap-northeast-1"); + ZoneId ap2 = ZoneId.from("prod", "ap-southeast-1"); + ZoneId us1 = ZoneId.from("prod", "us-east-3"); + ZoneId us2 = ZoneId.from("prod", "us-central-1"); + ZoneId us3 = ZoneId.from("prod", "us-west-1"); + ZoneId eu1 = ZoneId.from("prod", "eu-west-1"); + + context.runJob(DeploymentContext.productionApNortheast1, new ApplicationPackage(new byte[0]), version); + context.runJob(DeploymentContext.productionApSoutheast1, new ApplicationPackage(new byte[0]), version); + context.runJob(DeploymentContext.productionUsEast3, new ApplicationPackage(new byte[0]), version); + context.runJob(DeploymentContext.productionUsCentral1, new ApplicationPackage(new byte[0]), version); + context.runJob(DeploymentContext.productionUsWest1, new ApplicationPackage(new byte[0]), version); + context.runJob(DeploymentContext.productionEuWest1, new ApplicationPackage(new byte[0]), version); + + setQpsMetric(20.0, context.application().id().defaultInstance(), ap1, tester); + setQpsMetric(50.0, context.application().id().defaultInstance(), ap2, tester); + setQpsMetric(00.0, context.application().id().defaultInstance(), us1, tester); + setQpsMetric(30.0, context.application().id().defaultInstance(), us2, tester); + setQpsMetric(40.0, context.application().id().defaultInstance(), us3, tester); + setQpsMetric(60.0, context.application().id().defaultInstance(), eu1, tester); + deploymentMetricsMaintainer.maintain(); assertEquals(1.0, updater.maintain(), 0.0000001); - assertTrafficFraction(0.50, 0.5, application.instanceId(), prod1, tester); - assertTrafficFraction(0.25, 0.5, application.instanceId(), prod2, tester); - assertTrafficFraction(0.25, 0.5, application.instanceId(), prod3, tester); + assertTrafficFraction(0.10, 0.10 + 50 / 200.0 / 1.5, context.instanceId(), ap1, tester); + assertTrafficFraction(0.25, 0.25 + 30 / 200.0 / 1.5, context.instanceId(), ap2, tester); + assertTrafficFraction(0.00, 0.00 + 40 / 200.0 / 2.5, context.instanceId(), us1, tester); + assertTrafficFraction(0.15, 0.15 + 40 / 200.0 / 2.5, context.instanceId(), us2, tester); + assertTrafficFraction(0.20, 0.20 + 30 / 200.0 / 2.5, context.instanceId(), us3, tester); + assertTrafficFraction(0.30, 0.30 + 0.5 * 50 / 200.0 / 1.5 + 0.5 * 40 / 200.0 / 2.5, context.instanceId(), eu1, tester); } private void setQpsMetric(double qps, ApplicationId application, ZoneId zone, DeploymentTester tester) { @@ -90,8 +216,8 @@ public class TrafficShareUpdaterTest { private void assertTrafficFraction(double currentReadShare, double maxReadShare, ApplicationId application, ZoneId zone, DeploymentTester tester) { NodeRepositoryMock mock = (NodeRepositoryMock)tester.controller().serviceRegistry().configServer().nodeRepository(); - assertEquals(currentReadShare, mock.getTrafficFraction(application, zone).getFirst(), 0.00001); - assertEquals(maxReadShare, mock.getTrafficFraction(application, zone).getSecond(), 0.00001); + assertEquals(currentReadShare, mock.getTrafficFraction(application, zone).getFirst(), 0.00001, "Current read share"); + assertEquals(maxReadShare, mock.getTrafficFraction(application, zone).getSecond(), 0.00001, "Max read share"); } } |