diff options
10 files changed, 753 insertions, 77 deletions
diff --git a/config-model-api/src/main/java/com/yahoo/config/application/api/Bcp.java b/config-model-api/src/main/java/com/yahoo/config/application/api/Bcp.java new file mode 100644 index 00000000000..af369dc2672 --- /dev/null +++ b/config-model-api/src/main/java/com/yahoo/config/application/api/Bcp.java @@ -0,0 +1,122 @@ +package com.yahoo.config.application.api; + +import com.yahoo.config.provision.RegionName; + +import java.time.Duration; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.stream.Collectors; + +/** + * Defines the BCP structure for an instance in a deployment spec: + * A list of region groups where each group contains a set of regions + * which will handle the traffic of a member in the group when it becomes unreachable. + * + * This is used to make bcp-aware autoscaling decisions. If no explicit BCP spec + * is provided, it is assumed that a regions traffic will be divided equally over all + * the other regions when it becomes unreachable - i.e a single BCP group is implicitly + * defined having all defined production regions as members with fraction 1.0. + * + * It is assumed that the traffic of the unreachable region is distributed + * evenly to the other members of the group. + * + * A region can be a fractional member of a group, in which case it is assumed that + * region will only handle that fraction of its share of the unreachable regions traffic, + * and symmetrically that the other members of the group will only handle that fraction + * of the fraction regions traffic if it becomes unreachable. + * + * Each production region defined in the instance must have fractional memberships in groups that sums to exactly one. + * + * If a group has one member it will not set aside any capacity for BCP. + * If a group has more than two members, the system will attempt to provision capacity + * for BCP also when a region is unreachable. That is, if there are three member regions, A, B and C, + * each handling 100 qps, then they each aim to handle 150 in case one goes down. If C goes down, + * A and B will now handle 150 each, but will each aim to handle 300 each in case the other goes down. + * + * @author bratseth + */ +public class Bcp { + + private static final Bcp empty = new Bcp(List.of()); + + private final List<Group> groups; + + public Bcp(List<Group> groups) { + totalMembershipSumsToOne(groups); + this.groups = List.copyOf(groups); + } + + public List<Group> groups() { return groups; } + + /** Returns the set of regions declared in the groups of this. */ + public Set<RegionName> regions() { + return groups.stream().flatMap(group -> group.members().stream()).map(member -> member.region()).collect(Collectors.toSet()); + } + + public boolean isEmpty() { return groups.isEmpty(); } + + /** Returns this bcp spec, or if it is empty, the given bcp spec. */ + public Bcp orElse(Bcp other) { + return this.isEmpty() ? other : this; + } + + private void totalMembershipSumsToOne(List<Group> groups) { + Map<RegionName, Double> totalMembership = new HashMap<>(); + for (var group : groups) { + for (var member : group.members()) + totalMembership.compute(member.region(), (__, fraction) -> fraction == null ? member.fraction() + : fraction + member.fraction()); + } + for (var entry : totalMembership.entrySet()) { + if (entry.getValue() != 1.0) + throw new IllegalArgumentException("Illegal BCP spec: All regions must have total membership fractions summing to 1.0, but " + + entry.getKey() + " sums to " + entry.getValue()); + } + } + + public static Bcp empty() { return empty; } + + @Override + public String toString() { + if (isEmpty()) return "empty BCP"; + return "BCP of " + groups; + } + + public static class Group { + + private final Duration deadline; + private final List<RegionMember> members; + + public Group(List<RegionMember> members, Duration deadline) { + this.members = List.copyOf(members); + this.deadline = deadline; + } + + public List<RegionMember> members() { return members; } + + /** + * Returns the max time until the other regions must be able to handle the additional traffic + * when a region becomes unreachable, which by default is Duration.ZERO. + */ + public Duration deadline() { return deadline; } + + @Override + public String toString() { + return "BCP group of " + members; + } + + } + + public record RegionMember(RegionName region, double fraction) { + + public RegionMember { + if (fraction < 0 || fraction > 1) + throw new IllegalArgumentException("Fraction must be a number between 0.0 and 1.0, but got " + fraction); + } + + + } + +} diff --git a/config-model-api/src/main/java/com/yahoo/config/application/api/DeploymentInstanceSpec.java b/config-model-api/src/main/java/com/yahoo/config/application/api/DeploymentInstanceSpec.java index b36c1409459..4b30734365d 100644 --- a/config-model-api/src/main/java/com/yahoo/config/application/api/DeploymentInstanceSpec.java +++ b/config-model-api/src/main/java/com/yahoo/config/application/api/DeploymentInstanceSpec.java @@ -61,6 +61,7 @@ public class DeploymentInstanceSpec extends DeploymentSpec.Steps { private final Notifications notifications; private final List<Endpoint> endpoints; private final Map<ClusterSpec.Id, Map<ZoneId, ZoneEndpoint>> zoneEndpoints; + private final Bcp bcp; public DeploymentInstanceSpec(InstanceName name, Tags tags, @@ -77,6 +78,7 @@ public class DeploymentInstanceSpec extends DeploymentSpec.Steps { Notifications notifications, List<Endpoint> endpoints, Map<ClusterSpec.Id, Map<ZoneId, ZoneEndpoint>> zoneEndpoints, + Bcp bcp, Instant now) { super(steps); this.name = Objects.requireNonNull(name); @@ -101,8 +103,9 @@ public class DeploymentInstanceSpec extends DeploymentSpec.Steps { Map<ClusterSpec.Id, Map<ZoneId, ZoneEndpoint>> zoneEndpointsCopy = new HashMap<>(); for (var entry : zoneEndpoints.entrySet()) zoneEndpointsCopy.put(entry.getKey(), Collections.unmodifiableMap(new HashMap<>(entry.getValue()))); this.zoneEndpoints = Collections.unmodifiableMap(zoneEndpointsCopy); + this.bcp = Objects.requireNonNull(bcp); validateZones(new HashSet<>(), new HashSet<>(), this); - validateEndpoints(steps(), globalServiceId, this.endpoints); + validateEndpoints(globalServiceId, this.endpoints); validateChangeBlockers(changeBlockers, now); } @@ -144,25 +147,41 @@ public class DeploymentInstanceSpec extends DeploymentSpec.Steps { } /** Throw an IllegalArgumentException if an endpoint refers to a region that is not declared in 'prod' */ - private void validateEndpoints(List<DeploymentSpec.Step> steps, Optional<String> globalServiceId, List<Endpoint> endpoints) { + private void validateEndpoints(Optional<String> globalServiceId, List<Endpoint> endpoints) { if (globalServiceId.isPresent() && ! endpoints.isEmpty()) { throw new IllegalArgumentException("Providing both 'endpoints' and 'global-service-id'. Use only 'endpoints'."); } - var stepZones = steps.stream() - .flatMap(s -> s.zones().stream()) - .flatMap(z -> z.region().stream()) - .collect(Collectors.toSet()); - + var regions = prodRegions(); for (var endpoint : endpoints){ for (var endpointRegion : endpoint.regions()) { - if (! stepZones.contains(endpointRegion)) { + if (! regions.contains(endpointRegion)) { throw new IllegalArgumentException("Region used in endpoint that is not declared in 'prod': " + endpointRegion); } } } } + /** Validates the given BCP instance (which is owned by this, or if none, a default) against this instance. */ + void validateBcp(Bcp bcp) { + if (bcp.isEmpty()) return; + if ( ! prodRegions().equals(bcp.regions())) + throw new IllegalArgumentException("BCP and deployment mismatch in " + this + ": " + + "A <bcp> element must place all deployed production regions in " + + "at least one group, and declare no extra regions. " + + "Deployed regions: " + prodRegions() + + ". BCP regions: " + bcp.regions()); +} + /** Returns the production regions the steps of this specifies a deployment to. */ + private Set<RegionName> prodRegions() { + return steps().stream() + .flatMap(s -> s.zones().stream()) + .filter(zone -> zone.environment().isProduction()) + .flatMap(z -> z.region().stream()) + .collect(Collectors.toSet()); + } + + private void validateChangeBlockers(List<DeploymentSpec.ChangeBlocker> changeBlockers, Instant now) { // Find all possible dates an upgrade block window can start Stream<Instant> blockingFrom = changeBlockers.stream() @@ -256,6 +275,9 @@ public class DeploymentInstanceSpec extends DeploymentSpec.Steps { /** Returns the rotations configuration of these instances */ public List<Endpoint> endpoints() { return endpoints; } + /** Returns the BCP spec declared in this specified instance, or BcpSpec.empty() if none. */ + public Bcp bcp() { return bcp; } + /** Returns whether this instance deploys to the given zone, either implicitly or explicitly */ public boolean deploysTo(Environment environment, RegionName region) { return zones().stream().anyMatch(zone -> zone.concerns(environment, Optional.of(region))); diff --git a/config-model-api/src/main/java/com/yahoo/config/application/api/DeploymentSpec.java b/config-model-api/src/main/java/com/yahoo/config/application/api/DeploymentSpec.java index cbdb5bd6bcc..0e9841fe5be 100644 --- a/config-model-api/src/main/java/com/yahoo/config/application/api/DeploymentSpec.java +++ b/config-model-api/src/main/java/com/yahoo/config/application/api/DeploymentSpec.java @@ -18,9 +18,7 @@ import java.time.Duration; import java.util.Arrays; import java.util.Collections; import java.util.Comparator; -import java.util.HashMap; import java.util.List; -import java.util.Map; import java.util.Objects; import java.util.Optional; import java.util.stream.Collectors; @@ -47,6 +45,7 @@ public class DeploymentSpec { Optional.empty(), Optional.empty(), List.of(), + Bcp.empty(), "<deployment version='1.0'/>", List.of()); @@ -58,6 +57,7 @@ public class DeploymentSpec { private final Optional<AthenzService> athenzService; private final Optional<CloudAccount> cloudAccount; private final List<Endpoint> endpoints; + private final Bcp bcp; private final List<DeprecatedElement> deprecatedElements; private final String xmlForm; @@ -68,6 +68,7 @@ public class DeploymentSpec { Optional<AthenzService> athenzService, Optional<CloudAccount> cloudAccount, List<Endpoint> endpoints, + Bcp bcp, String xmlForm, List<DeprecatedElement> deprecatedElements) { this.steps = List.copyOf(Objects.requireNonNull(steps)); @@ -77,11 +78,13 @@ public class DeploymentSpec { this.cloudAccount = Objects.requireNonNull(cloudAccount); this.xmlForm = Objects.requireNonNull(xmlForm); this.endpoints = List.copyOf(Objects.requireNonNull(endpoints)); + this.bcp = Objects.requireNonNull(bcp); this.deprecatedElements = List.copyOf(Objects.requireNonNull(deprecatedElements)); validateTotalDelay(steps); validateUpgradePoliciesOfIncreasingConservativeness(steps); validateAthenz(); validateApplicationEndpoints(); + validateBcp(); } /** Throw an IllegalArgumentException if the total delay exceeds 24 hours */ @@ -158,13 +161,16 @@ public class DeploymentSpec { } } + private void validateBcp() { + for (var instance : instances()) + instance.validateBcp(instance.bcp().orElse(bcp())); + } + /** Returns the major version this application is pinned to, or empty (default) to allow all major versions */ public Optional<Integer> majorVersion() { return majorVersion; } /** Returns the deployment steps of this in the order they will be performed */ - public List<Step> steps() { - return steps; - } + public List<Step> steps() { return steps; } /** Returns the Athenz domain set on the root tag, if any */ public Optional<AthenzDomain> athenzDomain() { return athenzDomain; } @@ -203,6 +209,9 @@ public class DeploymentSpec { .orElse(ZoneEndpoint.defaultEndpoint); } + /** Returns the default BCP spec for instances, or Bcp.empty() if none are defined. */ + public Bcp bcp() { return bcp; } + /** Returns the XML form of this spec, or null if it was not created by fromXml, nor is empty */ public String xmlForm() { return xmlForm; } diff --git a/config-model-api/src/main/java/com/yahoo/config/application/api/xml/DeploymentSpecXmlReader.java b/config-model-api/src/main/java/com/yahoo/config/application/api/xml/DeploymentSpecXmlReader.java index fb6d834f783..be6be5566a8 100644 --- a/config-model-api/src/main/java/com/yahoo/config/application/api/xml/DeploymentSpecXmlReader.java +++ b/config-model-api/src/main/java/com/yahoo/config/application/api/xml/DeploymentSpecXmlReader.java @@ -1,6 +1,7 @@ // Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.config.application.api.xml; +import com.yahoo.config.application.api.Bcp; import com.yahoo.config.application.api.DeploymentInstanceSpec; import com.yahoo.config.application.api.DeploymentSpec; import com.yahoo.config.application.api.DeploymentSpec.DeclaredTest; @@ -46,7 +47,6 @@ import java.time.Instant; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; -import java.util.Comparator; import java.util.HashMap; import java.util.LinkedHashMap; import java.util.LinkedHashSet; @@ -54,6 +54,7 @@ import java.util.List; import java.util.Map; import java.util.Objects; import java.util.Optional; +import java.util.OptionalDouble; import java.util.Set; import java.util.function.Function; import java.util.stream.Stream; @@ -161,6 +162,7 @@ public class DeploymentSpecXmlReader { stringAttribute(athenzServiceAttribute, root).map(AthenzService::from), stringAttribute(cloudAccountAttribute, root).map(CloudAccount::from), applicationEndpoints, + readBcp(root), xmlForm, deprecatedElements); } @@ -228,6 +230,7 @@ public class DeploymentSpecXmlReader { notifications, endpoints, zoneEndpoints, + readBcp(instanceElement), now)) .toList(); } @@ -455,6 +458,24 @@ public class DeploymentSpecXmlReader { validateAndConsolidate(endpointsByZone, zoneEndpoints); } + static Bcp readBcp(Element element) { + Element bcpElement = XML.getChild(element, "bcp"); + if (bcpElement == null) return Bcp.empty(); + + List<Bcp.Group> groups = new ArrayList<>(); + for (Element groupElement : XML.getChildren(bcpElement, "group")) { + List<Bcp.RegionMember> regions = new ArrayList<>(); + for (Element regionElement : XML.getChildren(groupElement, "region")) { + RegionName region = RegionName.from(XML.getValue(regionElement)); + double fraction = toDouble(XML.attribute("fraction", regionElement).orElse(null), "fraction").orElse(1.0); + regions.add(new Bcp.RegionMember(region, fraction)); + } + Duration deadline = XML.attribute("deadline", groupElement).map(value -> toDuration(value, "deadline")).orElse(Duration.ZERO); + groups.add(new Bcp.Group(regions, deadline)); + } + return new Bcp(groups); + } + static void validateAndConsolidate(Map<String, Map<RegionName, List<ZoneEndpoint>>> in, Map<ClusterSpec.Id, Map<ZoneId, ZoneEndpoint>> out) { in.forEach((cluster, regions) -> { List<ZoneEndpoint> wildcards = regions.remove(null); @@ -713,6 +734,39 @@ public class DeploymentSpecXmlReader { .findFirst(); } + /** + * Returns a string consisting of a number followed by "m" or "M" to a duration of that number of minutes, + * or zero duration if null of blank. + */ + private static Duration toDuration(String minutesSpec, String sourceDescription) { + try { + if (minutesSpec == null || minutesSpec.isBlank()) return Duration.ZERO; + minutesSpec = minutesSpec.trim().toLowerCase(); + if ( ! minutesSpec.endsWith("m")) + throw new IllegalArgumentException("Must end by 'm'"); + try { + return Duration.ofMinutes(Integer.parseInt(minutesSpec.substring(0, minutesSpec.length() - 1))); + } + catch (NumberFormatException e) { + throw new IllegalArgumentException("Must be an integer number of minutes followed by 'm'"); + } + } + catch (IllegalArgumentException e) { + throw new IllegalArgumentException("Illegal " + sourceDescription + " '" + minutesSpec + "'", e); + } + } + + private static OptionalDouble toDouble(String value, String sourceDescription) { + try { + if (value == null || value.isBlank()) return OptionalDouble.empty(); + return OptionalDouble.of(Double.parseDouble(value)); + } + catch (NumberFormatException e) { + throw new IllegalArgumentException("Illegal " + sourceDescription + " '" + value + "': " + + "Must be a number between 0.0 and 1.0"); + } + } + private static void illegal(String message) { throw new IllegalArgumentException(message); } diff --git a/config-model-api/src/test/java/com/yahoo/config/application/api/DeploymentSpecTest.java b/config-model-api/src/test/java/com/yahoo/config/application/api/DeploymentSpecTest.java index 2e746ff55c8..afa7e3e502b 100644 --- a/config-model-api/src/test/java/com/yahoo/config/application/api/DeploymentSpecTest.java +++ b/config-model-api/src/test/java/com/yahoo/config/application/api/DeploymentSpecTest.java @@ -106,16 +106,16 @@ public class DeploymentSpecTest { @Test public void minimalProductionSpec() { - StringReader r = new StringReader( - "<deployment version='1.0'>" + - " <instance id='default'>" + - " <prod>" + - " <region active='false'>us-east1</region>" + - " <region active='true'>us-west1</region>" + - " </prod>" + - " </instance>" + - "</deployment>" - ); + StringReader r = new StringReader( """ + <deployment version='1.0'> + <instance id='default'> + <prod> + <region active='false'>us-east1</region> + <region active='true'>us-west1</region> + </prod> + </instance> + </deployment> + """); DeploymentSpec spec = DeploymentSpec.fromXml(r); assertEquals(1, spec.steps().size()); diff --git a/config-model-api/src/test/java/com/yahoo/config/application/api/DeploymentSpecWithBcpTest.java b/config-model-api/src/test/java/com/yahoo/config/application/api/DeploymentSpecWithBcpTest.java new file mode 100644 index 00000000000..77aadc88be8 --- /dev/null +++ b/config-model-api/src/test/java/com/yahoo/config/application/api/DeploymentSpecWithBcpTest.java @@ -0,0 +1,254 @@ +package com.yahoo.config.application.api; + +import com.yahoo.config.provision.RegionName; +import com.yahoo.yolean.Exceptions; +import org.junit.Test; + +import java.io.StringReader; +import java.time.Duration; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.fail; + +/** + * @author bratseth + */ +public class DeploymentSpecWithBcpTest { + + @Test + public void minimalProductionSpecWithExplicitBcp() { + StringReader r = new StringReader(""" + <deployment version='1.0'> + <instance id='default'> + <prod> + <region>us-east1</region> + <region>us-west1</region> + </prod> + </instance> + <bcp> + <group> + <region>us-east1</region> + <region>us-west1</region> + </group> + </bcp> + </deployment> + """); + assertTwoRegions(DeploymentSpec.fromXml(r)); + } + + @Test + public void specWithoutInstanceWithBcp() { + StringReader r = new StringReader(""" + <deployment version='1.0'> + <prod> + <region>us-east1</region> + <region>us-west1</region> + </prod> + <bcp> + <group> + <region>us-east1</region> + <region>us-west1</region> + </group> + </bcp> + </deployment> + """); + assertTwoRegions(DeploymentSpec.fromXml(r)); + } + + @Test + public void complexBcpSetup() { + StringReader r = new StringReader(""" + <deployment version='1.0'> + <instance id='beta'> + <prod> + <region>us-east1</region> + <region>us-east2</region> + </prod> + <bcp> + <group deadline="60m"> + <region>us-east1</region> + <region>us-east2</region> + </group> + </bcp> + </instance> + <instance id='main'> + <prod> + <region>us-east1</region> + <region>us-east2</region> + <region>us-central1</region> + <region>us-west1</region> + <region>us-west2</region> + <region>eu-east1</region> + <region>eu-west1</region> + </prod> + </instance> + <bcp> + <group> + <region>us-east1</region> + <region>us-east2</region> + <region fraction="0.3">us-central1</region> + </group> + <group> + <region>us-west1</region> + <region>us-west2</region> + <region fraction="0.7">us-central1</region> + </group> + <group deadline="30m"> + <region>eu-east1</region> + <region>eu-west1</region> + </group> + </bcp> + </deployment> + """); + var spec = DeploymentSpec.fromXml(r); + var betaBcp = spec.requireInstance("beta").bcp().orElse(spec.bcp()); + assertEquals(1, betaBcp.groups().size()); + var betaGroup = betaBcp.groups().get(0); + assertEquals(2, betaGroup.members().size()); + assertEquals(Duration.ofMinutes(60), betaGroup.deadline()); + assertEquals(new Bcp.RegionMember(RegionName.from("us-east1"), 1.0), betaGroup.members().get(0)); + assertEquals(new Bcp.RegionMember(RegionName.from("us-east2"), 1.0), betaGroup.members().get(1)); + + var mainBcp = spec.requireInstance("main").bcp().orElse(spec.bcp()); + assertEquals(7, mainBcp.regions().size()); + assertEquals(3, mainBcp.groups().size()); + + var usEast = mainBcp.groups().get(0); + assertEquals(3, usEast.members().size()); + assertEquals(Duration.ofMinutes(0), usEast.deadline()); + assertEquals(new Bcp.RegionMember(RegionName.from("us-east1"), 1.0), usEast.members().get(0)); + assertEquals(new Bcp.RegionMember(RegionName.from("us-east2"), 1.0), usEast.members().get(1)); + assertEquals(new Bcp.RegionMember(RegionName.from("us-central1"), 0.3), usEast.members().get(2)); + + var usWest = mainBcp.groups().get(1); + assertEquals(3, usWest.members().size()); + assertEquals(Duration.ofMinutes(0), usWest.deadline()); + assertEquals(new Bcp.RegionMember(RegionName.from("us-west1"), 1.0), usWest.members().get(0)); + assertEquals(new Bcp.RegionMember(RegionName.from("us-west2"), 1.0), usWest.members().get(1)); + assertEquals(new Bcp.RegionMember(RegionName.from("us-central1"), 0.7), usWest.members().get(2)); + + var eu = mainBcp.groups().get(2); + assertEquals(2, eu.members().size()); + assertEquals(Duration.ofMinutes(30), eu.deadline()); + assertEquals(new Bcp.RegionMember(RegionName.from("eu-east1"), 1.0), eu.members().get(0)); + assertEquals(new Bcp.RegionMember(RegionName.from("eu-west1"), 1.0), eu.members().get(1)); + } + + @Test + public void regionMembershipMatchValidation1() { + try { + StringReader r = new StringReader(""" + <deployment version='1.0'> + <prod> + <region>us-east1</region> + <region>us-west1</region> + </prod> + <bcp> + <group> + <region>us-west1</region> + </group> + </bcp> + </deployment> + """); + DeploymentSpec.fromXml(r); + fail(); + } + catch (IllegalArgumentException e) { + assertEquals("BCP and deployment mismatch in instance 'default': " + + "A <bcp> element must place all deployed production regions in at least one group, " + + "and declare no extra regions. " + + "Deployed regions: [us-east1, us-west1]. BCP regions: [us-west1]", + Exceptions.toMessageString(e)); + } + } + + @Test + public void regionMembershipMatchValidation2() { + try { + StringReader r = new StringReader(""" + <deployment version='1.0'> + <prod> + <region>us-west1</region> + </prod> + <bcp> + <group> + <region>us-east1</region> + <region>us-west1</region> + </group> + </bcp> + </deployment> + """); + DeploymentSpec.fromXml(r); + fail(); + } + catch (IllegalArgumentException e) { + assertEquals("BCP and deployment mismatch in instance 'default': " + + "A <bcp> element must place all deployed production regions in at least one group, " + + "and declare no extra regions. " + + "Deployed regions: [us-west1]. BCP regions: [us-east1, us-west1]", + Exceptions.toMessageString(e)); + } + } + + @Test + public void deadlineValidation() { + try { + StringReader r = new StringReader(""" + <deployment version='1.0'> + <prod> + <region>us-east1</region> + <region>us-west1</region> + </prod> + <bcp> + <group deadline="fast"> + <region>us-east1</region> + <region>us-west1</region> + </group> + </bcp> + </deployment> + """); + DeploymentSpec.fromXml(r); + fail(); + } + catch (IllegalArgumentException e) { + assertEquals("Illegal deadline 'fast': Must end by 'm'", Exceptions.toMessageString(e)); + } + } + + @Test + public void fractionalMembershipValidation() { + try { + StringReader r = new StringReader(""" + <deployment version='1.0'> + <prod> + <region>us-east1</region> + <region>us-west1</region> + </prod> + <bcp> + <group> + <region fraction="0.9">us-east1</region> + <region>us-west1</region> + </group> + </bcp> + </deployment> + """); + DeploymentSpec.fromXml(r); + fail(); + } + catch (IllegalArgumentException e) { + assertEquals("Illegal BCP spec: All regions must have total membership fractions summing to 1.0, but us-east1 sums to 0.9", + Exceptions.toMessageString(e)); + } + } + + private void assertTwoRegions(DeploymentSpec spec) { + var bcp = spec.requireInstance("default").bcp().orElse(spec.bcp()); + assertEquals(1, bcp.groups().size()); + var group = bcp.groups().get(0); + assertEquals(2, group.members().size()); + assertEquals(Duration.ZERO, group.deadline()); + assertEquals(new Bcp.RegionMember(RegionName.from("us-east1"), 1.0), group.members().get(0)); + assertEquals(new Bcp.RegionMember(RegionName.from("us-west1"), 1.0), group.members().get(1)); + } + +} diff --git a/container-search/src/test/java/com/yahoo/search/yql/YqlParserTestCase.java b/container-search/src/test/java/com/yahoo/search/yql/YqlParserTestCase.java index bc9448f39d8..33cd2f48d46 100644 --- a/container-search/src/test/java/com/yahoo/search/yql/YqlParserTestCase.java +++ b/container-search/src/test/java/com/yahoo/search/yql/YqlParserTestCase.java @@ -241,6 +241,11 @@ public class YqlParserTestCase { } @Test + void testNonEquality() { + assertParse("select foo from bar where !(price = 500)", "-price:500"); + } + + @Test void testNegativeLessThan() { assertParse("select foo from bar where price < -500", "price:<-500"); assertParse("select foo from bar where -500 < price", "price:>-500"); diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/TrafficShareUpdater.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/TrafficShareUpdater.java index 832dbb6b921..aea01ae36d3 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/TrafficShareUpdater.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/TrafficShareUpdater.java @@ -1,6 +1,10 @@ // Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.vespa.hosted.controller.maintenance; +import com.yahoo.config.application.api.Bcp; +import com.yahoo.config.application.api.DeploymentSpec; +import com.yahoo.config.provision.InstanceName; +import com.yahoo.config.provision.RegionName; import com.yahoo.vespa.hosted.controller.ApplicationController; import com.yahoo.vespa.hosted.controller.Controller; import com.yahoo.vespa.hosted.controller.Instance; @@ -8,7 +12,11 @@ import com.yahoo.vespa.hosted.controller.api.integration.configserver.NodeReposi import com.yahoo.vespa.hosted.controller.application.Deployment; import java.time.Duration; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; import java.util.logging.Level; +import java.util.stream.Collectors; /** * This computes, for every application deployment @@ -41,12 +49,11 @@ public class TrafficShareUpdater extends ControllerMaintainer { int failures = 0; for (var application : applications.asList()) { for (var instance : application.instances().values()) { - for (var deployment : instance.deployments().values()) { - if ( ! deployment.zone().environment().isProduction()) continue; + for (var deployment : instance.productionDeployments().values()) { if (shuttingDown()) return 1.0; try { attempts++; - updateTrafficFraction(instance, deployment); + updateTrafficFraction(instance, deployment, application.deploymentSpec()); } catch (Exception e) { // Some failures due to locked applications are expected and benign @@ -62,20 +69,94 @@ public class TrafficShareUpdater extends ControllerMaintainer { return successFactor; } - private void updateTrafficFraction(Instance instance, Deployment deployment) { - double qpsInZone = deployment.metrics().queriesPerSecond(); - double totalQps = instance.deployments().values().stream() - .filter(i -> i.zone().environment().isProduction()) - .mapToDouble(i -> i.metrics().queriesPerSecond()).sum(); - long prodRegions = instance.deployments().values().stream() - .filter(i -> i.zone().environment().isProduction()) - .count(); - double currentReadShare = totalQps == 0 ? 0 : qpsInZone / totalQps; - double maxReadShare = prodRegions < 2 ? 1.0 : 1.0 / ( prodRegions - 1.0); - if (currentReadShare > maxReadShare) // This can happen because the assumption of equal traffic - maxReadShare = currentReadShare; // distribution can be incorrect + private void updateTrafficFraction(Instance instance, Deployment deployment, DeploymentSpec deploymentSpec) { + // maxReadShare / currentReadShare = how much additional traffic must the zone be able to handle + double currentReadShare = 0; // How much of the total traffic of the group(s) this is a member of does this deployment receive + double maxReadShare = 0; // How much of the total traffic of the group(s) this is a member of might this deployment receive if a member of the group fails + for (BcpGroup group : BcpGroup.groupsFrom(instance, deploymentSpec)) { + if ( ! group.contains(deployment.zone().region())) continue; + double deploymentQps = deployment.metrics().queriesPerSecond(); + double groupQps = group.totalQps(); + double fraction = group.fraction(deployment.zone().region()); + currentReadShare += groupQps == 0 ? 0 : fraction * deploymentQps / groupQps; + maxReadShare += group.size() == 1 + ? currentReadShare + : fraction * ( deploymentQps + group.maxQpsExcluding(deployment.zone().region()) / (group.size() - 1) ) / groupQps; + } nodeRepository.patchApplication(deployment.zone(), instance.id(), currentReadShare, maxReadShare); } + /** + * A set of regions which will take over traffic from each other if one of them fails. + * Each region will take an equal share (modulated by fraction) of the failing region's traffic. + * + * A regions membership in a group may be partial, represented by a fraction [0, 1], + * in which case the other regions will collectively only take that fraction of the failing regions traffic, + * and symmetrically, the region will only take its fraction of its share of traffic of any other failing region. + */ + private static class BcpGroup { + + /** The instance which has this group. */ + private final Instance instance; + + /** Regions in this group, with their fractions. */ + private final Map<RegionName, Double> regions; + + /** Creates a group of a subset of the deployments in this instance. */ + private BcpGroup(Instance instance, Map<RegionName, Double> regions) { + this.instance = instance; + this.regions = regions; + } + + /** Returns the sum of the fractional memberships of this. */ + double size() { + return regions.values().stream().mapToDouble(f -> f).sum(); + } + + double fraction(RegionName region) { + return regions.getOrDefault(region, 0.0); + } + + boolean contains(RegionName region) { + return regions.containsKey(region); + } + + double totalQps() { + return instance.productionDeployments().values().stream() + .mapToDouble(i -> i.metrics().queriesPerSecond()).sum(); + } + + double maxQpsExcluding(RegionName region) { + return instance.productionDeployments().values().stream() + .filter(d -> ! d.zone().region().equals(region)) + .mapToDouble(d -> d.metrics().queriesPerSecond() * fraction(d.zone().region())) + .max() + .orElse(0); + } + private static Bcp bcpOf(InstanceName instanceName, DeploymentSpec deploymentSpec) { + var instanceSpec = deploymentSpec.instance(instanceName); + if (instanceSpec.isEmpty()) return deploymentSpec.bcp(); + return instanceSpec.get().bcp().orElse(deploymentSpec.bcp()); + } + + private static Map<RegionName, Double> regionsFrom(Instance instance) { + return instance.productionDeployments().values().stream() + .collect(Collectors.toMap(deployment -> deployment.zone().region(), __ -> 1.0)); + } + + private static Map<RegionName, Double> regionsFrom(Bcp.Group groupSpec) { + return groupSpec.members().stream() + .collect(Collectors.toMap(member -> member.region(), member -> member.fraction())); + } + + static List<BcpGroup> groupsFrom(Instance instance, DeploymentSpec deploymentSpec) { + Bcp bcp = bcpOf(instance.name(), deploymentSpec); + if (bcp.isEmpty()) + return List.of(new BcpGroup(instance, regionsFrom(instance))); + return bcp.groups().stream().map(groupSpec -> new BcpGroup(instance, regionsFrom(groupSpec))).toList(); + } + + } + } diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentContext.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentContext.java index 14835a822e6..b712746b663 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentContext.java +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentContext.java @@ -15,6 +15,7 @@ import com.yahoo.security.KeyUtils; import com.yahoo.security.SignatureAlgorithm; import com.yahoo.security.X509CertificateBuilder; import com.yahoo.vespa.hosted.controller.Application; +import com.yahoo.vespa.hosted.controller.Controller; import com.yahoo.vespa.hosted.controller.ControllerTester; import com.yahoo.vespa.hosted.controller.Instance; import com.yahoo.vespa.hosted.controller.api.identifiers.DeploymentId; @@ -52,6 +53,7 @@ import java.util.List; import java.util.Map; import java.util.Map.Entry; import java.util.Optional; +import java.util.OptionalLong; import java.util.Set; import java.util.concurrent.atomic.AtomicLong; import java.util.function.Supplier; @@ -196,8 +198,9 @@ public class DeploymentContext { Application application = application(); assertTrue(application.revisions().last().isPresent(), "Application package submitted"); assertFalse(application.instances().values().stream() - .anyMatch(instance -> instance.deployments().values().stream() - .anyMatch(deployment -> deployment.revision().equals(lastSubmission))), "Submission is not already deployed"); + .anyMatch(instance -> instance.deployments().values().stream() + .anyMatch(deployment -> deployment.revision().equals(lastSubmission))), + "Submission is not already deployed"); completeRollout(application.deploymentSpec().instances().size() > 1); for (var instance : application().instances().values()) { assertFalse(instance.change().hasTargets()); diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/TrafficShareUpdaterTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/TrafficShareUpdaterTest.java index fad85ef9b48..5c26e270846 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/TrafficShareUpdaterTest.java +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/TrafficShareUpdaterTest.java @@ -2,8 +2,10 @@ package com.yahoo.vespa.hosted.controller.maintenance; import com.yahoo.component.Version; +import com.yahoo.config.application.api.xml.DeploymentSpecXmlReader; import com.yahoo.config.provision.ApplicationId; import com.yahoo.config.provision.zone.ZoneId; +import com.yahoo.vespa.hosted.controller.Application; import com.yahoo.vespa.hosted.controller.api.application.v4.model.ClusterMetrics; import com.yahoo.vespa.hosted.controller.api.identifiers.DeploymentId; import com.yahoo.vespa.hosted.controller.application.pkg.ApplicationPackage; @@ -14,6 +16,7 @@ import org.junit.jupiter.api.Test; import java.time.Duration; import java.util.Map; +import java.util.OptionalLong; import static org.junit.jupiter.api.Assertions.assertEquals; @@ -25,61 +28,184 @@ import static org.junit.jupiter.api.Assertions.assertEquals; public class TrafficShareUpdaterTest { @Test - void testTrafficUpdater() { + void testTrafficUpdaterImplicitBcp() { DeploymentTester tester = new DeploymentTester(); Version version = Version.fromString("7.1"); - tester.controllerTester().upgradeSystem(version); - var application = tester.newDeploymentContext(); + tester.controllerTester().upgradeSystem(Version.fromString("7.1")); + var context = tester.newDeploymentContext(); var deploymentMetricsMaintainer = new DeploymentMetricsMaintainer(tester.controller(), Duration.ofDays(1)); var updater = new TrafficShareUpdater(tester.controller(), Duration.ofDays(1)); ZoneId prod1 = ZoneId.from("prod", "ap-northeast-1"); ZoneId prod2 = ZoneId.from("prod", "us-east-3"); ZoneId prod3 = ZoneId.from("prod", "us-west-1"); - application.runJob(DeploymentContext.productionApNortheast1, new ApplicationPackage(new byte[0]), version); + context.runJob(DeploymentContext.perfUsEast3, new ApplicationPackage(new byte[0]), version); // Ignored + context.runJob(DeploymentContext.productionApNortheast1, new ApplicationPackage(new byte[0]), version); - // Single zone - setQpsMetric(50.0, application.application().id().defaultInstance(), prod1, tester); + // One zone + context.runJob(DeploymentContext.productionApNortheast1, new ApplicationPackage(new byte[0]), version); + setQpsMetric(50.0, context.application().id().defaultInstance(), prod1, tester); deploymentMetricsMaintainer.maintain(); assertEquals(1.0, updater.maintain(), 0.0000001); - assertTrafficFraction(1.0, 1.0, application.instanceId(), prod1, tester); + assertTrafficFraction(1.0, 1.0, context.instanceId(), prod1, tester); // Two zones - application.runJob(DeploymentContext.productionUsEast3, new ApplicationPackage(new byte[0]), version); - // - one cold - setQpsMetric(50.0, application.application().id().defaultInstance(), prod1, tester); - setQpsMetric(0.0, application.application().id().defaultInstance(), prod2, tester); + context.runJob(DeploymentContext.productionUsEast3, new ApplicationPackage(new byte[0]), version); + setQpsMetric(60.0, context.application().id().defaultInstance(), prod1, tester); + setQpsMetric(20.0, context.application().id().defaultInstance(), prod2, tester); deploymentMetricsMaintainer.maintain(); assertEquals(1.0, updater.maintain(), 0.0000001); - assertTrafficFraction(1.0, 1.0, application.instanceId(), prod1, tester); - assertTrafficFraction(0.0, 1.0, application.instanceId(), prod2, tester); - // - both hot - setQpsMetric(53.0, application.application().id().defaultInstance(), prod1, tester); - setQpsMetric(47.0, application.application().id().defaultInstance(), prod2, tester); + assertTrafficFraction(0.75, 1.0, context.instanceId(), prod1, tester); + assertTrafficFraction(0.25, 1.0, context.instanceId(), prod2, tester); + + // Three zones + context.runJob(DeploymentContext.productionUsWest1, new ApplicationPackage(new byte[0]), version); + setQpsMetric(53.0, context.application().id().defaultInstance(), prod1, tester); + setQpsMetric(45.0, context.application().id().defaultInstance(), prod2, tester); + setQpsMetric(02.0, context.application().id().defaultInstance(), prod3, tester); deploymentMetricsMaintainer.maintain(); assertEquals(1.0, updater.maintain(), 0.0000001); - assertTrafficFraction(0.53, 1.0, application.instanceId(), prod1, tester); - assertTrafficFraction(0.47, 1.0, application.instanceId(), prod2, tester); + assertTrafficFraction(0.53, 0.53 + (double)45/2 / 100, context.instanceId(), prod1, tester); + assertTrafficFraction(0.45, 0.45 + (double)53/2 / 100, context.instanceId(), prod2, tester); + assertTrafficFraction(0.02, 0.02 + (double)53/2 / 100, context.instanceId(), prod3, tester); + } + + @Test + void testTrafficUpdaterHotCold() { + var spec = """ + <deployment version="1.0"> + <staging/> + <prod> + <region>ap-northeast-1</region> + <region>ap-southeast-1</region> + <region>us-east-3</region> + <region>us-central-1</region> + <region>eu-west-1</region> + </prod> + <bcp> + <group> + <region>ap-northeast-1</region> + <region>ap-southeast-1</region> + </group> + <group> + <region>us-east-3</region> + <region>us-central-1</region> + </group> + <group> + <region>eu-west-1</region> + </group> + </bcp> + </deployment> + """; + + DeploymentTester tester = new DeploymentTester(); + Version version = Version.fromString("7.1"); + tester.controllerTester().upgradeSystem(Version.fromString("7.1")); + var context = tester.newDeploymentContext(); + var deploymentSpec = new DeploymentSpecXmlReader(true).read(spec); + tester.controller().applications() + .lockApplicationOrThrow(context.application().id(), + locked -> tester.controller().applications().store(locked.with(deploymentSpec))); + + var deploymentMetricsMaintainer = new DeploymentMetricsMaintainer(tester.controller(), Duration.ofDays(1)); + var updater = new TrafficShareUpdater(tester.controller(), Duration.ofDays(1)); + + ZoneId ap1 = ZoneId.from("prod", "ap-northeast-1"); + ZoneId ap2 = ZoneId.from("prod", "ap-southeast-1"); + ZoneId us1 = ZoneId.from("prod", "us-east-3"); + ZoneId us2 = ZoneId.from("prod", "us-central-1"); + ZoneId eu1 = ZoneId.from("prod", "eu-west-1"); + + context.runJob(DeploymentContext.productionApNortheast1, new ApplicationPackage(new byte[0]), version); + context.runJob(DeploymentContext.productionApSoutheast1, new ApplicationPackage(new byte[0]), version); + context.runJob(DeploymentContext.productionUsEast3, new ApplicationPackage(new byte[0]), version); + context.runJob(DeploymentContext.productionUsCentral1, new ApplicationPackage(new byte[0]), version); + context.runJob(DeploymentContext.productionEuWest1, new ApplicationPackage(new byte[0]), version); + + setQpsMetric(50.0, context.application().id().defaultInstance(), ap1, tester); + setQpsMetric(00.0, context.application().id().defaultInstance(), ap2, tester); + setQpsMetric(10.0, context.application().id().defaultInstance(), us1, tester); + setQpsMetric(00.0, context.application().id().defaultInstance(), us2, tester); + setQpsMetric(40.0, context.application().id().defaultInstance(), eu1, tester); - // Three zones - application.runJob(DeploymentContext.productionUsWest1, new ApplicationPackage(new byte[0]), version); - // - one cold - setQpsMetric(53.0, application.application().id().defaultInstance(), prod1, tester); - setQpsMetric(47.0, application.application().id().defaultInstance(), prod2, tester); - setQpsMetric(0.0, application.application().id().defaultInstance(), prod3, tester); deploymentMetricsMaintainer.maintain(); assertEquals(1.0, updater.maintain(), 0.0000001); - assertTrafficFraction(0.53, 0.53, application.instanceId(), prod1, tester); - assertTrafficFraction(0.47, 0.50, application.instanceId(), prod2, tester); - assertTrafficFraction(0.00, 0.50, application.instanceId(), prod3, tester); - // - all hot - setQpsMetric(50.0, application.application().id().defaultInstance(), prod1, tester); - setQpsMetric(25.0, application.application().id().defaultInstance(), prod2, tester); - setQpsMetric(25.0, application.application().id().defaultInstance(), prod3, tester); + assertTrafficFraction(0.5, 0.5, context.instanceId(), ap1, tester); + assertTrafficFraction(0.0, 0.5, context.instanceId(), ap2, tester); + assertTrafficFraction(0.1, 0.1, context.instanceId(), us1, tester); + assertTrafficFraction(0.0, 0.1, context.instanceId(), us2, tester); + assertTrafficFraction(0.4, 0.4, context.instanceId(), eu1, tester); + } + + @Test + void testTrafficUpdaterOverlappingGroups() { + var spec = """ + <deployment version="1.0"> + <staging/> + <prod> + <region>ap-northeast-1</region> + <region>ap-southeast-1</region> + <region>us-east-3</region> + <region>us-central-1</region> + <region>us-west-1</region> + <region>eu-west-1</region> + </prod> + <bcp> + <group> + <region>ap-northeast-1</region> + <region>ap-southeast-1</region> + <region fraction="0.5">eu-west-1</region> + </group> + <group> + <region>us-east-3</region> + <region>us-central-1</region> + <region>us-west-1</region> + <region fraction="0.5">eu-west-1</region> + </group> + </bcp> + </deployment> + """; + + DeploymentTester tester = new DeploymentTester(); + Version version = Version.fromString("7.1"); + tester.controllerTester().upgradeSystem(Version.fromString("7.1")); + var context = tester.newDeploymentContext(); + var deploymentSpec = new DeploymentSpecXmlReader(true).read(spec); + tester.controller().applications() + .lockApplicationOrThrow(context.application().id(), + locked -> tester.controller().applications().store(locked.with(deploymentSpec))); + + var deploymentMetricsMaintainer = new DeploymentMetricsMaintainer(tester.controller(), Duration.ofDays(1)); + var updater = new TrafficShareUpdater(tester.controller(), Duration.ofDays(1)); + + ZoneId ap1 = ZoneId.from("prod", "ap-northeast-1"); + ZoneId ap2 = ZoneId.from("prod", "ap-southeast-1"); + ZoneId us1 = ZoneId.from("prod", "us-east-3"); + ZoneId us2 = ZoneId.from("prod", "us-central-1"); + ZoneId us3 = ZoneId.from("prod", "us-west-1"); + ZoneId eu1 = ZoneId.from("prod", "eu-west-1"); + + context.runJob(DeploymentContext.productionApNortheast1, new ApplicationPackage(new byte[0]), version); + context.runJob(DeploymentContext.productionApSoutheast1, new ApplicationPackage(new byte[0]), version); + context.runJob(DeploymentContext.productionUsEast3, new ApplicationPackage(new byte[0]), version); + context.runJob(DeploymentContext.productionUsCentral1, new ApplicationPackage(new byte[0]), version); + context.runJob(DeploymentContext.productionUsWest1, new ApplicationPackage(new byte[0]), version); + context.runJob(DeploymentContext.productionEuWest1, new ApplicationPackage(new byte[0]), version); + + setQpsMetric(20.0, context.application().id().defaultInstance(), ap1, tester); + setQpsMetric(50.0, context.application().id().defaultInstance(), ap2, tester); + setQpsMetric(00.0, context.application().id().defaultInstance(), us1, tester); + setQpsMetric(30.0, context.application().id().defaultInstance(), us2, tester); + setQpsMetric(40.0, context.application().id().defaultInstance(), us3, tester); + setQpsMetric(60.0, context.application().id().defaultInstance(), eu1, tester); + deploymentMetricsMaintainer.maintain(); assertEquals(1.0, updater.maintain(), 0.0000001); - assertTrafficFraction(0.50, 0.5, application.instanceId(), prod1, tester); - assertTrafficFraction(0.25, 0.5, application.instanceId(), prod2, tester); - assertTrafficFraction(0.25, 0.5, application.instanceId(), prod3, tester); + assertTrafficFraction(0.10, 0.10 + 50 / 200.0 / 1.5, context.instanceId(), ap1, tester); + assertTrafficFraction(0.25, 0.25 + 30 / 200.0 / 1.5, context.instanceId(), ap2, tester); + assertTrafficFraction(0.00, 0.00 + 40 / 200.0 / 2.5, context.instanceId(), us1, tester); + assertTrafficFraction(0.15, 0.15 + 40 / 200.0 / 2.5, context.instanceId(), us2, tester); + assertTrafficFraction(0.20, 0.20 + 30 / 200.0 / 2.5, context.instanceId(), us3, tester); + assertTrafficFraction(0.30, 0.30 + 0.5 * 50 / 200.0 / 1.5 + 0.5 * 40 / 200.0 / 2.5, context.instanceId(), eu1, tester); } private void setQpsMetric(double qps, ApplicationId application, ZoneId zone, DeploymentTester tester) { @@ -90,8 +216,8 @@ public class TrafficShareUpdaterTest { private void assertTrafficFraction(double currentReadShare, double maxReadShare, ApplicationId application, ZoneId zone, DeploymentTester tester) { NodeRepositoryMock mock = (NodeRepositoryMock)tester.controller().serviceRegistry().configServer().nodeRepository(); - assertEquals(currentReadShare, mock.getTrafficFraction(application, zone).getFirst(), 0.00001); - assertEquals(maxReadShare, mock.getTrafficFraction(application, zone).getSecond(), 0.00001); + assertEquals(currentReadShare, mock.getTrafficFraction(application, zone).getFirst(), 0.00001, "Current read share"); + assertEquals(maxReadShare, mock.getTrafficFraction(application, zone).getSecond(), 0.00001, "Max read share"); } } |