aboutsummaryrefslogtreecommitdiffstats
path: root/controller-server/src/main/java/com/yahoo
diff options
context:
space:
mode:
authorJon Bratseth <bratseth@gmail.com>2023-02-07 13:52:31 +0100
committerGitHub <noreply@github.com>2023-02-07 13:52:31 +0100
commitf38de98559a5a3dd2ed7586bf689964db47bccc9 (patch)
tree00166c2e8f1a51e0af8a030eec7f0644ff32453e /controller-server/src/main/java/com/yahoo
parent628ab7604f016e4062a1e937436d35a8354a8dcc (diff)
parent9be10074a8b0e31c9dfcdcb2d8e9048c86b1fadc (diff)
Merge pull request #25912 from vespa-engine/bratseth/bcp-rebased
Bratseth/bcp rebased
Diffstat (limited to 'controller-server/src/main/java/com/yahoo')
-rw-r--r--controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/TrafficShareUpdater.java111
1 files changed, 96 insertions, 15 deletions
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/TrafficShareUpdater.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/TrafficShareUpdater.java
index 832dbb6b921..aea01ae36d3 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/TrafficShareUpdater.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/TrafficShareUpdater.java
@@ -1,6 +1,10 @@
// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.vespa.hosted.controller.maintenance;
+import com.yahoo.config.application.api.Bcp;
+import com.yahoo.config.application.api.DeploymentSpec;
+import com.yahoo.config.provision.InstanceName;
+import com.yahoo.config.provision.RegionName;
import com.yahoo.vespa.hosted.controller.ApplicationController;
import com.yahoo.vespa.hosted.controller.Controller;
import com.yahoo.vespa.hosted.controller.Instance;
@@ -8,7 +12,11 @@ import com.yahoo.vespa.hosted.controller.api.integration.configserver.NodeReposi
import com.yahoo.vespa.hosted.controller.application.Deployment;
import java.time.Duration;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
import java.util.logging.Level;
+import java.util.stream.Collectors;
/**
* This computes, for every application deployment
@@ -41,12 +49,11 @@ public class TrafficShareUpdater extends ControllerMaintainer {
int failures = 0;
for (var application : applications.asList()) {
for (var instance : application.instances().values()) {
- for (var deployment : instance.deployments().values()) {
- if ( ! deployment.zone().environment().isProduction()) continue;
+ for (var deployment : instance.productionDeployments().values()) {
if (shuttingDown()) return 1.0;
try {
attempts++;
- updateTrafficFraction(instance, deployment);
+ updateTrafficFraction(instance, deployment, application.deploymentSpec());
}
catch (Exception e) {
// Some failures due to locked applications are expected and benign
@@ -62,20 +69,94 @@ public class TrafficShareUpdater extends ControllerMaintainer {
return successFactor;
}
- private void updateTrafficFraction(Instance instance, Deployment deployment) {
- double qpsInZone = deployment.metrics().queriesPerSecond();
- double totalQps = instance.deployments().values().stream()
- .filter(i -> i.zone().environment().isProduction())
- .mapToDouble(i -> i.metrics().queriesPerSecond()).sum();
- long prodRegions = instance.deployments().values().stream()
- .filter(i -> i.zone().environment().isProduction())
- .count();
- double currentReadShare = totalQps == 0 ? 0 : qpsInZone / totalQps;
- double maxReadShare = prodRegions < 2 ? 1.0 : 1.0 / ( prodRegions - 1.0);
- if (currentReadShare > maxReadShare) // This can happen because the assumption of equal traffic
- maxReadShare = currentReadShare; // distribution can be incorrect
+ private void updateTrafficFraction(Instance instance, Deployment deployment, DeploymentSpec deploymentSpec) {
+ // maxReadShare / currentReadShare = how much additional traffic must the zone be able to handle
+ double currentReadShare = 0; // How much of the total traffic of the group(s) this is a member of does this deployment receive
+ double maxReadShare = 0; // How much of the total traffic of the group(s) this is a member of might this deployment receive if a member of the group fails
+ for (BcpGroup group : BcpGroup.groupsFrom(instance, deploymentSpec)) {
+ if ( ! group.contains(deployment.zone().region())) continue;
+ double deploymentQps = deployment.metrics().queriesPerSecond();
+ double groupQps = group.totalQps();
+ double fraction = group.fraction(deployment.zone().region());
+ currentReadShare += groupQps == 0 ? 0 : fraction * deploymentQps / groupQps;
+ maxReadShare += group.size() == 1
+ ? currentReadShare
+ : fraction * ( deploymentQps + group.maxQpsExcluding(deployment.zone().region()) / (group.size() - 1) ) / groupQps;
+ }
nodeRepository.patchApplication(deployment.zone(), instance.id(), currentReadShare, maxReadShare);
}
+ /**
+ * A set of regions which will take over traffic from each other if one of them fails.
+ * Each region will take an equal share (modulated by fraction) of the failing region's traffic.
+ *
+ * A regions membership in a group may be partial, represented by a fraction [0, 1],
+ * in which case the other regions will collectively only take that fraction of the failing regions traffic,
+ * and symmetrically, the region will only take its fraction of its share of traffic of any other failing region.
+ */
+ private static class BcpGroup {
+
+ /** The instance which has this group. */
+ private final Instance instance;
+
+ /** Regions in this group, with their fractions. */
+ private final Map<RegionName, Double> regions;
+
+ /** Creates a group of a subset of the deployments in this instance. */
+ private BcpGroup(Instance instance, Map<RegionName, Double> regions) {
+ this.instance = instance;
+ this.regions = regions;
+ }
+
+ /** Returns the sum of the fractional memberships of this. */
+ double size() {
+ return regions.values().stream().mapToDouble(f -> f).sum();
+ }
+
+ double fraction(RegionName region) {
+ return regions.getOrDefault(region, 0.0);
+ }
+
+ boolean contains(RegionName region) {
+ return regions.containsKey(region);
+ }
+
+ double totalQps() {
+ return instance.productionDeployments().values().stream()
+ .mapToDouble(i -> i.metrics().queriesPerSecond()).sum();
+ }
+
+ double maxQpsExcluding(RegionName region) {
+ return instance.productionDeployments().values().stream()
+ .filter(d -> ! d.zone().region().equals(region))
+ .mapToDouble(d -> d.metrics().queriesPerSecond() * fraction(d.zone().region()))
+ .max()
+ .orElse(0);
+ }
+ private static Bcp bcpOf(InstanceName instanceName, DeploymentSpec deploymentSpec) {
+ var instanceSpec = deploymentSpec.instance(instanceName);
+ if (instanceSpec.isEmpty()) return deploymentSpec.bcp();
+ return instanceSpec.get().bcp().orElse(deploymentSpec.bcp());
+ }
+
+ private static Map<RegionName, Double> regionsFrom(Instance instance) {
+ return instance.productionDeployments().values().stream()
+ .collect(Collectors.toMap(deployment -> deployment.zone().region(), __ -> 1.0));
+ }
+
+ private static Map<RegionName, Double> regionsFrom(Bcp.Group groupSpec) {
+ return groupSpec.members().stream()
+ .collect(Collectors.toMap(member -> member.region(), member -> member.fraction()));
+ }
+
+ static List<BcpGroup> groupsFrom(Instance instance, DeploymentSpec deploymentSpec) {
+ Bcp bcp = bcpOf(instance.name(), deploymentSpec);
+ if (bcp.isEmpty())
+ return List.of(new BcpGroup(instance, regionsFrom(instance)));
+ return bcp.groups().stream().map(groupSpec -> new BcpGroup(instance, regionsFrom(groupSpec))).toList();
+ }
+
+ }
+
}