summaryrefslogtreecommitdiffstats
path: root/controller-server/src/main/java/com/yahoo
diff options
context:
space:
mode:
author: Jon Bratseth <bratseth@gmail.com> 2023-02-15 13:52:23 +0100
committer: Jon Bratseth <bratseth@gmail.com> 2023-02-15 13:52:23 +0100
commit: 4c9206d8119d1131e248419c7e1ba669c396b89b (patch)
tree: 414dfdc40c088e06c108e28a7f050bf375ce9d3b /controller-server/src/main/java/com/yahoo
parent: b9b7e3cf8529e6f7e9904c1013174e37c0460696 (diff)
Exchange BCP info WIP
Diffstat (limited to 'controller-server/src/main/java/com/yahoo')
-rw-r--r-- controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/BcpGroupUpdater.java (renamed from controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/TrafficShareUpdater.java) 142
-rw-r--r-- controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ControllerMaintenance.java 2
-rw-r--r-- controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentMetricsMaintainer.java 4
3 files changed, 133 insertions, 15 deletions
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/TrafficShareUpdater.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/BcpGroupUpdater.java
index 80647e6ea0a..1ef94ce527c 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/TrafficShareUpdater.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/BcpGroupUpdater.java
@@ -3,40 +3,50 @@ package com.yahoo.vespa.hosted.controller.maintenance;
import com.yahoo.config.application.api.Bcp;
import com.yahoo.config.application.api.DeploymentSpec;
+import com.yahoo.config.provision.ApplicationId;
+import com.yahoo.config.provision.ClusterSpec;
import com.yahoo.config.provision.InstanceName;
import com.yahoo.config.provision.RegionName;
import com.yahoo.vespa.hosted.controller.ApplicationController;
import com.yahoo.vespa.hosted.controller.Controller;
import com.yahoo.vespa.hosted.controller.Instance;
import com.yahoo.vespa.hosted.controller.api.integration.configserver.NodeRepository;
+import com.yahoo.vespa.hosted.controller.api.integration.noderepository.ApplicationPatch;
import com.yahoo.vespa.hosted.controller.application.Deployment;
import java.time.Duration;
-import java.util.ArrayList;
+import java.util.Comparator;
+import java.util.HashMap;
import java.util.List;
import java.util.Map;
+import java.util.Optional;
+import java.util.Set;
+import java.util.concurrent.ConcurrentHashMap;
import java.util.logging.Level;
import java.util.stream.Collectors;
+import java.util.stream.Stream;
/**
* This computes, for every application deployment
- * - the current fraction of the application's global traffic it receives
- * - the max fraction it can possibly receive, assuming traffic is evenly distributed over regions
- * and max one region is down at any time. (We can let deployment.xml override these assumptions later).
+ * - the current fraction of the application's global traffic it receives.
+ * - the max fraction it can possibly receive, given its BCP group membership.
+ * - for each cluster in the deployment, the metrics of the other group member with the highest query rate, averaged over the deployment's BCP groups weighted by its membership fraction in each.
*
- * These two numbers are sent to a config server of each region where it is ultimately
- * consumed by autoscaling.
+ * These values are sent to a config server of each region where it is consumed by autoscaling.
*
* It depends on the traffic metrics collected by DeploymentMetricsMaintainer.
*
* @author bratseth
*/
-public class TrafficShareUpdater extends ControllerMaintainer {
+public class BcpGroupUpdater extends ControllerMaintainer {
private final ApplicationController applications;
private final NodeRepository nodeRepository;
- public TrafficShareUpdater(Controller controller, Duration duration) {
+ /** BCP group info for each application. It is not critical to update this often, so it is stored in memory only. */
+ final Map<ApplicationId, ApplicationClusterDeploymentMetrics> metrics = new ConcurrentHashMap<>(); // TODO: Make private
+
+ public BcpGroupUpdater(Controller controller, Duration duration) {
super(controller, duration);
this.applications = controller.applications();
this.nodeRepository = controller.serviceRegistry().configServer().nodeRepository();
@@ -53,7 +63,11 @@ public class TrafficShareUpdater extends ControllerMaintainer {
if (shuttingDown()) return 1.0;
try {
attempts++;
- updateTrafficFraction(instance, deployment, application.deploymentSpec());
+ var bcpGroups = BcpGroup.groupsFrom(instance, application.deploymentSpec());
+ var patch = new ApplicationPatch();
+ addTrafficShare(deployment, bcpGroups, patch);
+ addBcpGroupInfo(instance, deployment.zone().region(), bcpGroups, patch);
+ nodeRepository.patchApplication(deployment.zone(), instance.id(), patch);
}
catch (Exception e) {
// Some failures due to locked applications are expected and benign
@@ -71,11 +85,12 @@ public class TrafficShareUpdater extends ControllerMaintainer {
return successFactor;
}
- private void updateTrafficFraction(Instance instance, Deployment deployment, DeploymentSpec deploymentSpec) {
+ /** Adds deployment traffic share to the given patch. */
+ private void addTrafficShare(Deployment deployment, List<BcpGroup> bcpGroups, ApplicationPatch patch) {
// maxReadShare / currentReadShare = how much additional traffic must the zone be able to handle
double currentReadShare = 0; // How much of the total traffic of the group(s) this is a member of does this deployment receive
double maxReadShare = 0; // How much of the total traffic of the group(s) this is a member of might this deployment receive if a member of the group fails
- for (BcpGroup group : BcpGroup.groupsFrom(instance, deploymentSpec)) {
+ for (BcpGroup group : bcpGroups) {
if ( ! group.contains(deployment.zone().region())) continue;
double deploymentQps = deployment.metrics().queriesPerSecond();
@@ -86,7 +101,36 @@ public class TrafficShareUpdater extends ControllerMaintainer {
? currentReadShare
: fraction * ( deploymentQps + group.maxQpsExcluding(deployment.zone().region()) / (group.size() - 1) ) / groupQps;
}
- nodeRepository.patchApplication(deployment.zone(), instance.id(), currentReadShare, maxReadShare);
+ patch.currentReadShare = currentReadShare;
+ patch.maxReadShare = maxReadShare;
+ }
+
+ /** Adds bcp group info to the given patch, for any clusters where we have information. */
+ private void addBcpGroupInfo(Instance instance, RegionName regionToUpdate, List<BcpGroup> bcpGroups, ApplicationPatch patch) {
+
+ var applicationMetrics = metrics.get(instance.id());
+ if (applicationMetrics == null) return;
+ for (var clusterEntry : applicationMetrics.clusterDeploymentMetrics.entrySet()) {
+ addClusterBcpGroupInfo(clusterEntry.getKey(), clusterEntry.getValue(), instance, regionToUpdate, bcpGroups, patch);
+ }
+ }
+
+ private void addClusterBcpGroupInfo(ClusterSpec.Id id, ClusterDeploymentMetrics metrics, Instance instance,
+ RegionName regionToUpdate, List<BcpGroup> bcpGroups, ApplicationPatch patch) {
+ var weightedSumOfMaxMetrics = DeploymentMetrics.empty();
+ double sumOfCompleteMemberships = 0;
+ for (BcpGroup bcpGroup : bcpGroups) {
+ if ( ! bcpGroup.contains(regionToUpdate)) continue;
+ var groupMetrics = metrics.subsetOf(bcpGroup);
+ if ( ! groupMetrics.isCompleteExcluding(regionToUpdate, bcpGroup)) continue;
+ var max = groupMetrics.maxQueryRateExcluding(regionToUpdate, bcpGroup);
+ if (max.isEmpty()) continue;
+
+ weightedSumOfMaxMetrics = weightedSumOfMaxMetrics.add(max.get().multipliedBy(bcpGroup.fraction(regionToUpdate)));
+ sumOfCompleteMemberships += bcpGroup.fraction(regionToUpdate);
+ }
+ if (sumOfCompleteMemberships > 0)
+ patch.clusters.put(id.value(), weightedSumOfMaxMetrics.dividedBy(sumOfCompleteMemberships).asClusterPatch());
}
/**
@@ -116,6 +160,8 @@ public class TrafficShareUpdater extends ControllerMaintainer {
return regions.values().stream().mapToDouble(f -> f).sum();
}
+ Set<RegionName> regions() { return regions.keySet(); }
+
double fraction(RegionName region) {
return regions.getOrDefault(region, 0.0);
}
@@ -136,6 +182,7 @@ public class TrafficShareUpdater extends ControllerMaintainer {
.max()
.orElse(0);
}
+
private static Bcp bcpOf(InstanceName instanceName, DeploymentSpec deploymentSpec) {
var instanceSpec = deploymentSpec.instance(instanceName);
if (instanceSpec.isEmpty()) return deploymentSpec.bcp();
@@ -161,4 +208,75 @@ public class TrafficShareUpdater extends ControllerMaintainer {
}
+ static class ApplicationClusterDeploymentMetrics {
+
+ final Map<ClusterSpec.Id, ClusterDeploymentMetrics> clusterDeploymentMetrics = new ConcurrentHashMap<>(); // TODO: Make private
+
+ }
+
+ static class ClusterDeploymentMetrics {
+
+ private final Map<RegionName, DeploymentMetrics> deploymentMetrics;
+
+ public ClusterDeploymentMetrics(Map<RegionName, DeploymentMetrics> deploymentMetrics) {
+ this.deploymentMetrics = new ConcurrentHashMap<>(deploymentMetrics);
+ }
+
+ void put(RegionName region, DeploymentMetrics metrics) {
+ deploymentMetrics.put(region, metrics);
+ }
+
+ ClusterDeploymentMetrics subsetOf(BcpGroup group) {
+ Map<RegionName, DeploymentMetrics> filteredMetrics = new HashMap<>();
+ for (var entry : deploymentMetrics.entrySet()) {
+ if (group.contains(entry.getKey()))
+ filteredMetrics.put(entry.getKey(), entry.getValue());
+ }
+ return new ClusterDeploymentMetrics(filteredMetrics);
+ }
+
+ /** Returns whether this has deployment metrics for every region of the given BCP group other than the excluded region. */
+ boolean isCompleteExcluding(RegionName regionToExclude, BcpGroup bcpGroup) {
+ return regionsExcluding(regionToExclude, bcpGroup).allMatch(region -> deploymentMetrics.containsKey(region));
+ }
+
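+ /** Returns the metrics with the max query rate among the regions of the given BCP group other than the excluded region, if any. Assumes each of those regions has metrics here (check isCompleteExcluding first), since a missing region maps to null. */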
+ Optional<DeploymentMetrics> maxQueryRateExcluding(RegionName regionToExclude, BcpGroup bcpGroup) {
+ return regionsExcluding(regionToExclude, bcpGroup)
+ .map(region -> deploymentMetrics.get(region))
+ .max(Comparator.comparingDouble(m -> m.queryRate));
+ }
+
+ private Stream<RegionName> regionsExcluding(RegionName regionToExclude, BcpGroup bcpGroup) {
+ return bcpGroup.regions().stream()
+ .filter(region -> ! region.equals(regionToExclude));
+ }
+
+ }
+
+ /** Metrics for a given application, cluster and deployment. */
+ record DeploymentMetrics(double queryRate, double growthRateHeadroom, double cpuCostPerQuery) {
+
+ public ApplicationPatch.ClusterPatch asClusterPatch() {
+ return new ApplicationPatch.ClusterPatch(new ApplicationPatch.BcpGroupInfo(queryRate, growthRateHeadroom, cpuCostPerQuery));
+ }
+
+ DeploymentMetrics dividedBy(double d) {
+ return new DeploymentMetrics(queryRate / d, growthRateHeadroom / d, cpuCostPerQuery / d);
+ }
+
+ DeploymentMetrics multipliedBy(double m) {
+ return new DeploymentMetrics(queryRate * m, growthRateHeadroom * m, cpuCostPerQuery * m);
+ }
+
+ DeploymentMetrics add(DeploymentMetrics other) {
+ return new DeploymentMetrics(queryRate + other.queryRate,
+ growthRateHeadroom + other.growthRateHeadroom,
+ cpuCostPerQuery + other.cpuCostPerQuery);
+ }
+
+ public static DeploymentMetrics empty() { return new DeploymentMetrics(0, 0, 0); }
+
+ }
+
}
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ControllerMaintenance.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ControllerMaintenance.java
index 241f2a83d6f..b64de5d5af4 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ControllerMaintenance.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ControllerMaintenance.java
@@ -67,7 +67,7 @@ public class ControllerMaintenance extends AbstractComponent {
maintainers.add(new HostInfoUpdater(controller, intervals.hostInfoUpdater));
maintainers.add(new ReindexingTriggerer(controller, intervals.reindexingTriggerer));
maintainers.add(new EndpointCertificateMaintainer(controller, intervals.endpointCertificateMaintainer));
- maintainers.add(new TrafficShareUpdater(controller, intervals.trafficFractionUpdater));
+ maintainers.add(new BcpGroupUpdater(controller, intervals.trafficFractionUpdater));
maintainers.add(new ArchiveUriUpdater(controller, intervals.archiveUriUpdater));
maintainers.add(new ArchiveAccessMaintainer(controller, metric, intervals.archiveAccessMaintainer));
maintainers.add(new TenantRoleMaintainer(controller, intervals.tenantRoleMaintainer));
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentMetricsMaintainer.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentMetricsMaintainer.java
index 322c78aa7c1..fa917d2eb4e 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentMetricsMaintainer.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentMetricsMaintainer.java
@@ -25,8 +25,8 @@ import java.util.logging.Level;
import java.util.logging.Logger;
/**
- * Retrieves deployment metrics such as QPS and document count from the metric service and
- * updates applications with this info.
+ * Retrieves deployment metrics such as QPS and document count over the config server API
+ * and updates application objects in the controller with this info.
*
* @author smorgrav
* @author mpolden