diff options
author | Jon Bratseth <bratseth@gmail.com> | 2023-02-15 13:52:23 +0100 |
---|---|---|
committer | Jon Bratseth <bratseth@gmail.com> | 2023-02-15 13:52:23 +0100 |
commit | 4c9206d8119d1131e248419c7e1ba669c396b89b (patch) | |
tree | 414dfdc40c088e06c108e28a7f050bf375ce9d3b /controller-server/src/main/java/com/yahoo | |
parent | b9b7e3cf8529e6f7e9904c1013174e37c0460696 (diff) |
Exchange BCP info WIP
Diffstat (limited to 'controller-server/src/main/java/com/yahoo')
-rw-r--r-- | controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/BcpGroupUpdater.java (renamed from controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/TrafficShareUpdater.java) | 142 | ||||
-rw-r--r-- | controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ControllerMaintenance.java | 2 | ||||
-rw-r--r-- | controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentMetricsMaintainer.java | 4 |
3 files changed, 133 insertions, 15 deletions
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/TrafficShareUpdater.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/BcpGroupUpdater.java index 80647e6ea0a..1ef94ce527c 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/TrafficShareUpdater.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/BcpGroupUpdater.java @@ -3,40 +3,50 @@ package com.yahoo.vespa.hosted.controller.maintenance; import com.yahoo.config.application.api.Bcp; import com.yahoo.config.application.api.DeploymentSpec; +import com.yahoo.config.provision.ApplicationId; +import com.yahoo.config.provision.ClusterSpec; import com.yahoo.config.provision.InstanceName; import com.yahoo.config.provision.RegionName; import com.yahoo.vespa.hosted.controller.ApplicationController; import com.yahoo.vespa.hosted.controller.Controller; import com.yahoo.vespa.hosted.controller.Instance; import com.yahoo.vespa.hosted.controller.api.integration.configserver.NodeRepository; +import com.yahoo.vespa.hosted.controller.api.integration.noderepository.ApplicationPatch; import com.yahoo.vespa.hosted.controller.application.Deployment; import java.time.Duration; -import java.util.ArrayList; +import java.util.Comparator; +import java.util.HashMap; import java.util.List; import java.util.Map; +import java.util.Optional; +import java.util.Set; +import java.util.concurrent.ConcurrentHashMap; import java.util.logging.Level; import java.util.stream.Collectors; +import java.util.stream.Stream; /** * This computes, for every application deployment - * - the current fraction of the application's global traffic it receives - * - the max fraction it can possibly receive, assuming traffic is evenly distributed over regions - * and max one region is down at any time. (We can let deployment.xml override these assumptions later). 
+ * - the current fraction of the application's global traffic it receives. + * - the max fraction it can possibly receive, given its BCP group membership. + * - for each cluster in the deployment, average statistics from the other members in the group. * - * These two numbers are sent to a config server of each region where it is ultimately - * consumed by autoscaling. + * These values are sent to a config server of each region where it is consumed by autoscaling. * * It depends on the traffic metrics collected by DeploymentMetricsMaintainer. * * @author bratseth */ -public class TrafficShareUpdater extends ControllerMaintainer { +public class BcpGroupUpdater extends ControllerMaintainer { private final ApplicationController applications; private final NodeRepository nodeRepository; - public TrafficShareUpdater(Controller controller, Duration duration) { + /** BCP group info for each application. It is not critical to update this often so stored in memory only. */ + final Map<ApplicationId, ApplicationClusterDeploymentMetrics> metrics = new ConcurrentHashMap<>(); // TODO: Make private + + public BcpGroupUpdater(Controller controller, Duration duration) { super(controller, duration); this.applications = controller.applications(); this.nodeRepository = controller.serviceRegistry().configServer().nodeRepository(); @@ -53,7 +63,11 @@ public class TrafficShareUpdater extends ControllerMaintainer { if (shuttingDown()) return 1.0; try { attempts++; - updateTrafficFraction(instance, deployment, application.deploymentSpec()); + var bcpGroups = BcpGroup.groupsFrom(instance, application.deploymentSpec()); + var patch = new ApplicationPatch(); + addTrafficShare(deployment, bcpGroups, patch); + addBcpGroupInfo(instance, deployment.zone().region(), bcpGroups, patch); + nodeRepository.patchApplication(deployment.zone(), instance.id(), patch); } catch (Exception e) { // Some failures due to locked applications are expected and benign @@ -71,11 +85,12 @@ public class 
TrafficShareUpdater extends ControllerMaintainer { return successFactor; } - private void updateTrafficFraction(Instance instance, Deployment deployment, DeploymentSpec deploymentSpec) { + /** Adds deployment traffic share to the given patch. */ + private void addTrafficShare(Deployment deployment, List<BcpGroup> bcpGroups, ApplicationPatch patch) { // maxReadShare / currentReadShare = how much additional traffic must the zone be able to handle double currentReadShare = 0; // How much of the total traffic of the group(s) this is a member of does this deployment receive double maxReadShare = 0; // How much of the total traffic of the group(s) this is a member of might this deployment receive if a member of the group fails - for (BcpGroup group : BcpGroup.groupsFrom(instance, deploymentSpec)) { + for (BcpGroup group : bcpGroups) { if ( ! group.contains(deployment.zone().region())) continue; double deploymentQps = deployment.metrics().queriesPerSecond(); @@ -86,7 +101,36 @@ public class TrafficShareUpdater extends ControllerMaintainer { ? currentReadShare : fraction * ( deploymentQps + group.maxQpsExcluding(deployment.zone().region()) / (group.size() - 1) ) / groupQps; } - nodeRepository.patchApplication(deployment.zone(), instance.id(), currentReadShare, maxReadShare); + patch.currentReadShare = currentReadShare; + patch.maxReadShare = maxReadShare; + } + + /** Adds bcp group info to the given patch, for any clusters where we have information. 
*/ + private void addBcpGroupInfo(Instance instance, RegionName regionToUpdate, List<BcpGroup> bcpGroups, ApplicationPatch patch) { + + var applicationMetrics = metrics.get(instance.id()); + if (applicationMetrics == null) return; + for (var clusterEntry : applicationMetrics.clusterDeploymentMetrics.entrySet()) { + addClusterBcpGroupInfo(clusterEntry.getKey(), clusterEntry.getValue(), instance, regionToUpdate, bcpGroups, patch); + } + } + + private void addClusterBcpGroupInfo(ClusterSpec.Id id, ClusterDeploymentMetrics metrics, Instance instance, + RegionName regionToUpdate, List<BcpGroup> bcpGroups, ApplicationPatch patch) { + var weightedSumOfMaxMetrics = DeploymentMetrics.empty(); + double sumOfCompleteMemberships = 0; + for (BcpGroup bcpGroup : bcpGroups) { + if ( ! bcpGroup.contains(regionToUpdate)) continue; + var groupMetrics = metrics.subsetOf(bcpGroup); + if ( ! groupMetrics.isCompleteExcluding(regionToUpdate, bcpGroup)) continue; + var max = groupMetrics.maxQueryRateExcluding(regionToUpdate, bcpGroup); + if (max.isEmpty()) continue; + + weightedSumOfMaxMetrics = weightedSumOfMaxMetrics.add(max.get().multipliedBy(bcpGroup.fraction(regionToUpdate))); + sumOfCompleteMemberships += bcpGroup.fraction(regionToUpdate); + } + if (sumOfCompleteMemberships > 0) + patch.clusters.put(id.value(), weightedSumOfMaxMetrics.dividedBy(sumOfCompleteMemberships).asClusterPatch()); } /** @@ -116,6 +160,8 @@ public class TrafficShareUpdater extends ControllerMaintainer { return regions.values().stream().mapToDouble(f -> f).sum(); } + Set<RegionName> regions() { return regions.keySet(); } + double fraction(RegionName region) { return regions.getOrDefault(region, 0.0); } @@ -136,6 +182,7 @@ public class TrafficShareUpdater extends ControllerMaintainer { .max() .orElse(0); } + private static Bcp bcpOf(InstanceName instanceName, DeploymentSpec deploymentSpec) { var instanceSpec = deploymentSpec.instance(instanceName); if (instanceSpec.isEmpty()) return deploymentSpec.bcp(); @@ 
-161,4 +208,75 @@ public class TrafficShareUpdater extends ControllerMaintainer { } + static class ApplicationClusterDeploymentMetrics { + + final Map<ClusterSpec.Id, ClusterDeploymentMetrics> clusterDeploymentMetrics = new ConcurrentHashMap<>(); // TODO: Make private + + } + + static class ClusterDeploymentMetrics { + + private final Map<RegionName, DeploymentMetrics> deploymentMetrics; + + public ClusterDeploymentMetrics(Map<RegionName, DeploymentMetrics> deploymentMetrics) { + this.deploymentMetrics = new ConcurrentHashMap<>(deploymentMetrics); + } + + void put(RegionName region, DeploymentMetrics metrics) { + deploymentMetrics.put(region, metrics); + } + + ClusterDeploymentMetrics subsetOf(BcpGroup group) { + Map<RegionName, DeploymentMetrics> filteredMetrics = new HashMap<>(); + for (var entry : deploymentMetrics.entrySet()) { + if (group.contains(entry.getKey())) + filteredMetrics.put(entry.getKey(), entry.getValue()); + } + return new ClusterDeploymentMetrics(filteredMetrics); + } + + /** Returns whether this has deployment metrics for every region in the given BCP group, except the excluded region. */ + boolean isCompleteExcluding(RegionName regionToExclude, BcpGroup bcpGroup) { + return regionsExcluding(regionToExclude, bcpGroup).allMatch(region -> deploymentMetrics.containsKey(region)); + } + + /** Returns the metrics with the max query rate among the regions of the given BCP group, except the excluded region, if any. */ + Optional<DeploymentMetrics> maxQueryRateExcluding(RegionName regionToExclude, BcpGroup bcpGroup) { + return regionsExcluding(regionToExclude, bcpGroup) + .map(region -> deploymentMetrics.get(region)) + .max(Comparator.comparingDouble(m -> m.queryRate)); + } + + private Stream<RegionName> regionsExcluding(RegionName regionToExclude, BcpGroup bcpGroup) { + return bcpGroup.regions().stream() + .filter(region -> ! region.equals(regionToExclude)); + } + + } + + /** Metrics for a given application, cluster and deployment. 
*/ + record DeploymentMetrics(double queryRate, double growthRateHeadroom, double cpuCostPerQuery) { + + public ApplicationPatch.ClusterPatch asClusterPatch() { + return new ApplicationPatch.ClusterPatch(new ApplicationPatch.BcpGroupInfo(queryRate, growthRateHeadroom, cpuCostPerQuery)); + } + + DeploymentMetrics dividedBy(double d) { + return new DeploymentMetrics(queryRate / d, growthRateHeadroom / d, cpuCostPerQuery / d); + } + + DeploymentMetrics multipliedBy(double m) { + return new DeploymentMetrics(queryRate * m, growthRateHeadroom * m, cpuCostPerQuery * m); + } + + DeploymentMetrics add(DeploymentMetrics other) { + return new DeploymentMetrics(queryRate + other.queryRate, + growthRateHeadroom + other.growthRateHeadroom, + cpuCostPerQuery + other.cpuCostPerQuery); + } + + public static DeploymentMetrics empty() { return new DeploymentMetrics(0, 0, 0); } + + } + } diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ControllerMaintenance.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ControllerMaintenance.java index 241f2a83d6f..b64de5d5af4 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ControllerMaintenance.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ControllerMaintenance.java @@ -67,7 +67,7 @@ public class ControllerMaintenance extends AbstractComponent { maintainers.add(new HostInfoUpdater(controller, intervals.hostInfoUpdater)); maintainers.add(new ReindexingTriggerer(controller, intervals.reindexingTriggerer)); maintainers.add(new EndpointCertificateMaintainer(controller, intervals.endpointCertificateMaintainer)); - maintainers.add(new TrafficShareUpdater(controller, intervals.trafficFractionUpdater)); + maintainers.add(new BcpGroupUpdater(controller, intervals.trafficFractionUpdater)); maintainers.add(new ArchiveUriUpdater(controller, intervals.archiveUriUpdater)); maintainers.add(new 
ArchiveAccessMaintainer(controller, metric, intervals.archiveAccessMaintainer)); maintainers.add(new TenantRoleMaintainer(controller, intervals.tenantRoleMaintainer)); diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentMetricsMaintainer.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentMetricsMaintainer.java index 322c78aa7c1..fa917d2eb4e 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentMetricsMaintainer.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentMetricsMaintainer.java @@ -25,8 +25,8 @@ import java.util.logging.Level; import java.util.logging.Logger; /** - * Retrieves deployment metrics such as QPS and document count from the metric service and - * updates applications with this info. + * Retrieves deployment metrics such as QPS and document count over the config server API + * and updates application objects in the controller with this info. * * @author smorgrav * @author mpolden |