diff options
author | Bjørn Meland <bjormel@users.noreply.github.com> | 2024-05-16 14:22:09 +0000 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-05-16 14:22:09 +0000 |
commit | 6f7f6c7b17f8f952ac9adcc13310e3182ba88cef (patch) | |
tree | e5a544a0730a50096d60466ae22b61a51163a93a | |
parent | 468e9fbe9ae106328493f073b1bc8b009591900b (diff) | |
parent | cc2703290f229d6b225f717e2b38747b4e32f953 (diff) |
Merge pull request #31229 from vespa-engine/bjormel/autoscaling-logging
Enable detailed Autoscaling logging with PermanentFlag
6 files changed, 39 insertions, 10 deletions
diff --git a/flags/src/main/java/com/yahoo/vespa/flags/PermanentFlags.java b/flags/src/main/java/com/yahoo/vespa/flags/PermanentFlags.java index 2a667930add..ec92188a029 100644 --- a/flags/src/main/java/com/yahoo/vespa/flags/PermanentFlags.java +++ b/flags/src/main/java/com/yahoo/vespa/flags/PermanentFlags.java @@ -404,6 +404,12 @@ public class PermanentFlags { "Takes effect immediately", INSTANCE_ID); + public static final UnboundBooleanFlag AUTOSCALING_DETAILED_LOGGING = defineFeatureFlag( + "autoscaling-detailed-logging", true, + "Whether to log autoscaling decision data", + "Takes effect immediately", + INSTANCE_ID); + public static final UnboundIntFlag MAX_HOSTS_PER_HOUR = defineIntFlag( "max-hosts-per-hour", 40, "The number of hosts that can be provisioned per hour in a zone, before throttling is " + diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/AllocationOptimizer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/AllocationOptimizer.java index 61d4ced1367..5a790a1fe19 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/AllocationOptimizer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/AllocationOptimizer.java @@ -19,6 +19,7 @@ import static com.yahoo.vespa.hosted.provision.autoscale.Autoscaler.headroomRequ * @author bratseth */ public class AllocationOptimizer { + private static final java.util.logging.Logger log = java.util.logging.Logger.getLogger(AllocationOptimizer.class.getName()); // The min and max nodes to consider when not using application supplied limits private static final int minimumNodes = 2; // Since this number includes redundancy it cannot be lower than 2 @@ -37,8 +38,8 @@ public class AllocationOptimizer { * @return the best allocation, if there are any possible legal allocations, fulfilling the target * fully or partially, within the limits */ - public Optional<AllocatableResources> findBestAllocation(Load loadAdjustment, ClusterModel model, Limits limits) { - return findBestAllocations(loadAdjustment, model, limits).stream().findFirst(); + public Optional<AllocatableResources> findBestAllocation(Load loadAdjustment, ClusterModel model, Limits limits, boolean enableDetailedLogging) { + return findBestAllocations(loadAdjustment, model, limits, enableDetailedLogging).stream().findFirst(); } /** @@ -48,7 +49,7 @@ public class AllocationOptimizer { * @return the best allocations, if there are any possible legal allocations, fulfilling the target * fully or partially, within the limits. The list contains the three best allocations, sorted from most to least preferred. */ - public List<AllocatableResources> findBestAllocations(Load loadAdjustment, ClusterModel model, Limits limits) { + public List<AllocatableResources> findBestAllocations(Load loadAdjustment, ClusterModel model, Limits limits, boolean enableDetailedLogging) { if (limits.isEmpty()) limits = Limits.of(new ClusterResources(minimumNodes, 1, NodeResources.unspecified()), new ClusterResources(maximumNodes, maximumNodes, NodeResources.unspecified()), @@ -78,8 +79,15 @@ public class AllocationOptimizer { nodeRepository); if (allocatableResources.isEmpty()) continue; bestAllocations.add(allocatableResources.get()); + if (enableDetailedLogging) { + log.info("Adding allocatableResources to list for " + model.application().id() + " in " + model.current().clusterSpec().id() + ": " + + "\n\t" + allocatableResources.get().toString()); + } } } + if (enableDetailedLogging) { + log.info("Found " + bestAllocations.size() + " legal allocations for " + model.application().id() + " in " + model.current().clusterSpec().id()); + } return bestAllocations.stream() .sorted((one, other) -> { if (one.preferableTo(other, model)) diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java index 40819e709de..29ab6d65b9f 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java @@ -17,7 +17,7 @@ import java.util.List; * @author bratseth */ public class Autoscaler { - + private static final java.util.logging.Logger log = java.util.logging.Logger.getLogger(Autoscaler.class.getName()); /** What cost difference is worth a reallocation? */ private static final double costDifferenceWorthReallocation = 0.1; /** What resource difference is worth a reallocation? */ @@ -44,7 +44,7 @@ public class Autoscaler { var model = model(application, cluster, clusterNodes); if (model.isEmpty() || ! model.isStable(nodeRepository)) return List.of(); - var targets = allocationOptimizer.findBestAllocations(model.loadAdjustment(), model, Limits.empty()); + var targets = allocationOptimizer.findBestAllocations(model.loadAdjustment(), model, Limits.empty(), false); return targets.stream() .map(target -> toAutoscaling(target, model)) .toList(); @@ -54,9 +54,10 @@ public class Autoscaler { * Autoscale a cluster by load. This returns a better allocation (if found) inside the min and max limits. * * @param clusterNodes the list of all the active nodes in a cluster + * @param enableDetailedLogging Whether to log autoscaling decision data * @return scaling advice for this cluster */ - public Autoscaling autoscale(Application application, Cluster cluster, NodeList clusterNodes) { + public Autoscaling autoscale(Application application, Cluster cluster, NodeList clusterNodes, boolean enableDetailedLogging) { var limits = Limits.of(cluster); var model = model(application, cluster, clusterNodes); if (model.isEmpty()) return Autoscaling.empty(); @@ -68,9 +69,12 @@ public class Autoscaler { return Autoscaling.dontScale(Status.waiting, "Cluster change in progress", model); var loadAdjustment = model.loadAdjustment(); + if (enableDetailedLogging) { + log.info("Application: " + application.id().toShortString() + ", loadAdjustment: " + loadAdjustment.toString()); + } // Ensure we only scale down if we'll have enough headroom to not scale up again given a small load increase - var target = allocationOptimizer.findBestAllocation(loadAdjustment, model, limits); + var target = allocationOptimizer.findBestAllocation(loadAdjustment, model, limits, enableDetailedLogging); if (target.isEmpty()) return Autoscaling.dontScale(Status.insufficient, "No allocations are possible within configured limits", model); diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainer.java index 2bec9aa6115..5c9c5fe30d7 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainer.java @@ -39,6 +39,7 @@ public class AutoscalingMaintainer extends NodeRepositoryMaintainer { private final Deployer deployer; private final Metric metric; private final BooleanFlag enabledFlag; + private final BooleanFlag enableDetailedLoggingFlag; public AutoscalingMaintainer(NodeRepository nodeRepository, Deployer deployer, @@ -49,6 +50,7 @@ public class AutoscalingMaintainer extends NodeRepositoryMaintainer { this.deployer = deployer; this.metric = metric; this.enabledFlag = PermanentFlags.AUTOSCALING.bindTo(nodeRepository.flagSource()); + this.enableDetailedLoggingFlag = PermanentFlags.AUTOSCALING_DETAILED_LOGGING.bindTo(nodeRepository.flagSource()); } @Override @@ -80,6 +82,7 @@ public class AutoscalingMaintainer extends NodeRepositoryMaintainer { */ private boolean autoscale(ApplicationId applicationId, ClusterSpec.Id clusterId) { boolean redeploy = false; + boolean enableDetailedLogging = enableDetailedLoggingFlag.with(Dimension.INSTANCE_ID, applicationId.serializedForm()).value(); try (var lock = nodeRepository().applications().lock(applicationId)) { Optional<Application> application = nodeRepository().applications().get(applicationId); if (application.isEmpty()) return true; @@ -95,7 +98,7 @@ public class AutoscalingMaintainer extends NodeRepositoryMaintainer { // Autoscale unless an autoscaling is already in progress Autoscaling autoscaling = null; if (cluster.target().resources().isEmpty() && !cluster.scalingInProgress()) { - autoscaling = autoscaler.autoscale(application.get(), cluster, clusterNodes); + autoscaling = autoscaler.autoscale(application.get(), cluster, clusterNodes, enableDetailedLogging); if (autoscaling.isPresent() || cluster.target().isEmpty()) // Ignore empty from recently started servers cluster = cluster.withTarget(autoscaling); } @@ -108,6 +111,14 @@ public class AutoscalingMaintainer extends NodeRepositoryMaintainer { if (autoscaling != null && autoscaling.resources().isPresent() && !current.equals(autoscaling.resources().get())) { redeploy = true; logAutoscaling(current, autoscaling.resources().get(), applicationId, clusterNodes.not().retired()); + if (enableDetailedLogging) { + log.info("autoscaling data for " + applicationId.toFullString() + ": " + + "\n\tmetrics().cpuCostPerQuery(): " + autoscaling.metrics().cpuCostPerQuery() + + "\n\tmetrics().queryRate(): " + autoscaling.metrics().queryRate() + + "\n\tmetrics().growthRateHeadroom(): " + autoscaling.metrics().growthRateHeadroom() + + "\n\tpeak(): " + autoscaling.peak().toString() + + "\n\tideal(): " + autoscaling.ideal().toString()); + } } } catch (ApplicationLockException e) { diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeRepositoryProvisioner.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeRepositoryProvisioner.java index 7ac80dfbdb3..2588b02d712 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeRepositoryProvisioner.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeRepositoryProvisioner.java @@ -209,7 +209,7 @@ public class NodeRepositoryProvisioner implements Provisioner { return model.current().advertisedResources(); // Otherwise, find an allocation that preserves the current resources as well as possible - return allocationOptimizer.findBestAllocation(Load.one(), model, limits) + return allocationOptimizer.findBestAllocation(Load.one(), model, limits, false) .orElseThrow(() -> newNoAllocationPossible(model.current().clusterSpec(), limits)) .advertisedResources(); } diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/DynamicProvisioningTester.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/DynamicProvisioningTester.java index 183ff85da47..401b6d83651 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/DynamicProvisioningTester.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/DynamicProvisioningTester.java @@ -165,7 +165,7 @@ public class DynamicProvisioningTester { nodeRepository().applications().put(application, lock); } return autoscaler.autoscale(application, application.clusters().get(cluster.id()), - nodeRepository().nodes().list(Node.State.active).owner(applicationId)); + nodeRepository().nodes().list(Node.State.active).owner(applicationId), false); } public List<Autoscaling> suggest(ApplicationId applicationId, ClusterSpec.Id clusterId, |