diff options
author | Jon Bratseth <bratseth@gmail.com> | 2021-03-11 18:40:06 +0100 |
---|---|---|
committer | Jon Bratseth <bratseth@gmail.com> | 2021-03-11 18:40:06 +0100 |
commit | d3c80d38bab48b77b64cb5529e08136fe796aca6 (patch) | |
tree | a00b3199541d8ae474d0be4c970217d4b4900435 | |
parent | cb3f517a6d0e6aeb4433552cbb9d2c0b6c3c935c (diff) |
Expose more cluster data in nodes/v2
9 files changed, 79 insertions, 38 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java index 5d5c6fdac5a..c7549a5ddee 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java @@ -80,8 +80,13 @@ public class Autoscaler { "Have measurements from " + nodesMeasured + " nodes, but require from " + clusterNodes.size()); + var scalingDuration = cluster.scalingDuration(clusterNodes.clusterSpec()); var clusterTimeseries = metricsDb.getClusterTimeseries(application.id(), cluster.id()); - var target = ResourceTarget.idealLoad(clusterTimeseries, clusterNodesTimeseries, currentAllocation, application); + var target = ResourceTarget.idealLoad(scalingDuration, + clusterTimeseries, + clusterNodesTimeseries, + currentAllocation, + application); Optional<AllocatableClusterResources> bestAllocation = allocationOptimizer.findBestAllocation(target, currentAllocation, limits); diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterNodesTimeseries.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterNodesTimeseries.java index 173d76e4c26..2d0e77742ec 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterNodesTimeseries.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterNodesTimeseries.java @@ -16,14 +16,12 @@ import java.util.stream.Collectors; */ public class ClusterNodesTimeseries { - private final Cluster cluster; private final NodeList clusterNodes; /** The measurements for all nodes in this snapshot */ private final List<NodeTimeseries> timeseries; public ClusterNodesTimeseries(Duration period, Cluster cluster, NodeList clusterNodes, MetricsDb db) { - this.cluster = cluster; this.clusterNodes = clusterNodes; var timeseries = db.getNodeTimeseries(period, clusterNodes); @@ -35,12 +33,6 @@ public class ClusterNodesTimeseries { this.timeseries = timeseries; } - /** The cluster this is a timeseries for */ - public Cluster cluster() { return cluster; } - - /** The nodes of the cluster this is a timeseries for */ - public NodeList clusterNodes() { return clusterNodes; } - /** Returns the average number of measurements per node */ public int measurementsPerNode() { int measurementCount = timeseries.stream().mapToInt(m -> m.size()).sum(); diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterTimeseries.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterTimeseries.java index 5b6ed43b713..a435814c21e 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterTimeseries.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterTimeseries.java @@ -23,6 +23,8 @@ public class ClusterTimeseries { private final ClusterSpec.Id cluster; private final List<ClusterMetricSnapshot> snapshots; + private Double cachedMaxQueryGrowthRate = null; + ClusterTimeseries(ClusterSpec.Id cluster, List<ClusterMetricSnapshot> snapshots) { this.cluster = cluster; List<ClusterMetricSnapshot> sortedSnapshots = new ArrayList<>(snapshots); @@ -48,6 +50,12 @@ public class ClusterTimeseries { /** The max query growth rate we can predict from this time-series as a fraction of the current traffic per minute */ public double maxQueryGrowthRate() { + if (cachedMaxQueryGrowthRate != null) + return cachedMaxQueryGrowthRate; + return cachedMaxQueryGrowthRate = computeMaxQueryGrowthRate(); + } + + private double computeMaxQueryGrowthRate() { if (snapshots.isEmpty()) return 0.1; // Find the period having the highest growth rate, where total growth exceeds 30% increase diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ResourceTarget.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ResourceTarget.java index f29181b8343..ab6a6d548e9 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ResourceTarget.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ResourceTarget.java @@ -49,12 +49,13 @@ public class ResourceTarget { } /** Create a target of achieving ideal load given a current load */ - public static ResourceTarget idealLoad(ClusterTimeseries clusterTimeseries, + public static ResourceTarget idealLoad(Duration scalingDuration, + ClusterTimeseries clusterTimeseries, ClusterNodesTimeseries clusterNodesTimeseries, AllocatableClusterResources current, Application application) { return new ResourceTarget(nodeUsage(Resource.cpu, clusterNodesTimeseries.averageLoad(Resource.cpu), current) - / idealCpuLoad(clusterTimeseries, clusterNodesTimeseries, application), + / idealCpuLoad(scalingDuration, clusterTimeseries, application), nodeUsage(Resource.memory, clusterNodesTimeseries.averageLoad(Resource.memory), current) / Resource.memory.idealAverageLoad(), nodeUsage(Resource.disk, clusterNodesTimeseries.averageLoad(Resource.disk), current) @@ -71,12 +72,11 @@ public class ResourceTarget { } /** Ideal cpu load must take the application traffic fraction into account */ - private static double idealCpuLoad(ClusterTimeseries clusterTimeseries, - ClusterNodesTimeseries clusterNodesTimeseries, - Application application) { + public static double idealCpuLoad(Duration scalingDuration, + ClusterTimeseries clusterTimeseries, + Application application) { // What's needed to have headroom for growth during scale-up as a fraction of current resources? double maxGrowthRate = clusterTimeseries.maxQueryGrowthRate(); // in fraction per minute of the current traffic - Duration scalingDuration = clusterNodesTimeseries.cluster().scalingDuration(clusterNodesTimeseries.clusterNodes().clusterSpec()); double growthRateHeadroom = 1 + maxGrowthRate * scalingDuration.toMinutes(); // Cap headroom at 10% above the historical observed peak double fractionOfMax = clusterTimeseries.currentQueryFractionOfMax(); @@ -96,4 +96,8 @@ public class ResourceTarget { return 1 / growthRateHeadroom * 1 / trafficShiftHeadroom * Resource.cpu.idealAverageLoad(); } + public static double idealMemoryLoad() { return Resource.memory.idealAverageLoad(); } + + public static double idealDiskLoad() { return Resource.disk.idealAverageLoad(); } + } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SpareCapacityMaintainer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SpareCapacityMaintainer.java index 79f3dad75d3..0307ae13b24 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SpareCapacityMaintainer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SpareCapacityMaintainer.java @@ -163,7 +163,7 @@ public class SpareCapacityMaintainer extends NodeRepositoryMaintainer { ApplicationId application = nodeToRetire.get().allocation().get().owner(); try (MaintenanceDeployment deployment = new MaintenanceDeployment(application, deployer, metric, nodeRepository())) { - if ( ! deployment.isValid()) return; // this will be done at another config server + if ( ! deployment.isValid()) return; Optional<Node> nodeWithWantToRetire = nodeRepository().nodes().node(nodeToRetire.get().hostname()) .map(node -> node.withWantToRetire(true, Agent.SpareCapacityMaintainer, nodeRepository().clock().instant())); @@ -171,7 +171,7 @@ public class SpareCapacityMaintainer extends NodeRepositoryMaintainer { nodeRepository().nodes().write(nodeWithWantToRetire.get(), deployment.applicationLock().get()); log.log(Level.INFO, String.format("Redeploying %s to move %s from overcommitted host", - application, nodeToRetire.get().hostname())); + application, nodeToRetire.get().hostname())); deployment.activate(); } } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/ApplicationSerializer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/ApplicationSerializer.java index 4235bae6850..4d1c963d8ea 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/ApplicationSerializer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/ApplicationSerializer.java @@ -1,6 +1,7 @@ // Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.vespa.hosted.provision.restapi; +import com.yahoo.config.provision.ApplicationId; import com.yahoo.config.provision.ClusterResources; import com.yahoo.slime.Cursor; import com.yahoo.slime.Slime; @@ -10,8 +11,10 @@ import com.yahoo.vespa.hosted.provision.applications.Application; import com.yahoo.vespa.hosted.provision.applications.Cluster; import com.yahoo.vespa.hosted.provision.applications.ScalingEvent; import com.yahoo.vespa.hosted.provision.autoscale.ClusterNodesTimeseries; +import com.yahoo.vespa.hosted.provision.autoscale.ClusterTimeseries; import com.yahoo.vespa.hosted.provision.autoscale.MetricsDb; import com.yahoo.vespa.hosted.provision.autoscale.Resource; +import com.yahoo.vespa.hosted.provision.autoscale.ResourceTarget; import java.net.URI; import java.time.Duration; @@ -20,50 +23,60 @@ import java.util.List; /** * Serializes application information for nodes/v2/application responses + * + * @author bratseth */ public class ApplicationSerializer { - public static Slime toSlime(Application application, List<Node> applicationNodes, MetricsDb metricsDb, URI applicationUri) { + public static Slime toSlime(Application application, NodeList applicationNodes, MetricsDb metricsDb, URI applicationUri) { Slime slime = new Slime(); toSlime(application, applicationNodes, metricsDb, slime.setObject(), applicationUri); return slime; } private static void toSlime(Application application, - List<Node> applicationNodes, + NodeList applicationNodes, MetricsDb metricsDb, Cursor object, URI applicationUri) { object.setString("url", applicationUri.toString()); object.setString("id", application.id().toFullString()); - clustersToSlime(application.clusters().values(), applicationNodes, metricsDb, object.setObject("clusters")); + clustersToSlime(application, applicationNodes, metricsDb, object.setObject("clusters")); } - private static void clustersToSlime(Collection<Cluster> clusters, - List<Node> applicationNodes, + private static void clustersToSlime(Application application, + NodeList applicationNodes, MetricsDb metricsDb, Cursor clustersObject) { - clusters.forEach(cluster -> toSlime(cluster, applicationNodes, metricsDb, clustersObject)); + application.clusters().values().forEach(cluster -> toSlime(application, cluster, applicationNodes, metricsDb, clustersObject)); } - private static void toSlime(Cluster cluster, - List<Node> applicationNodes, + private static void toSlime(Application application, + Cluster cluster, + NodeList applicationNodes, MetricsDb metricsDb, Cursor clustersObject) { - NodeList nodes = NodeList.copyOf(applicationNodes).not().retired().cluster(cluster.id()); + NodeList nodes = applicationNodes.not().retired().cluster(cluster.id()); if (nodes.isEmpty()) return; ClusterResources currentResources = nodes.toResources(); + Duration scalingDuration = cluster.scalingDuration(nodes.clusterSpec()); + var clusterNodesTimeseries = new ClusterNodesTimeseries(Duration.ofHours(1), cluster, nodes, metricsDb); + var clusterTimeseries = metricsDb.getClusterTimeseries(application.id(), cluster.id()); Cursor clusterObject = clustersObject.setObject(cluster.id().value()); + clusterObject.setString("type", nodes.clusterSpec().type().name()); toSlime(cluster.minResources(), clusterObject.setObject("min")); toSlime(cluster.maxResources(), clusterObject.setObject("max")); toSlime(currentResources, clusterObject.setObject("current")); if (cluster.shouldSuggestResources(currentResources)) cluster.suggestedResources().ifPresent(suggested -> toSlime(suggested.resources(), clusterObject.setObject("suggested"))); cluster.targetResources().ifPresent(target -> toSlime(target, clusterObject.setObject("target"))); - clusterUtilizationToSlime(cluster, NodeList.copyOf(applicationNodes), metricsDb, clusterObject.setObject("utilization")); + clusterUtilizationToSlime(application, scalingDuration, clusterTimeseries, clusterNodesTimeseries, clusterObject.setObject("utilization")); scalingEventsToSlime(cluster.scalingEvents(), clusterObject.setArray("scalingEvents")); clusterObject.setString("autoscalingStatus", cluster.autoscalingStatus()); + clusterObject.setLong("scalingDuration", scalingDuration.toMillis()); + clusterObject.setDouble("maxQueryGrowthRate", clusterTimeseries.maxQueryGrowthRate()); + clusterObject.setDouble("currentQueryFractionOfMax", clusterTimeseries.currentQueryFractionOfMax()); } private static void toSlime(ClusterResources resources, Cursor clusterResourcesObject) { @@ -72,12 +85,17 @@ public class ApplicationSerializer { NodeResourcesSerializer.toSlime(resources.nodeResources(), clusterResourcesObject.setObject("resources")); } - private static void clusterUtilizationToSlime(Cluster cluster, NodeList nodes, MetricsDb metricsDb, Cursor utilizationObject) { - var timeseries = new ClusterNodesTimeseries(Duration.ofHours(1), cluster, nodes, metricsDb); - - utilizationObject.setDouble("cpu", timeseries.averageLoad(Resource.cpu)); - utilizationObject.setDouble("memory", timeseries.averageLoad(Resource.memory)); - utilizationObject.setDouble("disk", timeseries.averageLoad(Resource.disk)); + private static void clusterUtilizationToSlime(Application application, + Duration scalingDuration, + ClusterTimeseries clusterTimeseries, + ClusterNodesTimeseries clusterNodesTimeseries, + Cursor utilizationObject) { + utilizationObject.setDouble("cpu", clusterNodesTimeseries.averageLoad(Resource.cpu)); + utilizationObject.setDouble("idealCpu", ResourceTarget.idealCpuLoad(scalingDuration, clusterTimeseries, application)); + utilizationObject.setDouble("memory", clusterNodesTimeseries.averageLoad(Resource.memory)); + utilizationObject.setDouble("idealMemory", ResourceTarget.idealMemoryLoad()); + utilizationObject.setDouble("disk", clusterNodesTimeseries.averageLoad(Resource.disk)); + utilizationObject.setDouble("idealDisk", ResourceTarget.idealDiskLoad()); } private static void scalingEventsToSlime(List<ScalingEvent> scalingEvents, Cursor scalingEventsArray) { diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodesV2ApiHandler.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodesV2ApiHandler.java index 62c7f40f7da..2442ff9d565 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodesV2ApiHandler.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodesV2ApiHandler.java @@ -445,7 +445,7 @@ public class NodesV2ApiHandler extends LoggingRequestHandler { if (application.isEmpty()) return ErrorResponse.notFoundError("No application '" + id + "'"); Slime slime = ApplicationSerializer.toSlime(application.get(), - nodeRepository.nodes().list(Node.State.active).owner(id).asList(), + nodeRepository.nodes().list(Node.State.active).owner(id), metricsDb, withPath("/nodes/v2/applications/" + id, uri)); return new SlimeJsonResponse(slime); diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/responses/application1.json b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/responses/application1.json index a35c742bc2a..1e9a2d60837 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/responses/application1.json +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/responses/application1.json @@ -3,6 +3,7 @@ "id" : "tenant1.application1.instance1", "clusters" : { "id1" : { + "type": "container", "min" : { "nodes" : 2, "groups" : 1, @@ -65,8 +66,11 @@ }, "utilization" : { "cpu" : 0.0, + "idealCpu": 0.2, "memory" : 0.0, - "disk" : 0.0 + "idealMemory": 0.7, + "disk" : 0.0, + "idealDisk": 0.6 }, "scalingEvents" : [ { @@ -97,7 +101,10 @@ "at" : 123 } ], - "autoscalingStatus" : "" + "autoscalingStatus": "", + "scalingDuration": 600000, + "maxQueryGrowthRate": 0.1, + "currentQueryFractionOfMax": 0.5 } } } diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/responses/application2.json b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/responses/application2.json index baf2528c74a..376b748ff8e 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/responses/application2.json +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/responses/application2.json @@ -3,6 +3,7 @@ "id": "tenant2.application2.instance2", "clusters": { "id2": { + "type": "content", "min": { "nodes": 2, "groups": 1, @@ -41,8 +42,11 @@ }, "utilization" : { "cpu" : 0.0, + "idealCpu": 0.19047619047619047, "memory" : 0.0, - "disk" : 0.0 + "idealMemory": 0.7, + "disk" : 0.0, + "idealDisk": 0.6 }, "scalingEvents" : [ { @@ -73,7 +77,10 @@ "at" : 123 } ], - "autoscalingStatus" : "" + "autoscalingStatus" : "", + "scalingDuration": 43200000, + "maxQueryGrowthRate": 0.1, + "currentQueryFractionOfMax": 0.5 } } } |