Don't make small changes, don't autoscale when in flux

author: Jon Bratseth <bratseth@verizonmedia.com> 2020-02-18 17:25:37 +0100
committer: Jon Bratseth <bratseth@verizonmedia.com> 2020-02-18 17:25:37 +0100
commit: c6d5dcff1becb08fb86dcc7ed0f387bf8bd249d0 (patch)
tree: 4f14dab93be1a75e9191d219bac74107eadcfe7a
parent: 1a81ebfa954d0450063c3fb4e891950ed95a1a5f (diff)
3 files changed, 113 insertions, 36 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java
index 721df4c322f..31783ca0059 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java
@@ -21,8 +21,24 @@ import java.util.Optional;
  */
 public class Autoscaler {
 
+    /*
+     TODO:
+     - X Don't always go for more, smaller nodes
+     - X Test gc
+     - X Test AutoscalingMaintainer
+     - X Implement node metrics fetch
+     - X Avoid making decisions for the same app at multiple config servers
+     - X Multiple groups
+     - Have a better idea about whether we have sufficient information to make decisions
+     - Consider taking spikes/variance into account
+     - Measure observed regulation lag (startup+redistribution) and take it into account when deciding regulation observation window
+     */
+
     private static final int minimumMeasurements = 500; // TODO: Per node instead? Also say something about interval?
 
+    /** Only change if the difference between the current and best ratio is larger than this */
+    private static final double resourceDifferenceRatioWorthReallocation = 0.1;
+
     // We only depend on the ratios between these values
     private static final double cpuUnitCost = 12.0;
     private static final double memoryUnitCost = 1.2;
@@ -41,27 +57,57 @@ public class Autoscaler {
     }
 
     public Optional<ClusterResources> autoscale(ApplicationId applicationId, ClusterSpec cluster, List<Node> clusterNodes) {
+        if (clusterNodes.stream().anyMatch(node -> node.status().wantToRetire() ||
+                                                   node.allocation().get().membership().retired() ||
+                                                   node.allocation().get().isRemovable()))
+            return Optional.empty(); // Don't autoscale clusters that are in flux
+
+        ClusterResources currentAllocation = new ClusterResources(clusterNodes.size(),
+                                                                  clusterNodes.get(0).flavor().resources());
         Optional<Double> totalCpuSpent    = averageUseOf(Resource.cpu,    applicationId, cluster, clusterNodes);
         Optional<Double> totalMemorySpent = averageUseOf(Resource.memory, applicationId, cluster, clusterNodes);
         Optional<Double> totalDiskSpent   = averageUseOf(Resource.disk,   applicationId, cluster, clusterNodes);
         if (totalCpuSpent.isEmpty() || totalMemorySpent.isEmpty() || totalDiskSpent.isEmpty()) return Optional.empty();
 
-        Optional<ClusterResources> bestTarget = Optional.empty();
+        Optional<ClusterResources> bestAllocation = findBestAllocation(totalCpuSpent.get(),
+                                                                       totalMemorySpent.get(),
+                                                                       totalDiskSpent.get(),
+                                                                       currentAllocation.resources());
+        if (bestAllocation.isPresent() && isSimilar(bestAllocation.get(), currentAllocation))
+            return Optional.empty(); // Avoid small changes
+        return bestAllocation;
+    }
+
+    private Optional<ClusterResources> findBestAllocation(double totalCpuSpent,
+                                                          double totalMemorySpent,
+                                                          double totalDiskSpent,
+                                                          NodeResources currentResources) {
+        Optional<ClusterResources> bestAllocation = Optional.empty();
         // Try all the node counts allowed by the configuration -
         // -1 to translate from true allocated counts to counts allowing for a node to be down
         for (int targetCount = minimumNodesPerCluster - 1; targetCount <= maximumNodesPerCluster - 1; targetCount++ ) {
             // The resources per node we need if we distribute the total spent over targetCount nodes at ideal load:
             NodeResources targetResources = targetResources(targetCount,
-                                                            totalCpuSpent.get(), totalMemorySpent.get(), totalDiskSpent.get(),
-                                                            clusterNodes.get(0).flavor().resources());
-            Optional<ClusterResources> target = toEffectiveResources(targetCount, targetResources);
-            System.out.println("Trying " + targetCount + " nodes: " + targetResources + ", effective: " + target);
-            if (target.isEmpty()) continue;
-
-            if (bestTarget.isEmpty() || target.get().cost() < bestTarget.get().cost())
-                bestTarget = target;
+                                                            totalCpuSpent, totalMemorySpent, totalDiskSpent,
+                                                            currentResources);
+            Optional<ClusterResources> allocation = toEffectiveResources(targetCount, targetResources);
+            if (allocation.isEmpty()) continue;
+
+            if (bestAllocation.isEmpty() || allocation.get().cost() < bestAllocation.get().cost())
+                bestAllocation = allocation;
         }
-        return bestTarget;
+        return bestAllocation;
+    }
+
+    private boolean isSimilar(ClusterResources a1, ClusterResources a2) {
+        if (a1.count() != a2.count()) return false; // A full node is always a significant difference
+        return isSimilar(a1.resources().vcpu(), a2.resources().vcpu()) &&
+               isSimilar(a1.resources().memoryGb(), a2.resources().memoryGb()) &&
+               isSimilar(a1.resources().diskGb(), a2.resources().diskGb());
+    }
+
+    private boolean isSimilar(double r1, double r2) {
+        return Math.abs(r1 - r2) / r1 < resourceDifferenceRatioWorthReallocation;
     }
 
     /**
@@ -84,9 +130,8 @@ public class Autoscaler {
     private Optional<NodeResources> toEffectiveResources(NodeResources nodeResources) {
         if (allowsHostSharing(nodeRepository.zone().cloud())) {
             // Return the requested resources, or empty if they cannot fit on existing hosts
-            for (Flavor flavor : nodeRepository.getAvailableFlavors().getFlavors()) {
+            for (Flavor flavor : nodeRepository.getAvailableFlavors().getFlavors())
                 if (flavor.resources().satisfies(nodeResources)) return Optional.of(nodeResources);
-            }
             return Optional.empty();
         }
         else {
@@ -128,7 +173,6 @@ public class Autoscaler {
 
         if (window.measurementCount() < minimumMeasurements) return Optional.empty();
         if (window.hostnames() != clusterNodes.size()) return Optional.empty(); // Regulate only when all nodes are measured
-        // TODO: Bail also if allocations have changed in the time window
 
         return Optional.of(window.average() * resource.valueFrom(currentResources) * clusterNodes.size());
     }
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTest.java
index 20b9401df9d..5ecb8794bc6 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTest.java
@@ -24,23 +24,30 @@ public class AutoscalingTest {
         // deploy
         tester.deploy(application1, cluster1, 5, resources);
 
-        assertTrue("No metrics -> No change", tester.autoscale(application1, cluster1).isEmpty());
+        assertTrue("No measurements -> No change", tester.autoscale(application1, cluster1).isEmpty());
 
-        tester.addMeasurements( 0.3f, 60, application1);
-        assertTrue("Too few metrics -> No change", tester.autoscale(application1, cluster1).isEmpty());
+        tester.addMeasurements( 0.25f, 60, application1);
+        assertTrue("Too few measurements -> No change", tester.autoscale(application1, cluster1).isEmpty());
 
-        tester.addMeasurements( 0.3f, 60, application1);
-        tester.assertResources("Scaling up since resource usage is too high",
-                               10, 2.5, 23.8, 23.8,
-                               tester.autoscale(application1, cluster1));
+        tester.addMeasurements( 0.25f, 60, application1);
+        ClusterResources scaledResources = tester.assertResources("Scaling up since resource usage is too high",
+                                                                 10, 1.7,  44.4, 44.4,
+                                                                  tester.autoscale(application1, cluster1));
 
-        tester.assertResources("No new info -> Same result",
-                               10, 2.5, 23.8, 23.8,
-                               tester.autoscale(application1, cluster1));
+        tester.deploy(application1, cluster1, scaledResources);
+        assertTrue("Cluster in flux -> No further change", tester.autoscale(application1, cluster1).isEmpty());
+
+        tester.deactivateRetired(application1, cluster1, scaledResources);
+        tester.addMeasurements( 0.8f, 3, application1);
+        assertTrue("Load change is large, but insufficient measurements for new config -> No change",
+                   tester.autoscale(application1, cluster1).isEmpty());
+
+        tester.addMeasurements( 0.19f, 100, application1);
+        assertTrue("Load change is small -> No change", tester.autoscale(application1, cluster1).isEmpty());
 
         tester.addMeasurements( 0.1f, 120, application1);
-        tester.assertResources("Scale down since resource usage has gone down",
-                               10, 1.7, 15.9, 15.9,
+        tester.assertResources("Scaling down since resource usage has gone down significantly",
+                               10, 1.2, 44.4, 44.4,
                                tester.autoscale(application1, cluster1));
     }
 
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTester.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTester.java
index 646f20d0528..ba61b9e579a 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTester.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTester.java
@@ -8,10 +8,12 @@ import com.yahoo.config.provision.ClusterSpec;
 import com.yahoo.config.provision.Environment;
 import com.yahoo.config.provision.HostSpec;
 import com.yahoo.config.provision.NodeResources;
+import com.yahoo.config.provision.NodeType;
 import com.yahoo.config.provision.RegionName;
 import com.yahoo.config.provision.Zone;
 import com.yahoo.config.provisioning.FlavorsConfig;
 import com.yahoo.test.ManualClock;
+import com.yahoo.transaction.Mutex;
 import com.yahoo.vespa.hosted.provision.Node;
 import com.yahoo.vespa.hosted.provision.NodeRepository;
 import com.yahoo.vespa.hosted.provision.provisioning.ProvisioningTester;
@@ -29,12 +31,12 @@ class AutoscalingTester {
     private final Autoscaler autoscaler;
     private final NodeMetricsDb db;
 
-    public AutoscalingTester(NodeResources... resources) {
+    public AutoscalingTester(NodeResources hostResources) {
         provisioningTester = new ProvisioningTester.Builder().zone(new Zone(Environment.prod, RegionName.from("us-east")))
-                                                             .flavorsConfig(asConfig(resources))
+                                                             .flavorsConfig(asConfig(hostResources))
                                                              .build();
-        for (NodeResources nodeResources : resources)
-            provisioningTester.makeReadyNodes(20, nodeResources);
+        provisioningTester.makeReadyNodes(20, "flavor0", NodeType.host, 8);
+        provisioningTester.deployZoneApp();
 
         db = new NodeMetricsDb();
         autoscaler = new Autoscaler(db, nodeRepository());
@@ -51,35 +53,59 @@ class AutoscalingTester {
                                    false);
     }
 
+    public void deploy(ApplicationId application, ClusterSpec cluster, ClusterResources resources) {
+        deploy(application, cluster, resources.count(), resources.resources());
+    }
+
     public void deploy(ApplicationId application, ClusterSpec cluster, int count, NodeResources resources) {
         List<HostSpec> hosts = provisioningTester.prepare(application, cluster, Capacity.fromCount(count, resources), 1);
         provisioningTester.activate(application, hosts);
+
+    }
+
+    public void deactivateRetired(ApplicationId application, ClusterSpec cluster, ClusterResources resources) {
+        try (Mutex lock = nodeRepository().lock(application)){
+            for (Node node : nodeRepository().getNodes(application, Node.State.active)) {
+                if (node.allocation().get().membership().retired())
+                    nodeRepository().write(node.with(node.allocation().get().removable()), lock);
+            }
+        }
+        deploy(application, cluster, resources);
     }
 
-    public void addMeasurements(float value, int count, ApplicationId applicationId) {
-        List<Node> nodes = nodeRepository().getNodes(applicationId);
+    /**
+     * Adds measurements with the given cpu value and ideal values for the other resources,
+     * scaled to take one node redundancy into account.
+     * (I.e. we adjust to measure a bit lower load than "naively" wanted to offset for the autoscaler
+     * wanting to see the ideal load with one node missing.)
+     */
+    public void addMeasurements(float cpuValue, int count, ApplicationId applicationId) {
+        List<Node> nodes = nodeRepository().getNodes(applicationId, Node.State.active);
+        float oneExtraNodeFactor = (float)(nodes.size() - 1.0) / (nodes.size());
         for (int i = 0; i < count; i++) {
             clock().advance(Duration.ofMinutes(1));
             for (Node node : nodes) {
                 for (Resource resource : Resource.values())
-                    db.add(node, resource, clock().instant(), value);
+                    db.add(node, resource, clock().instant(),
+                           (resource == Resource.cpu ? cpuValue : (float)resource.idealAverageLoad()) * oneExtraNodeFactor);
             }
         }
     }
 
     public Optional<ClusterResources> autoscale(ApplicationId application, ClusterSpec cluster) {
-        return autoscaler.autoscale(application, cluster, nodeRepository().getNodes(application));
+        return autoscaler.autoscale(application, cluster, nodeRepository().getNodes(application, Node.State.active));
     }
 
-    public void assertResources(String message,
-                                int nodeCount, double approxCpu, double approxMemory, double approxDisk,
-                                Optional<ClusterResources> actualResources) {
+    public ClusterResources assertResources(String message,
+                                            int nodeCount, double approxCpu, double approxMemory, double approxDisk,
+                                            Optional<ClusterResources> actualResources) {
         double delta = 0.0000000001;
         assertTrue(message, actualResources.isPresent());
         assertEquals("Node count " + message, nodeCount, actualResources.get().count());
         assertEquals("Cpu: "    + message, approxCpu,    Math.round(actualResources.get().resources().vcpu()     * 10) / 10.0, delta);
         assertEquals("Memory: " + message, approxMemory, Math.round(actualResources.get().resources().memoryGb() * 10) / 10.0, delta);
         assertEquals("Disk: "   + message, approxDisk,   Math.round(actualResources.get().resources().diskGb()   * 10) / 10.0, delta);
+        return actualResources.get();
     }
 
     public ManualClock clock() {
author	Jon Bratseth <bratseth@verizonmedia.com>	2020-02-18 17:25:37 +0100
committer	Jon Bratseth <bratseth@verizonmedia.com>	2020-02-18 17:25:37 +0100
commit	c6d5dcff1becb08fb86dcc7ed0f387bf8bd249d0 (patch)
tree	4f14dab93be1a75e9191d219bac74107eadcfe7a
parent	1a81ebfa954d0450063c3fb4e891950ed95a1a5f (diff)