summaryrefslogtreecommitdiffstats
path: root/node-repository
diff options
context:
space:
mode:
authorJon Bratseth <bratseth@oath.com>2020-05-18 19:31:17 +0200
committerGitHub <noreply@github.com>2020-05-18 19:31:17 +0200
commitf0fd6a1ab95632f1d05ae576f0449ea92576150f (patch)
tree241f5d04568092294c4b05e6bffca4e3438714f8 /node-repository
parent93ecbc5037cdcc318055f769365382ff7cdc698f (diff)
parent82c58b07af44182b3094588f8fd1f4e31df7fc55 (diff)
Merge pull request #13284 from vespa-engine/hmusum/reduce-wait-time-after-previous-deployment
Reduce wait time after previous deployment
Diffstat (limited to 'node-repository')
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/Rebalancer.java2
-rw-r--r--node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/RebalancerTest.java217
2 files changed, 121 insertions, 98 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/Rebalancer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/Rebalancer.java
index c7356f33d6b..19c612a46c5 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/Rebalancer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/Rebalancer.java
@@ -22,7 +22,7 @@ import java.util.Optional;
* @author bratseth
*/
public class Rebalancer extends NodeRepositoryMaintainer {
- static final Duration waitTimeAfterPreviousDeployment = Duration.ofMinutes(30);
+ static final Duration waitTimeAfterPreviousDeployment = Duration.ofMinutes(10);
private final Deployer deployer;
private final Optional<HostProvisioner> hostProvisioner;
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/RebalancerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/RebalancerTest.java
index 9e384dc162b..19dc3699dd9 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/RebalancerTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/RebalancerTest.java
@@ -12,13 +12,14 @@ import com.yahoo.config.provision.NodeType;
import com.yahoo.config.provision.RegionName;
import com.yahoo.config.provision.Zone;
import com.yahoo.config.provisioning.FlavorsConfig;
+import com.yahoo.test.ManualClock;
import com.yahoo.transaction.NestedTransaction;
import com.yahoo.vespa.hosted.provision.Node;
+import com.yahoo.vespa.hosted.provision.NodeRepository;
import com.yahoo.vespa.hosted.provision.node.Agent;
import com.yahoo.vespa.hosted.provision.provisioning.FlavorConfigBuilder;
import com.yahoo.vespa.hosted.provision.provisioning.ProvisioningTester;
import com.yahoo.vespa.hosted.provision.testutils.MockDeployer;
-import org.junit.Before;
import org.junit.Test;
import java.time.Duration;
@@ -27,6 +28,8 @@ import java.util.Map;
import java.util.Optional;
import static com.yahoo.vespa.hosted.provision.maintenance.Rebalancer.waitTimeAfterPreviousDeployment;
+import static com.yahoo.vespa.hosted.provision.maintenance.RebalancerTest.RebalancerTester.cpuApp;
+import static com.yahoo.vespa.hosted.provision.maintenance.RebalancerTest.RebalancerTester.memoryApp;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
@@ -35,80 +38,46 @@ import static org.junit.Assert.assertTrue;
* @author bratseth
*/
public class RebalancerTest {
- ApplicationId cpuApp;
- ApplicationId memApp;
- NodeResources cpuResources;
- NodeResources memResources;
- TestMetric metric = new TestMetric();
- ProvisioningTester tester;
- MockDeployer deployer;
- private Rebalancer rebalancer;
-
- @Before
- public void setup() {
- // --- Setup
- cpuApp = makeApplicationId("t1", "a1");
- memApp = makeApplicationId("t2", "a2");
- cpuResources = new NodeResources(8, 4, 10, 0.1);
- memResources = new NodeResources(4, 9, 10, 0.1);
-
- tester = new ProvisioningTester.Builder().zone(new Zone(Environment.perf, RegionName.from("us-east"))).flavorsConfig(flavorsConfig()).build();
-
- Map<ApplicationId, MockDeployer.ApplicationContext> apps = Map.of(
- cpuApp, new MockDeployer.ApplicationContext(cpuApp, clusterSpec("c"), Capacity.from(new ClusterResources(1, 1, cpuResources))),
- memApp, new MockDeployer.ApplicationContext(memApp, clusterSpec("c"), Capacity.from(new ClusterResources(1, 1, memResources))));
- deployer = new MockDeployer(tester.provisioner(), tester.clock(), apps);
-
- rebalancer = new Rebalancer(deployer,
- tester.nodeRepository(),
- Optional.empty(),
- metric,
- tester.clock(),
- Duration.ofMinutes(1));
-
- tester.makeReadyNodes(3, "flt", NodeType.host, 8);
- tester.deployZoneApp();
- }
@Test
public void testRebalancing() {
+ RebalancerTester tester = new RebalancerTester();
+
// --- Deploying a cpu heavy application - causing 1 of these nodes to be skewed
- deployApp(cpuApp);
- Node cpuSkewedNode = tester.nodeRepository().getNodes(cpuApp).get(0);
+ tester.deployApp(cpuApp);
+ Node cpuSkewedNode = tester.getNode(cpuApp);
- rebalancer.maintain();
+ tester.maintain();
assertFalse("No better place to move the skewed node, so no action is taken",
- tester.nodeRepository().getNode(cpuSkewedNode.hostname()).get().status().wantToRetire());
- assertEquals(0.00325, metric.values.get("hostedVespa.docker.skew").doubleValue(), 0.00001);
+ tester.getNode(cpuSkewedNode.hostname()).get().status().wantToRetire());
+ assertEquals(0.00325, tester.metric().values.get("hostedVespa.docker.skew").doubleValue(), 0.00001);
// --- Making a more suitable node configuration available causes rebalancing
- Node newCpuHost = tester.makeReadyNodes(1, "cpu", NodeType.host, 8).get(0);
+ Node newCpuHost = tester.makeReadyNode("cpu");
tester.deployZoneApp();
- rebalancer.maintain();
- assertTrue("Rebalancer retired the node we wanted to move away from",
- tester.nodeRepository().getNode(cpuSkewedNode.hostname()).get().allocation().get().membership().retired());
+ tester.maintain();
+ assertTrue("Rebalancer retired the node we wanted to move away from", tester.isNodeRetired(cpuSkewedNode));
assertTrue("... and added a node on the new host instead",
- tester.nodeRepository().getNodes(cpuApp, Node.State.active).stream().anyMatch(node -> node.hasParent(newCpuHost.hostname())));
+ tester.getNodes(cpuApp, Node.State.active).stream().anyMatch(node -> node.hasParent(newCpuHost.hostname())));
assertEquals("Skew is reduced",
- 0.00244, metric.values.get("hostedVespa.docker.skew").doubleValue(), 0.00001);
+ 0.00244, tester.metric().values.get("hostedVespa.docker.skew").doubleValue(), 0.00001);
// --- Deploying a mem heavy application - allocated to the best option and causing increased skew
- deployApp(memApp);
- assertEquals("Assigned to a flat node as that causes least skew", "flt",
- tester.nodeRepository().list().parentOf(tester.nodeRepository().getNodes(memApp).get(0)).get().flavor().name());
- rebalancer.maintain();
+ tester.deployApp(memoryApp);
+ assertEquals("Assigned to a flat node as that causes least skew", "flat",
+ tester.nodeRepository().list().parentOf(tester.getNode(memoryApp)).get().flavor().name());
+ tester.maintain();
assertEquals("Deploying the mem skewed app increased skew",
- 0.00734, metric.values.get("hostedVespa.docker.skew").doubleValue(), 0.00001);
+ 0.00734, tester.metric().values.get("hostedVespa.docker.skew").doubleValue(), 0.00001);
// --- Adding a more suitable node reconfiguration causes no action as the system is not stable
- Node memSkewedNode = tester.nodeRepository().getNodes(memApp).get(0);
- Node newMemHost = tester.makeReadyNodes(1, "mem", NodeType.host, 8).get(0);
+ Node memSkewedNode = tester.getNode(memoryApp);
+ Node newMemHost = tester.makeReadyNode("mem");
tester.deployZoneApp();
- rebalancer.maintain();
- assertFalse("No rebalancing happens because cpuSkewedNode is still retired",
- tester.nodeRepository().getNode(memSkewedNode.hostname()).get().allocation().get().membership().retired());
+ tester.maintain();
+ assertFalse("No rebalancing happens because cpuSkewedNode is still retired", tester.isNodeRetired(memSkewedNode));
// --- Making the system stable enables rebalancing
NestedTransaction tx = new NestedTransaction();
@@ -116,73 +85,127 @@ public class RebalancerTest {
tx.commit();
// ... if activation fails when trying, we clean up the state
- deployer.setFailActivate(true);
- rebalancer.maintain();
- assertTrue("Want to retire is reset", tester.nodeRepository().getNodes(Node.State.active).stream().noneMatch(node -> node.status().wantToRetire()));
- assertEquals("Reserved node was moved to dirty", 1, tester.nodeRepository().getNodes(Node.State.dirty).size());
- String reservedHostname = tester.nodeRepository().getNodes(Node.State.dirty).get(0).hostname();
+ tester.deployer().setFailActivate(true);
+ tester.maintain();
+ assertTrue("Want to retire is reset", tester.getNodes(Node.State.active).stream().noneMatch(node -> node.status().wantToRetire()));
+ assertEquals("Reserved node was moved to dirty", 1, tester.getNodes(Node.State.dirty).size());
+ String reservedHostname = tester.getNodes(Node.State.dirty).get(0).hostname();
tester.nodeRepository().setReady(reservedHostname, Agent.system, "Cleanup");
tester.nodeRepository().removeRecursively(reservedHostname);
// ... otherwise we successfully rebalance, again reducing skew
- deployer.setFailActivate(false);
- rebalancer.maintain();
- assertTrue("Rebalancer retired the node we wanted to move away from",
- tester.nodeRepository().getNode(memSkewedNode.hostname()).get().allocation().get().membership().retired());
+ tester.deployer().setFailActivate(false);
+ tester.maintain();
+ assertTrue("Rebalancer retired the node we wanted to move away from", tester.isNodeRetired(memSkewedNode));
assertTrue("... and added a node on the new host instead",
- tester.nodeRepository().getNodes(memApp, Node.State.active).stream().anyMatch(node -> node.hasParent(newMemHost.hostname())));
+ tester.getNodes(memoryApp, Node.State.active).stream().anyMatch(node -> node.hasParent(newMemHost.hostname())));
assertEquals("Skew is reduced",
- 0.00587, metric.values.get("hostedVespa.docker.skew").doubleValue(), 0.00001);
+ 0.00587, tester.metric().values.get("hostedVespa.docker.skew").doubleValue(), 0.00001);
}
@Test
public void testNoRebalancingIfRecentlyDeployed() {
+ RebalancerTester tester = new RebalancerTester();
+
// --- Deploying a cpu heavy application - causing 1 of these nodes to be skewed
- deployApp(cpuApp);
- Node cpuSkewedNode = tester.nodeRepository().getNodes(cpuApp).get(0);
- rebalancer.maintain();
+ tester.deployApp(cpuApp);
+ Node cpuSkewedNode = tester.getNode(cpuApp);
+ tester.maintain();
assertFalse("No better place to move the skewed node, so no action is taken",
- tester.nodeRepository().getNode(cpuSkewedNode.hostname()).get().status().wantToRetire());
+ tester.getNode(cpuSkewedNode.hostname()).get().status().wantToRetire());
// --- Making a more suitable node configuration available causes rebalancing
- Node newCpuHost = tester.makeReadyNodes(1, "cpu", NodeType.host, 8).get(0);
+ Node newCpuHost = tester.makeReadyNode("cpu");
tester.deployZoneApp();
- deployApp(cpuApp, false /* skip advancing clock after deployment */);
- rebalancer.maintain();
- assertFalse("No action, since app was recently deployed",
- tester.nodeRepository().getNode(cpuSkewedNode.hostname()).get().allocation().get().membership().retired());
+ tester.deployApp(cpuApp, false /* skip advancing clock after deployment */);
+ tester.maintain();
+ assertFalse("No action, since app was recently deployed", tester.isNodeRetired(cpuSkewedNode));
tester.clock().advance(waitTimeAfterPreviousDeployment);
- rebalancer.maintain();
- assertTrue("Rebalancer retired the node we wanted to move away from",
- tester.nodeRepository().getNode(cpuSkewedNode.hostname()).get().allocation().get().membership().retired());
+ tester.maintain();
+ assertTrue("Rebalancer retired the node we wanted to move away from", tester.isNodeRetired(cpuSkewedNode));
assertTrue("... and added a node on the new host instead",
- tester.nodeRepository().getNodes(cpuApp, Node.State.active).stream().anyMatch(node -> node.hasParent(newCpuHost.hostname())));
+ tester.getNodes(cpuApp, Node.State.active).stream().anyMatch(node -> node.hasParent(newCpuHost.hostname())));
}
- private ClusterSpec clusterSpec(String clusterId) {
- return ClusterSpec.request(ClusterSpec.Type.content, ClusterSpec.Id.from(clusterId)).vespaVersion("6.42").build();
- }
- private ApplicationId makeApplicationId(String tenant, String appName) {
- return ApplicationId.from(tenant, appName, "default");
- }
+ static class RebalancerTester {
- private void deployApp(ApplicationId id) { deployApp(id, true); }
+ static ApplicationId cpuApp = makeApplicationId("t1", "a1");
+ static ApplicationId memoryApp = makeApplicationId("t2", "a2");
+ private static NodeResources cpuResources = new NodeResources(8, 4, 10, 0.1);
+ private static NodeResources memResources = new NodeResources(4, 9, 10, 0.1);
+ private TestMetric metric = new TestMetric();
+ private ProvisioningTester tester = new ProvisioningTester.Builder()
+ .zone(new Zone(Environment.perf, RegionName.from("us-east")))
+ .flavorsConfig(flavorsConfig())
+ .build();
+ private final MockDeployer deployer;
+ private final Rebalancer rebalancer;
- private void deployApp(ApplicationId id, boolean advanceClock) {
- deployer.deployFromLocalActive(id).get().activate();
- if (advanceClock) tester.clock().advance(waitTimeAfterPreviousDeployment);
- }
+ RebalancerTester() {
+ Map<ApplicationId, MockDeployer.ApplicationContext> apps = Map.of(
+ cpuApp, new MockDeployer.ApplicationContext(cpuApp, clusterSpec("c"), Capacity.from(new ClusterResources(1, 1, cpuResources))),
+ memoryApp, new MockDeployer.ApplicationContext(memoryApp, clusterSpec("c"), Capacity.from(new ClusterResources(1, 1, memResources))));
+ deployer = new MockDeployer(tester.provisioner(), tester.clock(), apps);
+ rebalancer = new Rebalancer(deployer, tester.nodeRepository(), Optional.empty(), metric, tester.clock(), Duration.ofMinutes(1));
+ tester.makeReadyNodes(3, "flat", NodeType.host, 8);
+ tester.deployZoneApp();
+ }
+
+ void maintain() { rebalancer.maintain(); }
+
+ Node makeReadyNode(String flavor) { return tester.makeReadyNodes(1, flavor, NodeType.host, 8).get(0); }
+
+ TestMetric metric() { return metric; }
+
+ NodeRepository nodeRepository() { return tester.nodeRepository(); }
+
+ void deployZoneApp() { tester.deployZoneApp(); }
+
+ void deployApp(ApplicationId id) { deployApp(id, true); }
+
+ void deployApp(ApplicationId id, boolean advanceClock) {
+ deployer.deployFromLocalActive(id).get().activate();
+ if (advanceClock) tester.clock().advance(waitTimeAfterPreviousDeployment);
+ }
+
+ List<Node> getNodes(ApplicationId applicationId, Node.State nodeState) {
+ return tester.nodeRepository().getNodes(applicationId, nodeState);
+ }
+
+ boolean isNodeRetired(Node node) {
+ return getNode(node.hostname()).get().allocation().get().membership().retired();
+ }
+
+ Optional<Node> getNode(String hostname) { return tester.nodeRepository().getNode(hostname); }
+
+ List<Node> getNodes(Node.State nodeState) { return tester.nodeRepository().getNodes(nodeState); }
+
+ Node getNode(ApplicationId applicationId) { return tester.nodeRepository().getNodes(applicationId).get(0); }
+
+ ManualClock clock() { return tester.clock(); }
+
+ MockDeployer deployer() { return deployer; }
+
+ private FlavorsConfig flavorsConfig() {
+ FlavorConfigBuilder b = new FlavorConfigBuilder();
+ b.addFlavor("flat", 30, 30, 400, 3, Flavor.Type.BARE_METAL);
+ b.addFlavor("cpu", 40, 20, 400, 3, Flavor.Type.BARE_METAL);
+ b.addFlavor("mem", 20, 40, 400, 3, Flavor.Type.BARE_METAL);
+ return b.build();
+ }
+
+ private static ClusterSpec clusterSpec(String clusterId) {
+ return ClusterSpec.request(ClusterSpec.Type.content, ClusterSpec.Id.from(clusterId)).vespaVersion("6.42").build();
+ }
+
+ private static ApplicationId makeApplicationId(String tenant, String appName) {
+ return ApplicationId.from(tenant, appName, "default");
+ }
- private FlavorsConfig flavorsConfig() {
- FlavorConfigBuilder b = new FlavorConfigBuilder();
- b.addFlavor("flt", 30, 30, 400, 3, Flavor.Type.BARE_METAL);
- b.addFlavor("cpu", 40, 20, 400, 3, Flavor.Type.BARE_METAL);
- b.addFlavor("mem", 20, 40, 400, 3, Flavor.Type.BARE_METAL);
- return b.build();
}
}