diff options
author | Håkon Hallingstad <hakon@yahoo-inc.com> | 2017-05-26 17:36:03 +0200 |
---|---|---|
committer | Håkon Hallingstad <hakon@yahoo-inc.com> | 2017-05-26 17:36:03 +0200 |
commit | b4d42768c75fd78b26a0999cb5eeb376d0cf8da8 (patch) | |
tree | c4c8c4e9f42e8b900b358bf26c4f0136090a26c2 /node-repository | |
parent | 085709b42ff698bd6db8f1231fa8d93fadc0e86c (diff) |
Complete retirement early in dev CD
Diffstat (limited to 'node-repository')
5 files changed, 178 insertions, 2 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java index c5aa78fefad..d35a9c158b7 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java @@ -40,6 +40,7 @@ public class NodeRepositoryMaintenance extends AbstractComponent { private final ReservationExpirer reservationExpirer; private final InactiveExpirer inactiveExpirer; private final RetiredExpirer retiredExpirer; + private final RetiredEarlyExpirer retiredEarlyExpirer; private final FailedExpirer failedExpirer; private final DirtyExpirer dirtyExpirer; private final NodeRebooter nodeRebooter; @@ -66,6 +67,7 @@ public class NodeRepositoryMaintenance extends AbstractComponent { zooKeeperAccessMaintainer = new ZooKeeperAccessMaintainer(nodeRepository, curator, durationFromEnv("zookeeper_access_maintenance_interval").orElse(defaults.zooKeeperAccessMaintenanceInterval), jobControl); reservationExpirer = new ReservationExpirer(nodeRepository, clock, durationFromEnv("reservation_expiry").orElse(defaults.reservationExpiry), jobControl); retiredExpirer = new RetiredExpirer(nodeRepository, deployer, clock, durationFromEnv("retired_expiry").orElse(defaults.retiredExpiry), jobControl); + retiredEarlyExpirer = new RetiredEarlyExpirer(nodeRepository, zone, durationFromEnv("retired_early_interval").orElse(defaults.retiredEarlyInterval), jobControl, deployer, orchestrator); inactiveExpirer = new InactiveExpirer(nodeRepository, clock, durationFromEnv("inactive_expiry").orElse(defaults.inactiveExpiry), jobControl); failedExpirer = new FailedExpirer(nodeRepository, zone, clock, durationFromEnv("failed_expiry").orElse(defaults.failedExpiry), jobControl); dirtyExpirer = new DirtyExpirer(nodeRepository, clock, durationFromEnv("dirty_expiry").orElse(defaults.dirtyExpiry), jobControl); @@ -91,6 +93,7 @@ public class NodeRepositoryMaintenance extends AbstractComponent { reservationExpirer.deconstruct(); inactiveExpirer.deconstruct(); retiredExpirer.deconstruct(); + retiredEarlyExpirer.deconstruct(); failedExpirer.deconstruct(); dirtyExpirer.deconstruct(); nodeRebooter.deconstruct(); @@ -135,6 +138,7 @@ public class NodeRepositoryMaintenance extends AbstractComponent { private final Duration rebootInterval; private final Duration nodeRetirerInterval; private final Duration metricsInterval; + private final Duration retiredEarlyInterval; private final NodeFailer.ThrottlePolicy throttlePolicy; @@ -149,6 +153,7 @@ public class NodeRepositoryMaintenance extends AbstractComponent { reservationExpiry = Duration.ofMinutes(20); // same as deployment timeout inactiveExpiry = Duration.ofHours(4); // enough time for the application owner to discover and redeploy retiredExpiry = Duration.ofDays(4); // enough time to migrate data + retiredEarlyInterval = Duration.ofMinutes(29); failedExpiry = Duration.ofDays(4); // enough time to recover data even if it happens friday night dirtyExpiry = Duration.ofHours(2); // enough time to clean the node rebootInterval = Duration.ofDays(30); @@ -165,6 +170,7 @@ public class NodeRepositoryMaintenance extends AbstractComponent { reservationExpiry = Duration.ofMinutes(10); // Need to be long enough for deployment to be finished for all config model versions inactiveExpiry = Duration.ofSeconds(2); // support interactive wipe start over retiredExpiry = Duration.ofMinutes(1); + retiredEarlyInterval = Duration.ofMinutes(5); failedExpiry = Duration.ofMinutes(10); dirtyExpiry = Duration.ofMinutes(30); rebootInterval = Duration.ofDays(30); diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/RetiredEarlyExpirer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/RetiredEarlyExpirer.java new file mode 100644 index 00000000000..f936c1e06ba --- /dev/null +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/RetiredEarlyExpirer.java @@ -0,0 +1,98 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.vespa.hosted.provision.maintenance; + +import com.yahoo.collections.ListMap; +import com.yahoo.config.provision.ApplicationId; +import com.yahoo.config.provision.Deployer; +import com.yahoo.config.provision.Deployment; +import com.yahoo.config.provision.Environment; +import com.yahoo.config.provision.RegionName; +import com.yahoo.config.provision.SystemName; +import com.yahoo.config.provision.Zone; +import com.yahoo.vespa.applicationmodel.HostName; +import com.yahoo.vespa.hosted.provision.Node; +import com.yahoo.vespa.hosted.provision.NodeRepository; +import com.yahoo.vespa.orchestrator.OrchestrationException; +import com.yahoo.vespa.orchestrator.Orchestrator; + +import java.time.Duration; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.logging.Level; +import java.util.stream.Collectors; + +public class RetiredEarlyExpirer extends Maintainer { + private final Deployer deployer; + private final Orchestrator orchestrator; + + public RetiredEarlyExpirer(NodeRepository nodeRepository, + Zone zone, + Duration interval, + JobControl jobControl, + Deployer deployer, + Orchestrator orchestrator) { + super(nodeRepository, interval, jobControl); + this.deployer = deployer; + this.orchestrator = orchestrator; + + List<Zone> applies = Arrays.asList(new Zone(SystemName.cd, Environment.dev, RegionName.from("cd-us-central-1"))); + if (!applies.contains(zone)) { + String targetZones = applies.stream().map(Zone::toString).collect(Collectors.joining(", ")); + log.info(RetiredEarlyExpirer.class.getName() + " only runs in " + targetZones + ", stopping."); + deconstruct(); + } + } + + @Override + protected void maintain() { + List<Node> activeNodes = nodeRepository().getNodes(Node.State.active); + + ListMap<ApplicationId, Node> retiredNodesByApplication = new ListMap<>(); + for (Node node : activeNodes) { + if (node.allocation().isPresent() && node.allocation().get().membership().retired()) { + retiredNodesByApplication.put(node.allocation().get().owner(), node); + } + } + + for (Map.Entry<ApplicationId, List<Node>> entry : retiredNodesByApplication.entrySet()) { + ApplicationId application = entry.getKey(); + List<Node> retiredNodes = entry.getValue(); + + try { + Optional<Deployment> deployment = deployer.deployFromLocalActive(application, Duration.ofMinutes(30)); + if ( ! deployment.isPresent()) continue; // this will be done at another config server + + List<Node> nodesToRemove = new ArrayList<>(); + for (Node node : retiredNodes) { + if (nodeCanBeRemoved(node)) { + nodesToRemove.add(node); + } + } + + nodeRepository().setRemovable(application, nodesToRemove); + + deployment.get().activate(); + + String nodeList = nodesToRemove.stream().map(Node::hostname).collect(Collectors.joining(", ")); + log.info("Redeployed " + application + " to deactivate retired nodes: " + nodeList); + } catch (RuntimeException e) { + String nodeList = retiredNodes.stream().map(Node::hostname).collect(Collectors.joining(", ")); + log.log(Level.WARNING, "Exception trying to deactivate retired nodes from " + application + + ": " + nodeList, e); + } + } + } + + boolean nodeCanBeRemoved(Node node) { + try { + orchestrator.acquirePermissionToRemove(new HostName(node.hostname())); + return true; + } catch (OrchestrationException e) { + log.info("Did not get permission to remove retired node " + node + ": " + e.getMessage()); + return false; + } + } +} diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/OrchestratorMock.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/OrchestratorMock.java index b29b5ee813e..538feeb042d 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/OrchestratorMock.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/OrchestratorMock.java @@ -7,6 +7,7 @@ import com.yahoo.vespa.orchestrator.ApplicationStateChangeDeniedException; import com.yahoo.vespa.orchestrator.BatchHostNameNotFoundException; import com.yahoo.vespa.orchestrator.BatchInternalErrorException; import com.yahoo.vespa.orchestrator.HostNameNotFoundException; +import com.yahoo.vespa.orchestrator.OrchestrationException; import com.yahoo.vespa.orchestrator.Orchestrator; import com.yahoo.vespa.orchestrator.model.NodeGroup; import com.yahoo.vespa.orchestrator.policy.BatchHostStateChangeDeniedException; @@ -62,6 +63,9 @@ public class OrchestratorMock implements Orchestrator { } @Override + public void acquirePermissionToRemove(HostName hostName) throws OrchestrationException {} + + @Override public void suspendAll(HostName parentHostname, List<HostName> hostNames) throws BatchInternalErrorException, BatchHostStateChangeDeniedException, BatchHostNameNotFoundException { throw new UnsupportedOperationException("Not implemented"); } diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/RetiredExpirerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/RetiredExpirerTest.java index f034490b3f7..9bfeccb9a5d 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/RetiredExpirerTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/RetiredExpirerTest.java @@ -9,6 +9,7 @@ import com.yahoo.config.provision.ClusterSpec; import com.yahoo.config.provision.Environment; import com.yahoo.config.provision.HostSpec; import com.yahoo.config.provision.InstanceName; +import com.yahoo.config.provision.NodeFlavors; import com.yahoo.config.provision.NodeType; import com.yahoo.config.provision.RegionName; import com.yahoo.config.provision.TenantName; @@ -20,11 +21,12 @@ import com.yahoo.vespa.curator.mock.MockCurator; import com.yahoo.vespa.curator.transaction.CuratorTransaction; import com.yahoo.vespa.hosted.provision.Node; import com.yahoo.vespa.hosted.provision.NodeRepository; -import com.yahoo.config.provision.NodeFlavors; import com.yahoo.vespa.hosted.provision.provisioning.NodeRepositoryProvisioner; import com.yahoo.vespa.hosted.provision.testutils.FlavorConfigBuilder; import com.yahoo.vespa.hosted.provision.testutils.MockDeployer; import com.yahoo.vespa.hosted.provision.testutils.MockNameResolver; +import com.yahoo.vespa.orchestrator.OrchestrationException; +import com.yahoo.vespa.orchestrator.Orchestrator; import org.junit.Test; import java.time.Duration; @@ -35,6 +37,11 @@ import java.util.Optional; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; +import static org.mockito.Matchers.any; +import static org.mockito.Mockito.doNothing; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.times; +import static org.mockito.Mockito.verify; /** * @author bratseth @@ -58,7 +65,7 @@ public class RetiredExpirerTest { ApplicationId applicationId = ApplicationId.from(TenantName.from("foo"), ApplicationName.from("bar"), InstanceName.from("fuz")); // Allocate content cluster of sizes 7 -> 2 -> 3: - // Should end up with 3 nodes in the cluster (one previously retired), and 3 retired + // Should end up with 3 nodes in the cluster (one previously retired), and 4 retired ClusterSpec cluster = ClusterSpec.request(ClusterSpec.Type.content, ClusterSpec.Id.from("test"), Version.fromString("6.42")); int wantedNodes; activate(applicationId, cluster, wantedNodes=7, 1, provisioner); @@ -117,6 +124,64 @@ public class RetiredExpirerTest { assertFalse(node.allocation().get().membership().retired()); } + @Test + public void ensure_early_inactivation() throws OrchestrationException { + ManualClock clock = new ManualClock(); + Zone zone = new Zone(Environment.prod, RegionName.from("us-east")); + NodeFlavors nodeFlavors = FlavorConfigBuilder.createDummies("default"); + NodeRepository nodeRepository = new NodeRepository(nodeFlavors, curator, clock, zone, + new MockNameResolver().mockAnyLookup()); + NodeRepositoryProvisioner provisioner = new NodeRepositoryProvisioner(nodeRepository, nodeFlavors, zone); + + createReadyNodes(7, nodeRepository, nodeFlavors); + createHostNodes(4, nodeRepository, nodeFlavors); + + ApplicationId applicationId = ApplicationId.from(TenantName.from("foo"), ApplicationName.from("bar"), InstanceName.from("fuz")); + + // Allocate content cluster of sizes 7 -> 2 -> 3: + // Should end up with 3 nodes in the cluster (one previously retired), and 4 retired + ClusterSpec cluster = ClusterSpec.request(ClusterSpec.Type.content, ClusterSpec.Id.from("test"), Version.fromString("6.42")); + int wantedNodes; + activate(applicationId, cluster, wantedNodes=7, 1, provisioner); + activate(applicationId, cluster, wantedNodes=2, 1, provisioner); + activate(applicationId, cluster, wantedNodes=3, 1, provisioner); + assertEquals(7, nodeRepository.getNodes(applicationId, Node.State.active).size()); + assertEquals(0, nodeRepository.getNodes(applicationId, Node.State.inactive).size()); + + // Cause inactivation of retired nodes + clock.advance(Duration.ofHours(30)); // Retire period spent + MockDeployer deployer = + new MockDeployer(provisioner, + Collections.singletonMap( + applicationId, + new MockDeployer.ApplicationContext(applicationId, cluster, Capacity.fromNodeCount(wantedNodes, Optional.of("default")), 1))); + + Orchestrator orchestrator = mock(Orchestrator.class); + // Allow the 1st and 3rd retired nodes permission to inactivate + doNothing() + .doThrow(new OrchestrationException("Permission not granted 1")) + .doNothing() + .doThrow(new OrchestrationException("Permission not granted 2")) + .when(orchestrator).acquirePermissionToRemove(any()); + + new RetiredEarlyExpirer( + nodeRepository, + zone, + Duration.ofDays(30), + new JobControl(nodeRepository.database()), + deployer, + orchestrator).run(); + assertEquals(5, nodeRepository.getNodes(applicationId, Node.State.active).size()); + assertEquals(2, nodeRepository.getNodes(applicationId, Node.State.inactive).size()); + assertEquals(1, deployer.redeployments); + + verify(orchestrator, times(4)).acquirePermissionToRemove(any()); + + // inactivated nodes are not retired + for (Node node : nodeRepository.getNodes(applicationId, Node.State.inactive)) + assertFalse(node.allocation().get().membership().retired()); + } + private void activate(ApplicationId applicationId, ClusterSpec cluster, int nodes, int groups, NodeRepositoryProvisioner provisioner) { List<HostSpec> hosts = provisioner.prepare(applicationId, cluster, Capacity.fromNodeCount(nodes), groups, null); NestedTransaction transaction = new NestedTransaction().add(new CuratorTransaction(curator)); diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/v2/responses/maintenance.json b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/v2/responses/maintenance.json index a7a2fe6b677..fea4fb8d4d2 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/v2/responses/maintenance.json +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/v2/responses/maintenance.json @@ -28,6 +28,9 @@ "name":"OperatorChangeApplicationMaintainer" }, { + "name":"RetiredEarlyExpirer" + }, + { "name":"MetricsReporter" }, { |