diff options
author | Martin Polden <mpolden@mpolden.no> | 2018-08-01 17:10:15 +0200 |
---|---|---|
committer | Martin Polden <mpolden@mpolden.no> | 2018-08-02 08:18:53 +0200 |
commit | 06ad98ef6c60e875733c8345baf767fbff71eb57 (patch) | |
tree | 4e17795603cd670d55eaff33056b68772e586a81 /node-repository/src | |
parent | 7e6b0646c8567dfdd7b89295f9c288dce01f8d6f (diff) |
Queue all apps eligible for redeployment on each run
Diffstat (limited to 'node-repository/src')
4 files changed, 133 insertions, 69 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ApplicationMaintainer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ApplicationMaintainer.java index 3b7c4857f48..e93926e121c 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ApplicationMaintainer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ApplicationMaintainer.java @@ -11,10 +11,12 @@ import com.yahoo.vespa.hosted.provision.Node; import com.yahoo.vespa.hosted.provision.NodeRepository; import java.time.Duration; +import java.time.Instant; import java.util.LinkedHashSet; import java.util.List; import java.util.Optional; import java.util.Set; +import java.util.concurrent.CopyOnWriteArrayList; import java.util.concurrent.Executor; import java.util.concurrent.Executors; import java.util.stream.Collectors; @@ -26,6 +28,7 @@ import java.util.stream.Collectors; public abstract class ApplicationMaintainer extends Maintainer { private final Deployer deployer; + private final List<ApplicationId> pendingDeployments = new CopyOnWriteArrayList<>(); // Use a fixed thread pool to avoid overload on config servers. Resource usage when deploying varies // a lot between applications, so doing one by one avoids issues where one or more resource-demanding @@ -39,12 +42,15 @@ public abstract class ApplicationMaintainer extends Maintainer { @Override protected final void maintain() { - Set<ApplicationId> applications = applicationsNeedingMaintenance(); - for (ApplicationId application : applications) { - deploy(application); - } + applicationsNeedingMaintenance().forEach(this::deploy); + } + + /** Returns the number of deployments that are pending execution */ + public int pendingDeployments() { + return pendingDeployments.size(); } + /** Returns whether given application should be deployed at this moment in time */ protected boolean canDeployNow(ApplicationId application) { return true; } @@ -56,16 +62,21 @@ public abstract class ApplicationMaintainer extends Maintainer { * even when deployments are slow. */ protected void deploy(ApplicationId application) { + if (pendingDeployments.contains(application)) { + return;// Avoid queuing multiple deployments for same application + } + log.log(LogLevel.INFO, application + " will be deployed, last deploy time " + + getLastDeployTime(application)); deploymentExecutor.execute(() -> deployWithLock(application)); + pendingDeployments.add(application); } protected Deployer deployer() { return deployer; } - protected Set<ApplicationId> applicationsNeedingMaintenance() { return nodesNeedingMaintenance().stream() - .map(node -> node.allocation().get().owner()) - .collect(Collectors.toCollection(LinkedHashSet::new)); + .map(node -> node.allocation().get().owner()) + .collect(Collectors.toCollection(LinkedHashSet::new)); } /** @@ -75,7 +86,7 @@ public abstract class ApplicationMaintainer extends Maintainer { protected abstract List<Node> nodesNeedingMaintenance(); /** Redeploy this application. A lock will be taken for the duration of the deployment activation */ - final void deployWithLock(ApplicationId application) { + protected final void deployWithLock(ApplicationId application) { // An application might change its state between the time the set of applications is retrieved and the // time deployment happens. Lock the application and check if it's still active. // @@ -89,9 +100,16 @@ public abstract class ApplicationMaintainer extends Maintainer { deployment.get().activate(); } catch (RuntimeException e) { log.log(LogLevel.WARNING, "Exception on maintenance redeploy", e); + } finally { + pendingDeployments.remove(application); } } + /** Returns the last time application was deployed. Epoch is returned if the application has never been deployed. */ + protected final Instant getLastDeployTime(ApplicationId application) { + return deployer.lastDeployTime(application).orElse(Instant.EPOCH); + } + /** Returns true when application has at least one active node */ private boolean isActive(ApplicationId application) { return ! nodeRepository().getNodes(application, Node.State.active).isEmpty(); diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/PeriodicApplicationMaintainer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/PeriodicApplicationMaintainer.java index ee5d6a04ddc..8b2d0a55cd8 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/PeriodicApplicationMaintainer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/PeriodicApplicationMaintainer.java @@ -3,17 +3,18 @@ package com.yahoo.vespa.hosted.provision.maintenance; import com.yahoo.config.provision.ApplicationId; import com.yahoo.config.provision.Deployer; -import com.yahoo.log.LogLevel; import com.yahoo.vespa.hosted.provision.Node; import com.yahoo.vespa.hosted.provision.NodeRepository; +import java.time.Clock; import java.time.Duration; import java.time.Instant; import java.util.Collections; import java.util.Comparator; +import java.util.LinkedHashSet; import java.util.List; -import java.util.Optional; import java.util.Set; +import java.util.stream.Collectors; /** * The application maintainer regularly redeploys all applications to make sure the node repo and application @@ -23,14 +24,17 @@ import java.util.Set; * @author bratseth */ public class PeriodicApplicationMaintainer extends ApplicationMaintainer { + private final Duration minTimeBetweenRedeployments; + private final Clock clock; private final Instant start; public PeriodicApplicationMaintainer(Deployer deployer, NodeRepository nodeRepository, Duration interval, Duration minTimeBetweenRedeployments, JobControl jobControl) { super(deployer, nodeRepository, interval, jobControl); this.minTimeBetweenRedeployments = minTimeBetweenRedeployments; - this.start = Instant.now(); + this.clock = nodeRepository.clock(); + this.start = clock.instant(); } @Override @@ -39,24 +43,17 @@ public class PeriodicApplicationMaintainer extends ApplicationMaintainer { return getLastDeployTime(application).isBefore(nodeRepository().clock().instant().minus(minTimeBetweenRedeployments)); } - // Returns the app that was deployed the longest time ago + // Returns the applications that need to be redeployed by this config server at this point in time. @Override protected Set<ApplicationId> applicationsNeedingMaintenance() { if (waitInitially()) return Collections.emptySet(); - Optional<ApplicationId> app = (nodesNeedingMaintenance().stream() - .map(node -> node.allocation().get().owner()) - .distinct() - .filter(this::shouldBeDeployedOnThisServer) - .min(Comparator.comparing(this::getLastDeployTime))) - .filter(this::canDeployNow); - app.ifPresent(applicationId -> log.log(LogLevel.INFO, applicationId + " will be deployed, last deploy time " + - getLastDeployTime(applicationId))); - return app.map(Collections::singleton).orElseGet(Collections::emptySet); - } - - private Instant getLastDeployTime(ApplicationId application) { - return deployer().lastDeployTime(application).orElse(Instant.EPOCH); + return nodesNeedingMaintenance().stream() + .map(node -> node.allocation().get().owner()) + .filter(this::shouldBeDeployedOnThisServer) + .filter(this::canDeployNow) + .sorted(Comparator.comparing(this::getLastDeployTime)) + .collect(Collectors.toCollection(LinkedHashSet::new)); } // We only know last deploy time for applications that were deployed on this config server, @@ -66,8 +63,8 @@ public class PeriodicApplicationMaintainer extends ApplicationMaintainer { } // TODO: Do not start deploying until some time has gone (ideally only until bootstrap of config server is finished) - protected boolean waitInitially() { - return Instant.now().isBefore(start.plus(minTimeBetweenRedeployments)); + private boolean waitInitially() { + return clock.instant().isBefore(start.plus(minTimeBetweenRedeployments)); } @Override diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockDeployer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockDeployer.java index 99beab50e16..4111a3aee1c 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockDeployer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockDeployer.java @@ -20,6 +20,7 @@ import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Optional; +import java.util.concurrent.locks.ReentrantLock; import java.util.stream.Collectors; /** @@ -35,6 +36,7 @@ public class MockDeployer implements Deployer { public int redeployments = 0; private final Clock clock; + private final ReentrantLock lock = new ReentrantLock(); @Inject @SuppressWarnings("unused") @@ -54,6 +56,10 @@ public class MockDeployer implements Deployer { this.applications = applications; } + public ReentrantLock lock() { + return lock; + } + @Override public Optional<Deployment> deployFromLocalActive(ApplicationId id, boolean bootstrap) { return deployFromLocalActive(id, Duration.ofSeconds(60)); @@ -61,8 +67,13 @@ public class MockDeployer implements Deployer { @Override public Optional<Deployment> deployFromLocalActive(ApplicationId id, Duration timeout) { - lastDeployTimes.put(id, clock.instant()); - return Optional.of(new MockDeployment(provisioner, applications.get(id))); + try { + lock.lock(); + lastDeployTimes.put(id, clock.instant()); + return Optional.of(new MockDeployment(provisioner, applications.get(id))); + } finally { + lock.unlock(); + } } @Override diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/PeriodicApplicationMaintainerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/PeriodicApplicationMaintainerTest.java index 2ddb2e0d004..2994657f6e3 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/PeriodicApplicationMaintainerTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/PeriodicApplicationMaintainerTest.java @@ -41,6 +41,7 @@ import java.util.Map; import java.util.Optional; import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; /** * @author bratseth @@ -51,26 +52,31 @@ public class PeriodicApplicationMaintainerTest { private NodeRepository nodeRepository; private Fixture fixture; + private ManualClock clock; @Before public void before() { Curator curator = new MockCurator(); Zone zone = new Zone(Environment.prod, RegionName.from("us-east")); - this.nodeRepository = new NodeRepository(nodeFlavors, curator, new ManualClock(), zone, + this.clock = new ManualClock(); + this.nodeRepository = new NodeRepository(nodeFlavors, curator, clock, zone, new MockNameResolver().mockAnyLookup(), new DockerImage("docker-registry.domain.tld:8080/dist/vespa"), true); this.fixture = new Fixture(zone, nodeRepository, nodeFlavors, curator); - } - @Test - public void test_application_maintenance() { createReadyNodes(15, nodeRepository, nodeFlavors); createHostNodes(2, nodeRepository, nodeFlavors); + } + @Test + public void test_application_maintenance() { // Create applications fixture.activate(); + // Exhaust initial wait period + clock.advance(Duration.ofMinutes(30).plus(Duration.ofSeconds(1))); + // Fail and park some nodes nodeRepository.fail(nodeRepository.getNodes(fixture.app1).get(3).hostname(), Agent.system, "Failing to unit test"); nodeRepository.fail(nodeRepository.getNodes(fixture.app2).get(0).hostname(), Agent.system, "Failing to unit test"); @@ -102,7 +108,7 @@ public class PeriodicApplicationMaintainerTest { 0, fixture.getNodes(Node.State.active).retired().size()); // Cause maintenance deployment which will update the applications with the re-activated nodes - ((ManualClock)nodeRepository.clock()).advance(Duration.ofMinutes(35)); // Otherwise redeploys are inhibited + clock.advance(Duration.ofMinutes(35)); // Otherwise redeploys are inhibited fixture.runApplicationMaintainer(); assertEquals("Superflous content nodes are retired", reactivatedInApp2, fixture.getNodes(Node.State.active).retired().size()); @@ -112,9 +118,6 @@ public class PeriodicApplicationMaintainerTest { @Test public void deleted_application_is_not_reactivated() { - createReadyNodes(15, nodeRepository, nodeFlavors); - createHostNodes(2, nodeRepository, nodeFlavors); - // Create applications fixture.activate(); @@ -126,36 +129,75 @@ public class PeriodicApplicationMaintainerTest { assertEquals(fixture.wantedNodesApp2, nodeRepository.getNodes(fixture.app2, Node.State.inactive).size()); // Nodes belonging to app2 are inactive after maintenance - fixture.runApplicationMaintainer(Optional.of(frozenActiveNodes)); + fixture.maintainer.setOverriddenNodesNeedingMaintenance(frozenActiveNodes); + fixture.runApplicationMaintainer(); assertEquals("Inactive nodes were incorrectly activated after maintenance", fixture.wantedNodesApp2, nodeRepository.getNodes(fixture.app2, Node.State.inactive).size()); } @Test public void application_deploy_inhibits_redeploy_for_a_while() { - ManualClock clock = (ManualClock)nodeRepository.clock(); - createReadyNodes(15, nodeRepository, nodeFlavors); - createHostNodes(2, nodeRepository, nodeFlavors); - - // Create applications fixture.activate(); + + // Holds off on deployments a while after starting + fixture.runApplicationMaintainer(); + assertFalse("No deployment expected", fixture.deployer.lastDeployTime(fixture.app1).isPresent()); + assertFalse("No deployment expected", fixture.deployer.lastDeployTime(fixture.app2).isPresent()); + // Exhaust initial wait period + clock.advance(Duration.ofMinutes(30).plus(Duration.ofSeconds(1))); + + // First deployment of applications fixture.runApplicationMaintainer(); Instant firstDeployTime = clock.instant(); assertEquals(firstDeployTime, fixture.deployer.lastDeployTime(fixture.app1).get()); assertEquals(firstDeployTime, fixture.deployer.lastDeployTime(fixture.app2).get()); - ((ManualClock) nodeRepository.clock()).advance(Duration.ofMinutes(5)); + clock.advance(Duration.ofMinutes(5)); fixture.runApplicationMaintainer(); // Too soon: Not redeployed: assertEquals(firstDeployTime, fixture.deployer.lastDeployTime(fixture.app1).get()); assertEquals(firstDeployTime, fixture.deployer.lastDeployTime(fixture.app2).get()); - ((ManualClock) nodeRepository.clock()).advance(Duration.ofMinutes(30)); + clock.advance(Duration.ofMinutes(30)); fixture.runApplicationMaintainer(); // Redeployed: assertEquals(clock.instant(), fixture.deployer.lastDeployTime(fixture.app1).get()); assertEquals(clock.instant(), fixture.deployer.lastDeployTime(fixture.app2).get()); } + @Test + public void queues_all_eligible_applications_for_deployment() { + fixture.activate(); + + // Exhaust initial wait period + clock.advance(Duration.ofMinutes(30).plus(Duration.ofSeconds(1))); + + // Lock deployer to simulate slow deployments + fixture.deployer.lock().lock(); + + // Queues all eligible applications + assertEquals(2, fixture.maintainer.applicationsNeedingMaintenance().size()); + fixture.runApplicationMaintainer(false); + assertEquals(2, fixture.maintainer.pendingDeployments()); + + // Enough time passes to make applications eligible for another periodic deployment + clock.advance(Duration.ofMinutes(30).plus(Duration.ofSeconds(1))); + fixture.runApplicationMaintainer(false); + + // Deployments are not re-queued as previous deployments are still pending + assertEquals(2, fixture.maintainer.pendingDeployments()); + + // Slow deployments complete + fixture.deployer.lock().unlock(); + fixture.runApplicationMaintainer(); + Instant deployTime = clock.instant(); + assertEquals(deployTime, fixture.deployer.lastDeployTime(fixture.app1).get()); + assertEquals(deployTime, fixture.deployer.lastDeployTime(fixture.app2).get()); + + // Too soon: Already deployed recently + clock.advance(Duration.ofMinutes(5)); + assertEquals(0, fixture.maintainer.applicationsNeedingMaintenance().size()); + } + private void createReadyNodes(int count, NodeRepository nodeRepository, NodeFlavors nodeFlavors) { List<Node> nodes = new ArrayList<>(count); for (int i = 0; i < count; i++) @@ -179,7 +221,7 @@ public class PeriodicApplicationMaintainerTest { final NodeRepository nodeRepository; final NodeRepositoryProvisioner provisioner; final Curator curator; - final Deployer deployer; + final MockDeployer deployer; final ApplicationId app1 = ApplicationId.from(TenantName.from("foo1"), ApplicationName.from("bar"), InstanceName.from("fuz")); final ApplicationId app2 = ApplicationId.from(TenantName.from("foo2"), ApplicationName.from("bar"), InstanceName.from("fuz")); @@ -188,6 +230,8 @@ public class PeriodicApplicationMaintainerTest { final int wantedNodesApp1 = 5; final int wantedNodesApp2 = 7; + private final TestablePeriodicApplicationMaintainer maintainer; + Fixture(Zone zone, NodeRepository nodeRepository, NodeFlavors flavors, Curator curator) { this.nodeRepository = nodeRepository; this.curator = curator; @@ -199,6 +243,8 @@ public class PeriodicApplicationMaintainerTest { apps.put(app2, new MockDeployer.ApplicationContext(app2, clusterApp2, Capacity.fromNodeCount(wantedNodesApp2, Optional.of("default"), false, true), 1)); this.deployer = new MockDeployer(provisioner, nodeRepository.clock(), apps); + this.maintainer = new TestablePeriodicApplicationMaintainer(deployer, nodeRepository, Duration.ofMinutes(1), + Duration.ofMinutes(30)); } void activate() { @@ -222,16 +268,12 @@ public class PeriodicApplicationMaintainerTest { } void runApplicationMaintainer() { - runApplicationMaintainer(Optional.empty()); + runApplicationMaintainer(true); } - void runApplicationMaintainer(Optional<List<Node>> overriddenNodesNeedingMaintenance) { - TestablePeriodicApplicationMaintainer maintainer = - new TestablePeriodicApplicationMaintainer(deployer, nodeRepository, Duration.ofMinutes(1), - Duration.ofMinutes(30), overriddenNodesNeedingMaintenance); - // Need to run twice, as only one app is deployed per run - maintainer.run(); + void runApplicationMaintainer(boolean waitForDeployments) { maintainer.run(); + while (waitForDeployments && fixture.maintainer.pendingDeployments() != 0); } NodeList getNodes(Node.State ... states) { @@ -240,36 +282,32 @@ public class PeriodicApplicationMaintainerTest { } - public static class TestablePeriodicApplicationMaintainer extends PeriodicApplicationMaintainer { + private static class TestablePeriodicApplicationMaintainer extends PeriodicApplicationMaintainer { - private Optional<List<Node>> overriddenNodesNeedingMaintenance; - - TestablePeriodicApplicationMaintainer(Deployer deployer, NodeRepository nodeRepository, Duration interval, - Duration minTimeBetweenRedeployments, Optional<List<Node>> overriddenNodesNeedingMaintenance) { - super(deployer, nodeRepository, interval, minTimeBetweenRedeployments, new JobControl(nodeRepository.database())); + private List<Node> overriddenNodesNeedingMaintenance; + + TestablePeriodicApplicationMaintainer setOverriddenNodesNeedingMaintenance(List<Node> overriddenNodesNeedingMaintenance) { this.overriddenNodesNeedingMaintenance = overriddenNodesNeedingMaintenance; + return this; } - @Override - protected void deploy(ApplicationId application) { - deployWithLock(application); + TestablePeriodicApplicationMaintainer(Deployer deployer, NodeRepository nodeRepository, Duration interval, + Duration minTimeBetweenRedeployments) { + super(deployer, nodeRepository, interval, minTimeBetweenRedeployments, new JobControl(nodeRepository.database())); } @Override protected List<Node> nodesNeedingMaintenance() { - if (overriddenNodesNeedingMaintenance.isPresent()) - return overriddenNodesNeedingMaintenance.get(); - return super.nodesNeedingMaintenance(); + return overriddenNodesNeedingMaintenance != null + ? overriddenNodesNeedingMaintenance + : super.nodesNeedingMaintenance(); } + @Override protected boolean shouldBeDeployedOnThisServer(ApplicationId application) { return true; } - protected boolean waitInitially() { - return false; - } - } } |