aboutsummaryrefslogtreecommitdiffstats
path: root/node-repository/src
diff options
context:
space:
mode:
authorMartin Polden <mpolden@mpolden.no>2018-08-01 17:10:15 +0200
committerMartin Polden <mpolden@mpolden.no>2018-08-02 08:18:53 +0200
commit06ad98ef6c60e875733c8345baf767fbff71eb57 (patch)
tree4e17795603cd670d55eaff33056b68772e586a81 /node-repository/src
parent7e6b0646c8567dfdd7b89295f9c288dce01f8d6f (diff)
Queue all apps eligible for redeployment on each run
Diffstat (limited to 'node-repository/src')
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ApplicationMaintainer.java34
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/PeriodicApplicationMaintainer.java35
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockDeployer.java15
-rw-r--r--node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/PeriodicApplicationMaintainerTest.java118
4 files changed, 133 insertions, 69 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ApplicationMaintainer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ApplicationMaintainer.java
index 3b7c4857f48..e93926e121c 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ApplicationMaintainer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ApplicationMaintainer.java
@@ -11,10 +11,12 @@ import com.yahoo.vespa.hosted.provision.Node;
import com.yahoo.vespa.hosted.provision.NodeRepository;
import java.time.Duration;
+import java.time.Instant;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Optional;
import java.util.Set;
+import java.util.concurrent.CopyOnWriteArrayList;
import java.util.concurrent.Executor;
import java.util.concurrent.Executors;
import java.util.stream.Collectors;
@@ -26,6 +28,7 @@ import java.util.stream.Collectors;
public abstract class ApplicationMaintainer extends Maintainer {
private final Deployer deployer;
+ private final List<ApplicationId> pendingDeployments = new CopyOnWriteArrayList<>();
// Use a fixed thread pool to avoid overload on config servers. Resource usage when deploying varies
// a lot between applications, so doing one by one avoids issues where one or more resource-demanding
@@ -39,12 +42,15 @@ public abstract class ApplicationMaintainer extends Maintainer {
@Override
protected final void maintain() {
- Set<ApplicationId> applications = applicationsNeedingMaintenance();
- for (ApplicationId application : applications) {
- deploy(application);
- }
+ applicationsNeedingMaintenance().forEach(this::deploy);
+ }
+
+ /** Returns the number of deployments that are pending execution */
+ public int pendingDeployments() {
+ return pendingDeployments.size();
}
+ /** Returns whether given application should be deployed at this moment in time */
protected boolean canDeployNow(ApplicationId application) {
return true;
}
@@ -56,16 +62,21 @@ public abstract class ApplicationMaintainer extends Maintainer {
* even when deployments are slow.
*/
protected void deploy(ApplicationId application) {
+ if (pendingDeployments.contains(application)) {
+ return;// Avoid queuing multiple deployments for same application
+ }
+ log.log(LogLevel.INFO, application + " will be deployed, last deploy time " +
+ getLastDeployTime(application));
deploymentExecutor.execute(() -> deployWithLock(application));
+ pendingDeployments.add(application);
}
protected Deployer deployer() { return deployer; }
-
protected Set<ApplicationId> applicationsNeedingMaintenance() {
return nodesNeedingMaintenance().stream()
- .map(node -> node.allocation().get().owner())
- .collect(Collectors.toCollection(LinkedHashSet::new));
+ .map(node -> node.allocation().get().owner())
+ .collect(Collectors.toCollection(LinkedHashSet::new));
}
/**
@@ -75,7 +86,7 @@ public abstract class ApplicationMaintainer extends Maintainer {
protected abstract List<Node> nodesNeedingMaintenance();
/** Redeploy this application. A lock will be taken for the duration of the deployment activation */
- final void deployWithLock(ApplicationId application) {
+ protected final void deployWithLock(ApplicationId application) {
// An application might change its state between the time the set of applications is retrieved and the
// time deployment happens. Lock the application and check if it's still active.
//
@@ -89,9 +100,16 @@ public abstract class ApplicationMaintainer extends Maintainer {
deployment.get().activate();
} catch (RuntimeException e) {
log.log(LogLevel.WARNING, "Exception on maintenance redeploy", e);
+ } finally {
+ pendingDeployments.remove(application);
}
}
+ /** Returns the last time application was deployed. Epoch is returned if the application has never been deployed. */
+ protected final Instant getLastDeployTime(ApplicationId application) {
+ return deployer.lastDeployTime(application).orElse(Instant.EPOCH);
+ }
+
/** Returns true when application has at least one active node */
private boolean isActive(ApplicationId application) {
return ! nodeRepository().getNodes(application, Node.State.active).isEmpty();
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/PeriodicApplicationMaintainer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/PeriodicApplicationMaintainer.java
index ee5d6a04ddc..8b2d0a55cd8 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/PeriodicApplicationMaintainer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/PeriodicApplicationMaintainer.java
@@ -3,17 +3,18 @@ package com.yahoo.vespa.hosted.provision.maintenance;
import com.yahoo.config.provision.ApplicationId;
import com.yahoo.config.provision.Deployer;
-import com.yahoo.log.LogLevel;
import com.yahoo.vespa.hosted.provision.Node;
import com.yahoo.vespa.hosted.provision.NodeRepository;
+import java.time.Clock;
import java.time.Duration;
import java.time.Instant;
import java.util.Collections;
import java.util.Comparator;
+import java.util.LinkedHashSet;
import java.util.List;
-import java.util.Optional;
import java.util.Set;
+import java.util.stream.Collectors;
/**
* The application maintainer regularly redeploys all applications to make sure the node repo and application
@@ -23,14 +24,17 @@ import java.util.Set;
* @author bratseth
*/
public class PeriodicApplicationMaintainer extends ApplicationMaintainer {
+
private final Duration minTimeBetweenRedeployments;
+ private final Clock clock;
private final Instant start;
public PeriodicApplicationMaintainer(Deployer deployer, NodeRepository nodeRepository,
Duration interval, Duration minTimeBetweenRedeployments, JobControl jobControl) {
super(deployer, nodeRepository, interval, jobControl);
this.minTimeBetweenRedeployments = minTimeBetweenRedeployments;
- this.start = Instant.now();
+ this.clock = nodeRepository.clock();
+ this.start = clock.instant();
}
@Override
@@ -39,24 +43,17 @@ public class PeriodicApplicationMaintainer extends ApplicationMaintainer {
return getLastDeployTime(application).isBefore(nodeRepository().clock().instant().minus(minTimeBetweenRedeployments));
}
- // Returns the app that was deployed the longest time ago
+ // Returns the applications that need to be redeployed by this config server at this point in time.
@Override
protected Set<ApplicationId> applicationsNeedingMaintenance() {
if (waitInitially()) return Collections.emptySet();
- Optional<ApplicationId> app = (nodesNeedingMaintenance().stream()
- .map(node -> node.allocation().get().owner())
- .distinct()
- .filter(this::shouldBeDeployedOnThisServer)
- .min(Comparator.comparing(this::getLastDeployTime)))
- .filter(this::canDeployNow);
- app.ifPresent(applicationId -> log.log(LogLevel.INFO, applicationId + " will be deployed, last deploy time " +
- getLastDeployTime(applicationId)));
- return app.map(Collections::singleton).orElseGet(Collections::emptySet);
- }
-
- private Instant getLastDeployTime(ApplicationId application) {
- return deployer().lastDeployTime(application).orElse(Instant.EPOCH);
+ return nodesNeedingMaintenance().stream()
+ .map(node -> node.allocation().get().owner())
+ .filter(this::shouldBeDeployedOnThisServer)
+ .filter(this::canDeployNow)
+ .sorted(Comparator.comparing(this::getLastDeployTime))
+ .collect(Collectors.toCollection(LinkedHashSet::new));
}
// We only know last deploy time for applications that were deployed on this config server,
@@ -66,8 +63,8 @@ public class PeriodicApplicationMaintainer extends ApplicationMaintainer {
}
// TODO: Do not start deploying until some time has gone (ideally only until bootstrap of config server is finished)
- protected boolean waitInitially() {
- return Instant.now().isBefore(start.plus(minTimeBetweenRedeployments));
+ private boolean waitInitially() {
+ return clock.instant().isBefore(start.plus(minTimeBetweenRedeployments));
}
@Override
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockDeployer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockDeployer.java
index 99beab50e16..4111a3aee1c 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockDeployer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockDeployer.java
@@ -20,6 +20,7 @@ import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;
+import java.util.concurrent.locks.ReentrantLock;
import java.util.stream.Collectors;
/**
@@ -35,6 +36,7 @@ public class MockDeployer implements Deployer {
public int redeployments = 0;
private final Clock clock;
+ private final ReentrantLock lock = new ReentrantLock();
@Inject
@SuppressWarnings("unused")
@@ -54,6 +56,10 @@ public class MockDeployer implements Deployer {
this.applications = applications;
}
+ public ReentrantLock lock() {
+ return lock;
+ }
+
@Override
public Optional<Deployment> deployFromLocalActive(ApplicationId id, boolean bootstrap) {
return deployFromLocalActive(id, Duration.ofSeconds(60));
@@ -61,8 +67,13 @@ public class MockDeployer implements Deployer {
@Override
public Optional<Deployment> deployFromLocalActive(ApplicationId id, Duration timeout) {
- lastDeployTimes.put(id, clock.instant());
- return Optional.of(new MockDeployment(provisioner, applications.get(id)));
+ try {
+ lock.lock();
+ lastDeployTimes.put(id, clock.instant());
+ return Optional.of(new MockDeployment(provisioner, applications.get(id)));
+ } finally {
+ lock.unlock();
+ }
}
@Override
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/PeriodicApplicationMaintainerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/PeriodicApplicationMaintainerTest.java
index 2ddb2e0d004..2994657f6e3 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/PeriodicApplicationMaintainerTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/PeriodicApplicationMaintainerTest.java
@@ -41,6 +41,7 @@ import java.util.Map;
import java.util.Optional;
import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
/**
* @author bratseth
@@ -51,26 +52,31 @@ public class PeriodicApplicationMaintainerTest {
private NodeRepository nodeRepository;
private Fixture fixture;
+ private ManualClock clock;
@Before
public void before() {
Curator curator = new MockCurator();
Zone zone = new Zone(Environment.prod, RegionName.from("us-east"));
- this.nodeRepository = new NodeRepository(nodeFlavors, curator, new ManualClock(), zone,
+ this.clock = new ManualClock();
+ this.nodeRepository = new NodeRepository(nodeFlavors, curator, clock, zone,
new MockNameResolver().mockAnyLookup(),
new DockerImage("docker-registry.domain.tld:8080/dist/vespa"),
true);
this.fixture = new Fixture(zone, nodeRepository, nodeFlavors, curator);
- }
- @Test
- public void test_application_maintenance() {
createReadyNodes(15, nodeRepository, nodeFlavors);
createHostNodes(2, nodeRepository, nodeFlavors);
+ }
+ @Test
+ public void test_application_maintenance() {
// Create applications
fixture.activate();
+ // Exhaust initial wait period
+ clock.advance(Duration.ofMinutes(30).plus(Duration.ofSeconds(1)));
+
// Fail and park some nodes
nodeRepository.fail(nodeRepository.getNodes(fixture.app1).get(3).hostname(), Agent.system, "Failing to unit test");
nodeRepository.fail(nodeRepository.getNodes(fixture.app2).get(0).hostname(), Agent.system, "Failing to unit test");
@@ -102,7 +108,7 @@ public class PeriodicApplicationMaintainerTest {
0, fixture.getNodes(Node.State.active).retired().size());
// Cause maintenance deployment which will update the applications with the re-activated nodes
- ((ManualClock)nodeRepository.clock()).advance(Duration.ofMinutes(35)); // Otherwise redeploys are inhibited
+ clock.advance(Duration.ofMinutes(35)); // Otherwise redeploys are inhibited
fixture.runApplicationMaintainer();
assertEquals("Superflous content nodes are retired",
reactivatedInApp2, fixture.getNodes(Node.State.active).retired().size());
@@ -112,9 +118,6 @@ public class PeriodicApplicationMaintainerTest {
@Test
public void deleted_application_is_not_reactivated() {
- createReadyNodes(15, nodeRepository, nodeFlavors);
- createHostNodes(2, nodeRepository, nodeFlavors);
-
// Create applications
fixture.activate();
@@ -126,36 +129,75 @@ public class PeriodicApplicationMaintainerTest {
assertEquals(fixture.wantedNodesApp2, nodeRepository.getNodes(fixture.app2, Node.State.inactive).size());
// Nodes belonging to app2 are inactive after maintenance
- fixture.runApplicationMaintainer(Optional.of(frozenActiveNodes));
+ fixture.maintainer.setOverriddenNodesNeedingMaintenance(frozenActiveNodes);
+ fixture.runApplicationMaintainer();
assertEquals("Inactive nodes were incorrectly activated after maintenance", fixture.wantedNodesApp2,
nodeRepository.getNodes(fixture.app2, Node.State.inactive).size());
}
@Test
public void application_deploy_inhibits_redeploy_for_a_while() {
- ManualClock clock = (ManualClock)nodeRepository.clock();
- createReadyNodes(15, nodeRepository, nodeFlavors);
- createHostNodes(2, nodeRepository, nodeFlavors);
-
- // Create applications
fixture.activate();
+
+ // Holds off on deployments a while after starting
+ fixture.runApplicationMaintainer();
+ assertFalse("No deployment expected", fixture.deployer.lastDeployTime(fixture.app1).isPresent());
+ assertFalse("No deployment expected", fixture.deployer.lastDeployTime(fixture.app2).isPresent());
+ // Exhaust initial wait period
+ clock.advance(Duration.ofMinutes(30).plus(Duration.ofSeconds(1)));
+
+ // First deployment of applications
fixture.runApplicationMaintainer();
Instant firstDeployTime = clock.instant();
assertEquals(firstDeployTime, fixture.deployer.lastDeployTime(fixture.app1).get());
assertEquals(firstDeployTime, fixture.deployer.lastDeployTime(fixture.app2).get());
- ((ManualClock) nodeRepository.clock()).advance(Duration.ofMinutes(5));
+ clock.advance(Duration.ofMinutes(5));
fixture.runApplicationMaintainer();
// Too soon: Not redeployed:
assertEquals(firstDeployTime, fixture.deployer.lastDeployTime(fixture.app1).get());
assertEquals(firstDeployTime, fixture.deployer.lastDeployTime(fixture.app2).get());
- ((ManualClock) nodeRepository.clock()).advance(Duration.ofMinutes(30));
+ clock.advance(Duration.ofMinutes(30));
fixture.runApplicationMaintainer();
// Redeployed:
assertEquals(clock.instant(), fixture.deployer.lastDeployTime(fixture.app1).get());
assertEquals(clock.instant(), fixture.deployer.lastDeployTime(fixture.app2).get());
}
+ @Test
+ public void queues_all_eligible_applications_for_deployment() {
+ fixture.activate();
+
+ // Exhaust initial wait period
+ clock.advance(Duration.ofMinutes(30).plus(Duration.ofSeconds(1)));
+
+ // Lock deployer to simulate slow deployments
+ fixture.deployer.lock().lock();
+
+ // Queues all eligible applications
+ assertEquals(2, fixture.maintainer.applicationsNeedingMaintenance().size());
+ fixture.runApplicationMaintainer(false);
+ assertEquals(2, fixture.maintainer.pendingDeployments());
+
+ // Enough time passes to make applications eligible for another periodic deployment
+ clock.advance(Duration.ofMinutes(30).plus(Duration.ofSeconds(1)));
+ fixture.runApplicationMaintainer(false);
+
+ // Deployments are not re-queued as previous deployments are still pending
+ assertEquals(2, fixture.maintainer.pendingDeployments());
+
+ // Slow deployments complete
+ fixture.deployer.lock().unlock();
+ fixture.runApplicationMaintainer();
+ Instant deployTime = clock.instant();
+ assertEquals(deployTime, fixture.deployer.lastDeployTime(fixture.app1).get());
+ assertEquals(deployTime, fixture.deployer.lastDeployTime(fixture.app2).get());
+
+ // Too soon: Already deployed recently
+ clock.advance(Duration.ofMinutes(5));
+ assertEquals(0, fixture.maintainer.applicationsNeedingMaintenance().size());
+ }
+
private void createReadyNodes(int count, NodeRepository nodeRepository, NodeFlavors nodeFlavors) {
List<Node> nodes = new ArrayList<>(count);
for (int i = 0; i < count; i++)
@@ -179,7 +221,7 @@ public class PeriodicApplicationMaintainerTest {
final NodeRepository nodeRepository;
final NodeRepositoryProvisioner provisioner;
final Curator curator;
- final Deployer deployer;
+ final MockDeployer deployer;
final ApplicationId app1 = ApplicationId.from(TenantName.from("foo1"), ApplicationName.from("bar"), InstanceName.from("fuz"));
final ApplicationId app2 = ApplicationId.from(TenantName.from("foo2"), ApplicationName.from("bar"), InstanceName.from("fuz"));
@@ -188,6 +230,8 @@ public class PeriodicApplicationMaintainerTest {
final int wantedNodesApp1 = 5;
final int wantedNodesApp2 = 7;
+ private final TestablePeriodicApplicationMaintainer maintainer;
+
Fixture(Zone zone, NodeRepository nodeRepository, NodeFlavors flavors, Curator curator) {
this.nodeRepository = nodeRepository;
this.curator = curator;
@@ -199,6 +243,8 @@ public class PeriodicApplicationMaintainerTest {
apps.put(app2, new MockDeployer.ApplicationContext(app2, clusterApp2,
Capacity.fromNodeCount(wantedNodesApp2, Optional.of("default"), false, true), 1));
this.deployer = new MockDeployer(provisioner, nodeRepository.clock(), apps);
+ this.maintainer = new TestablePeriodicApplicationMaintainer(deployer, nodeRepository, Duration.ofMinutes(1),
+ Duration.ofMinutes(30));
}
void activate() {
@@ -222,16 +268,12 @@ public class PeriodicApplicationMaintainerTest {
}
void runApplicationMaintainer() {
- runApplicationMaintainer(Optional.empty());
+ runApplicationMaintainer(true);
}
- void runApplicationMaintainer(Optional<List<Node>> overriddenNodesNeedingMaintenance) {
- TestablePeriodicApplicationMaintainer maintainer =
- new TestablePeriodicApplicationMaintainer(deployer, nodeRepository, Duration.ofMinutes(1),
- Duration.ofMinutes(30), overriddenNodesNeedingMaintenance);
- // Need to run twice, as only one app is deployed per run
- maintainer.run();
+ void runApplicationMaintainer(boolean waitForDeployments) {
maintainer.run();
+ while (waitForDeployments && fixture.maintainer.pendingDeployments() != 0);
}
NodeList getNodes(Node.State ... states) {
@@ -240,36 +282,32 @@ public class PeriodicApplicationMaintainerTest {
}
- public static class TestablePeriodicApplicationMaintainer extends PeriodicApplicationMaintainer {
+ private static class TestablePeriodicApplicationMaintainer extends PeriodicApplicationMaintainer {
- private Optional<List<Node>> overriddenNodesNeedingMaintenance;
-
- TestablePeriodicApplicationMaintainer(Deployer deployer, NodeRepository nodeRepository, Duration interval,
- Duration minTimeBetweenRedeployments, Optional<List<Node>> overriddenNodesNeedingMaintenance) {
- super(deployer, nodeRepository, interval, minTimeBetweenRedeployments, new JobControl(nodeRepository.database()));
+ private List<Node> overriddenNodesNeedingMaintenance;
+
+ TestablePeriodicApplicationMaintainer setOverriddenNodesNeedingMaintenance(List<Node> overriddenNodesNeedingMaintenance) {
this.overriddenNodesNeedingMaintenance = overriddenNodesNeedingMaintenance;
+ return this;
}
- @Override
- protected void deploy(ApplicationId application) {
- deployWithLock(application);
+ TestablePeriodicApplicationMaintainer(Deployer deployer, NodeRepository nodeRepository, Duration interval,
+ Duration minTimeBetweenRedeployments) {
+ super(deployer, nodeRepository, interval, minTimeBetweenRedeployments, new JobControl(nodeRepository.database()));
}
@Override
protected List<Node> nodesNeedingMaintenance() {
- if (overriddenNodesNeedingMaintenance.isPresent())
- return overriddenNodesNeedingMaintenance.get();
- return super.nodesNeedingMaintenance();
+ return overriddenNodesNeedingMaintenance != null
+ ? overriddenNodesNeedingMaintenance
+ : super.nodesNeedingMaintenance();
}
+ @Override
protected boolean shouldBeDeployedOnThisServer(ApplicationId application) {
return true;
}
- protected boolean waitInitially() {
- return false;
- }
-
}
}