diff options
author | Martin Polden <martin.polden@gmail.com> | 2017-02-08 13:04:00 +0100 |
---|---|---|
committer | Martin Polden <martin.polden@gmail.com> | 2017-02-08 13:15:24 +0100 |
commit | 3a1362d08027b36ec20e29a4ab570864f793abdf (patch) | |
tree | f9a6fba2981dc3ab98131d0c5b209b9ea6d9b1ed /node-repository | |
parent | 1d2fda3acf3035d628aa71bcb82bbcb88f09d5ad (diff) |
Fix race condition in application maintainer
Diffstat (limited to 'node-repository')
2 files changed, 86 insertions, 20 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ApplicationMaintainer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ApplicationMaintainer.java index 32478473111..2d5b5be4f37 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ApplicationMaintainer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ApplicationMaintainer.java @@ -4,13 +4,14 @@ package com.yahoo.vespa.hosted.provision.maintenance; import com.yahoo.config.provision.ApplicationId; import com.yahoo.config.provision.Deployer; import com.yahoo.config.provision.Deployment; -import com.yahoo.config.provision.NodeType; +import com.yahoo.transaction.Mutex; import com.yahoo.vespa.hosted.provision.Node; import com.yahoo.vespa.hosted.provision.NodeRepository; import java.time.Duration; import java.util.Optional; import java.util.Set; +import java.util.function.Function; import java.util.logging.Level; import java.util.stream.Collectors; @@ -26,24 +27,36 @@ import java.util.stream.Collectors; public class ApplicationMaintainer extends Maintainer { private final Deployer deployer; + private final Function<NodeRepository, Set<ApplicationId>> activeApplicationsGetter; public ApplicationMaintainer(Deployer deployer, NodeRepository nodeRepository, Duration rate) { + this(deployer, nodeRepository, rate, ApplicationMaintainer::getActiveApplications); + } + + ApplicationMaintainer(Deployer deployer, NodeRepository nodeRepository, Duration rate, + Function<NodeRepository, Set<ApplicationId>> activeApplicationsGetter) { super(nodeRepository, rate); this.deployer = deployer; + this.activeApplicationsGetter = activeApplicationsGetter; } @Override protected void maintain() { - Set<ApplicationId> applications = - nodeRepository().getNodes(Node.State.active).stream().map(node -> node.allocation().get().owner()).collect(Collectors.toSet()); + Set<ApplicationId> applications = activeApplicationsGetter.apply(nodeRepository()); for (ApplicationId application : applications) { try { - Optional<Deployment> deployment = deployer.deployFromLocalActive(application, Duration.ofMinutes(30)); - if ( ! deployment.isPresent()) continue; // this will be done at another config server + // An application might change it's state between the time the set of applications is retrieved and the + // time deployment happens. Lock on application and check if it's still active + try (Mutex lock = nodeRepository().lock(application)) { + if (isApplicationActive(application)) { + Optional<Deployment> deployment = deployer.deployFromLocalActive(application, Duration.ofMinutes(30)); + if ( ! deployment.isPresent()) continue; // this will be done at another config server - deployment.get().prepare(); - deployment.get().activate(); + deployment.get().prepare(); + deployment.get().activate(); + } + } } catch (RuntimeException e) { log.log(Level.WARNING, "Exception on maintenance redeploy of " + application, e); @@ -54,4 +67,14 @@ public class ApplicationMaintainer extends Maintainer { @Override public String toString() { return "Periodic application redeployer"; } + private boolean isApplicationActive(ApplicationId application) { + return !nodeRepository().getNodes(application, Node.State.active).isEmpty(); + } + + static Set<ApplicationId> getActiveApplications(NodeRepository nodeRepository) { + return nodeRepository.getNodes(Node.State.active).stream() + .map(node -> node.allocation().get().owner()) + .collect(Collectors.toSet()); + } + } diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/ApplicationMaintainerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/ApplicationMaintainerTest.java index e2bf9ede545..549ea90a4e2 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/ApplicationMaintainerTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/ApplicationMaintainerTest.java @@ -8,6 +8,7 @@ import com.yahoo.config.provision.ClusterSpec; import com.yahoo.config.provision.Environment; import com.yahoo.config.provision.HostSpec; import com.yahoo.config.provision.InstanceName; +import com.yahoo.config.provision.NodeFlavors; import com.yahoo.config.provision.NodeType; import com.yahoo.config.provision.RegionName; import com.yahoo.config.provision.TenantName; @@ -16,14 +17,14 @@ import com.yahoo.test.ManualClock; import com.yahoo.transaction.NestedTransaction; import com.yahoo.vespa.curator.Curator; import com.yahoo.vespa.curator.mock.MockCurator; +import com.yahoo.vespa.curator.transaction.CuratorTransaction; import com.yahoo.vespa.hosted.provision.Node; import com.yahoo.vespa.hosted.provision.NodeList; import com.yahoo.vespa.hosted.provision.NodeRepository; -import com.yahoo.config.provision.NodeFlavors; import com.yahoo.vespa.hosted.provision.provisioning.NodeRepositoryProvisioner; -import com.yahoo.vespa.curator.transaction.CuratorTransaction; import com.yahoo.vespa.hosted.provision.testutils.FlavorConfigBuilder; import com.yahoo.vespa.hosted.provision.testutils.MockNameResolver; +import org.junit.Before; import org.junit.Test; import java.time.Duration; @@ -32,6 +33,9 @@ import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Optional; +import java.util.Set; +import java.util.function.Function; +import java.util.stream.Collectors; import static org.junit.Assert.assertEquals; @@ -40,21 +44,25 @@ import static org.junit.Assert.assertEquals; */ public class ApplicationMaintainerTest { - private Curator curator = new MockCurator(); + private static final NodeFlavors nodeFlavors = FlavorConfigBuilder.createDummies("default"); - @Test - public void test_application_maintenance() throws InterruptedException { - ManualClock clock = new ManualClock(); + private NodeRepository nodeRepository; + private Fixture fixture; + + @Before + public void before() { + Curator curator = new MockCurator(); Zone zone = new Zone(Environment.prod, RegionName.from("us-east")); - NodeFlavors nodeFlavors = FlavorConfigBuilder.createDummies("default"); - NodeRepository nodeRepository = new NodeRepository(nodeFlavors, curator, clock, zone, - new MockNameResolver().mockAnyLookup()); + this.nodeRepository = new NodeRepository(nodeFlavors, curator, new ManualClock(), zone, + new MockNameResolver().mockAnyLookup()); + this.fixture = new Fixture(zone, nodeRepository, nodeFlavors, curator); + } + @Test + public void test_application_maintenance() throws InterruptedException { createReadyNodes(15, nodeRepository, nodeFlavors); createHostNodes(2, nodeRepository, nodeFlavors); - Fixture fixture = new Fixture(zone, nodeRepository, nodeFlavors); - // Create applications fixture.activate(); @@ -96,6 +104,29 @@ public class ApplicationMaintainerTest { reactivatedInApp1, fixture.getNodes(Node.State.inactive).size()); } + @Test + public void deleted_application_is_not_reactivated() { + createReadyNodes(15, nodeRepository, nodeFlavors); + createHostNodes(2, nodeRepository, nodeFlavors); + + // Create applications + fixture.activate(); + + // Freeze active nodes to simulate an application being deleted during a maintenance run + Set<ApplicationId> frozenActiveApplications = nodeRepository.getNodes(Node.State.active).stream() + .map(n -> n.allocation().get().owner()) + .collect(Collectors.toSet()); + + // Remove one application without letting the application maintainer know about it + fixture.remove(fixture.app2); + assertEquals(fixture.wantedNodesApp2, nodeRepository.getNodes(fixture.app2, Node.State.inactive).size()); + + // Nodes belonging to app2 are inactive after maintenance + fixture.runApplicationMaintainer((ignored) -> frozenActiveApplications); + assertEquals("Inactive nodes were incorrectly activated after maintenance", fixture.wantedNodesApp2, + nodeRepository.getNodes(fixture.app2, Node.State.inactive).size()); + } + private void createReadyNodes(int count, NodeRepository nodeRepository, NodeFlavors nodeFlavors) { List<Node> nodes = new ArrayList<>(count); for (int i = 0; i < count; i++) @@ -118,6 +149,7 @@ public class ApplicationMaintainerTest { final NodeRepository nodeRepository; final NodeRepositoryProvisioner provisioner; + final Curator curator; final ApplicationId app1 = ApplicationId.from(TenantName.from("foo1"), ApplicationName.from("bar"), InstanceName.from("fuz")); final ApplicationId app2 = ApplicationId.from(TenantName.from("foo2"), ApplicationName.from("bar"), InstanceName.from("fuz")); @@ -126,8 +158,9 @@ public class ApplicationMaintainerTest { final int wantedNodesApp1 = 5; final int wantedNodesApp2 = 7; - Fixture(Zone zone, NodeRepository nodeRepository, NodeFlavors flavors) { + Fixture(Zone zone, NodeRepository nodeRepository, NodeFlavors flavors, Curator curator) { this.nodeRepository = nodeRepository; + this.curator = curator; this.provisioner = new NodeRepositoryProvisioner(nodeRepository, flavors, zone); } @@ -145,14 +178,24 @@ public class ApplicationMaintainerTest { transaction.commit(); } + void remove(ApplicationId application) { + NestedTransaction transaction = new NestedTransaction().add(new CuratorTransaction(curator)); + provisioner.remove(transaction, application); + transaction.commit(); + } + void runApplicationMaintainer() { + runApplicationMaintainer(ApplicationMaintainer::getActiveApplications); + } + + void runApplicationMaintainer(Function<NodeRepository, Set<ApplicationId>> activeApplicationsGetter) { Map<ApplicationId, MockDeployer.ApplicationContext> apps = new HashMap<>(); apps.put(app1, new MockDeployer.ApplicationContext(app1, clusterApp1, Capacity.fromNodeCount(wantedNodesApp1, Optional.of("default")), 1)); apps.put(app2, new MockDeployer.ApplicationContext(app2, clusterApp2, Capacity.fromNodeCount(wantedNodesApp2, Optional.of("default")), 1)); MockDeployer deployer = new MockDeployer(provisioner, apps); - new ApplicationMaintainer(deployer, nodeRepository, Duration.ofMinutes(30)).run(); + new ApplicationMaintainer(deployer, nodeRepository, Duration.ofMinutes(30), activeApplicationsGetter).run(); } NodeList getNodes(Node.State ... states) { |