diff options
author | Jon Bratseth <bratseth@yahoo-inc.com> | 2016-06-23 15:40:40 +0200 |
---|---|---|
committer | Jon Bratseth <bratseth@yahoo-inc.com> | 2016-06-23 15:40:40 +0200 |
commit | c2b6d55eaffe5ee072294a7a23d95a7406f92d6b (patch) | |
tree | 719f0e20285021a0b2bbfa62afd257525a764477 /node-repository | |
parent | 3336d15a4b569991877ad806c26210b9e6e2e001 (diff) |
Add support for parking nodes
Diffstat (limited to 'node-repository')
14 files changed, 78 insertions, 41 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/Node.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/Node.java index d4a955cf746..a88fb016e23 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/Node.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/Node.java @@ -244,11 +244,18 @@ public final class Node { dirty, /** This node has failed and must be repaired or removed. The node retains any allocation data for diagnosis. */ - failed; + failed, + + /** + * This node should not currently be used. + * This state follows the same rules as failed except that it will never be automatically moved out of + * this state. + */ + parked; /** Returns whether this is a state where the node is assigned to an application */ public boolean isAllocated() { - return this == reserved || this == active || this == inactive || this == failed; + return this == reserved || this == active || this == inactive || this == failed || this == parked; } } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeRepository.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeRepository.java index aff65652399..11892c69c1c 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeRepository.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeRepository.java @@ -49,7 +49,7 @@ import java.util.stream.Collectors; // 1) (new) - > provisioned -> ready -> reserved -> active -> inactive -> dirty -> ready // 2) inactive -> reserved // 3) reserved -> dirty -// 3) * -> failed -> dirty | active | (removed) +// 3) * -> failed | parked -> dirty | active | (removed) // Nodes have an application assigned when in states reserved, active and inactive. // Nodes might have an application assigned in dirty. 
public class NodeRepository extends AbstractComponent { @@ -180,11 +180,16 @@ public class NodeRepository extends AbstractComponent { return performOn(NodeListFilter.from(nodes), node -> zkClient.writeTo(Node.State.dirty, node)); } - /** Deallocate a node which is in the failed state. Use this to recycle failed nodes which have been repaired. */ + /** + * Deallocate a node which is in the failed or parked state. + * Use this to recycle failed nodes which have been repaired or put on hold. + */ public Node deallocate(String hostname) { Optional<Node> nodeToDeallocate = getNode(Node.State.failed, hostname); if ( ! nodeToDeallocate.isPresent()) - throw new IllegalArgumentException("Could not deallocate " + hostname + ": Node not found in the failed state"); + nodeToDeallocate = getNode(Node.State.parked, hostname); + if ( ! nodeToDeallocate.isPresent()) + throw new IllegalArgumentException("Could not deallocate " + hostname + ": No such node in the failed or parked state"); return deallocate(Collections.singletonList(nodeToDeallocate.get())).get(0); } @@ -199,12 +204,22 @@ public class NodeRepository extends AbstractComponent { } /** - * Moves a previously failed node back to the active state. + * Parks this node and returns it in its new state. + * + * @return the node in its new state + * @throws IllegalArgumentException if the node is not found + */ + public Node park(String hostname) { + return move(hostname, Node.State.parked); + } + + /** + * Moves a previously failed or parked node back to the active state. * * @return the node in its new state * @throws IllegalArgumentException if the node is not found */ - public Node unfail(String hostname) { + public Node reactivate(String hostname) { return move(hostname, Node.State.active); } @@ -218,15 +233,18 @@ public class NodeRepository extends AbstractComponent { } /** - * Removes a node. A node must be in the failed state before it can be removed. + * Removes a node. 
A node must be in the failed or parked state before it can be removed. * * @return true if the node was removed, false if it was not found */ public boolean remove(String hostname) { Optional<Node> nodeToRemove = getNode(Node.State.failed, hostname); - if ( ! nodeToRemove.isPresent()) return false; + if ( ! nodeToRemove.isPresent()) + nodeToRemove = getNode(Node.State.parked, hostname); + if ( ! nodeToRemove.isPresent()) + return false; try (Mutex lock = lock(nodeToRemove.get())) { - return zkClient.removeNode(Node.State.failed, hostname); + return zkClient.removeNode(nodeToRemove.get().state(), hostname); } } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ApplicationMaintainer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ApplicationMaintainer.java index d12c7f2a5ae..ce7ae429c40 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ApplicationMaintainer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ApplicationMaintainer.java @@ -18,7 +18,7 @@ import java.util.stream.Collectors; * The application maintainer regularly redeploys all applications. * This is necessary because applications may gain and lose active nodes due to nodes being moved to and from the * failed state. This is corrected by redeploying the applications periodically. - * It can not (at this point) be done reliably synchronously as part of the fail/unfail call due to the need for this + * It can not (at this point) be done reliably synchronously as part of the fail/reactivate call due to the need for this * to happen at a node having the deployer. 
* * @author bratseth diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java index c18aacea284..d02a66ec686 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java @@ -3,7 +3,6 @@ package com.yahoo.vespa.hosted.provision.maintenance; import com.yahoo.config.provision.Deployer; import com.yahoo.config.provision.Deployment; -import com.yahoo.log.LogLevel; import com.yahoo.transaction.Mutex; import com.yahoo.vespa.applicationmodel.ApplicationInstance; import com.yahoo.vespa.applicationmodel.ServiceCluster; @@ -147,7 +146,7 @@ public class NodeFailer extends Maintainer { catch (RuntimeException e) { // The expected reason for deployment to fail here is that there is no capacity available to redeploy. // In that case we should leave the node in the active state to avoid failing additional nodes. 
- nodeRepository().unfail(node.hostname()); + nodeRepository().reactivate(node.hostname()); log.log(Level.WARNING, "Attempted to fail " + node + " for " + node.allocation().get().owner() + ", but redeploying without the node failed", e); } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/filter/StateFilter.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/filter/StateFilter.java index a605835c11c..e7368e7770c 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/filter/StateFilter.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/filter/StateFilter.java @@ -48,5 +48,4 @@ public class StateFilter extends NodeFilter { return new StateFilter(HostFilter.split(states).stream().map(Node.State::valueOf).collect(Collectors.toSet()), next); } - } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/CuratorDatabaseClient.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/CuratorDatabaseClient.java index 6805862baf3..813941de1eb 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/CuratorDatabaseClient.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/CuratorDatabaseClient.java @@ -228,13 +228,14 @@ public class CuratorDatabaseClient { private String toDir(Node.State state) { switch (state) { - case provisioned: return "provisioned"; - case ready: return "ready"; - case reserved: return "reserved"; case active: return "allocated"; // legacy name - case inactive: return "deallocated"; // legacy name case dirty: return "dirty"; case failed: return "failed"; + case inactive: return "deallocated"; // legacy name + case parked : return "parked"; + case provisioned: return "provisioned"; + case ready: return "ready"; + case reserved: return "reserved"; default: throw new RuntimeException("Node state " + state + " does not map to a 
directory name"); } } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/NodeSerializer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/NodeSerializer.java index 004cceee7cb..e4b745bcfd2 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/NodeSerializer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/NodeSerializer.java @@ -212,7 +212,7 @@ public class NodeSerializer { switch (typeString) { case nodeTypeTenant : return Node.Type.tenant; case nodeTypeHost : return Node.Type.host; - // TODO: Remove this when all data is converted + // TODO: Remove this when 6.13 is released everywhere case "" : return Node.Type.tenant; } throw new IllegalArgumentException("Unknown node type '" + typeString + "'"); diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodeStateSerializer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodeStateSerializer.java index 321a75421a2..fa74605b32f 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodeStateSerializer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodeStateSerializer.java @@ -28,6 +28,7 @@ public class NodeStateSerializer { addMapping(Node.State.dirty, "dirty"); addMapping(Node.State.failed, "failed"); addMapping(Node.State.inactive, "inactive"); + addMapping(Node.State.parked, "parked"); addMapping(Node.State.provisioned, "provisioned"); addMapping(Node.State.ready, "ready"); addMapping(Node.State.reserved, "reserved"); diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/v2/NodesApiHandler.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/v2/NodesApiHandler.java index 8c388fbd4db..6b03fcaa3eb 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/v2/NodesApiHandler.java 
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/v2/NodesApiHandler.java @@ -99,12 +99,16 @@ public class NodesApiHandler extends LoggingRequestHandler { nodeRepository.fail(lastElement(path)); return new MessageResponse("Moved " + lastElement(path) + " to failed"); } + else if (path.startsWith("/nodes/v2/state/parked/")) { + nodeRepository.park(lastElement(path)); + return new MessageResponse("Moved " + lastElement(path) + " to parked"); + } else if (path.startsWith("/nodes/v2/state/dirty/")) { nodeRepository.deallocate(lastElement(path)); return new MessageResponse("Moved " + lastElement(path) + " to dirty"); } else if (path.startsWith("/nodes/v2/state/active/")) { - nodeRepository.unfail(lastElement(path)); + nodeRepository.reactivate(lastElement(path)); return new MessageResponse("Moved " + lastElement(path) + " to active"); } else { diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/ApplicationMaintainerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/ApplicationMaintainerTest.java index 636d56da1df..e034b185a02 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/ApplicationMaintainerTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/ApplicationMaintainerTest.java @@ -56,43 +56,42 @@ public class ApplicationMaintainerTest { // Create applications fixture.activate(); - // Fail some nodes + // Fail and park some nodes nodeRepository.fail(nodeRepository.getNodes(fixture.app1).get(3).hostname()); nodeRepository.fail(nodeRepository.getNodes(fixture.app2).get(0).hostname()); - nodeRepository.fail(nodeRepository.getNodes(fixture.app2).get(4).hostname()); + nodeRepository.park(nodeRepository.getNodes(fixture.app2).get(4).hostname()); int failedInApp1 = 1; - int failedInApp2 = 2; + int failedOrParkedInApp2 = 2; assertEquals(fixture.wantedNodesApp1 - failedInApp1, 
nodeRepository.getNodes(fixture.app1, Node.State.active).size()); - assertEquals(fixture.wantedNodesApp2 - failedInApp2, nodeRepository.getNodes(fixture.app2, Node.State.active).size()); - assertEquals(failedInApp1 + failedInApp2, nodeRepository.getNodes(Node.Type.tenant, Node.State.failed).size()); + assertEquals(fixture.wantedNodesApp2 - failedOrParkedInApp2, nodeRepository.getNodes(fixture.app2, Node.State.active).size()); + assertEquals(failedInApp1 + failedOrParkedInApp2, nodeRepository.getNodes(Node.Type.tenant, Node.State.failed, Node.State.parked).size()); assertEquals(3, nodeRepository.getNodes(Node.Type.tenant, Node.State.ready).size()); assertEquals(2, nodeRepository.getNodes(Node.Type.host, Node.State.ready).size()); - // Cause maintenance deployment which will allocate replacement nodes fixture.runApplicationMaintainer(); assertEquals(fixture.wantedNodesApp1, nodeRepository.getNodes(fixture.app1, Node.State.active).size()); assertEquals(fixture.wantedNodesApp2, nodeRepository.getNodes(fixture.app2, Node.State.active).size()); assertEquals(0, nodeRepository.getNodes(Node.Type.tenant, Node.State.ready).size()); - // Unfail the previously failed nodes - nodeRepository.unfail(nodeRepository.getNodes(Node.Type.tenant, Node.State.failed).get(0).hostname()); - nodeRepository.unfail(nodeRepository.getNodes(Node.Type.tenant, Node.State.failed).get(0).hostname()); - nodeRepository.unfail(nodeRepository.getNodes(Node.Type.tenant, Node.State.failed).get(0).hostname()); - int unfailedInApp1 = 1; - int unfailedInApp2 = 2; + // Reactivate the previously failed nodes + nodeRepository.reactivate(nodeRepository.getNodes(Node.Type.tenant, Node.State.failed).get(0).hostname()); + nodeRepository.reactivate(nodeRepository.getNodes(Node.Type.tenant, Node.State.failed).get(0).hostname()); + nodeRepository.reactivate(nodeRepository.getNodes(Node.Type.tenant, Node.State.parked).get(0).hostname()); + int reactivatedInApp1 = 1; + int reactivatedInApp2 = 2; assertEquals(0, 
nodeRepository.getNodes(Node.Type.tenant, Node.State.failed).size()); - assertEquals(fixture.wantedNodesApp1 + unfailedInApp1, nodeRepository.getNodes(fixture.app1, Node.State.active).size()); - assertEquals(fixture.wantedNodesApp2 + unfailedInApp2, nodeRepository.getNodes(fixture.app2, Node.State.active).size()); - assertEquals("The unfailed nodes are now active but not part of the application", + assertEquals(fixture.wantedNodesApp1 + reactivatedInApp1, nodeRepository.getNodes(fixture.app1, Node.State.active).size()); + assertEquals(fixture.wantedNodesApp2 + reactivatedInApp2, nodeRepository.getNodes(fixture.app2, Node.State.active).size()); + assertEquals("The reactivated nodes are now active but not part of the application", 0, fixture.getNodes(Node.State.active).retired().size()); // Cause maintenance deployment which will update the applications with the re-activated nodes fixture.runApplicationMaintainer(); assertEquals("Superflous content nodes are retired", - unfailedInApp2, fixture.getNodes(Node.State.active).retired().size()); + reactivatedInApp2, fixture.getNodes(Node.State.active).retired().size()); assertEquals("Superflous container nodes are deactivated (this makes little point for container nodes)", - unfailedInApp1, fixture.getNodes(Node.State.inactive).size()); + reactivatedInApp1, fixture.getNodes(Node.State.inactive).size()); } private void createReadyNodes(int count, NodeRepository nodeRepository, NodeFlavors nodeFlavors) { diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/monitoring/ProvisionMetricsTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/monitoring/ProvisionMetricsTest.java index ac6532b1b4a..0fe507edbe5 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/monitoring/ProvisionMetricsTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/monitoring/ProvisionMetricsTest.java @@ -35,6 +35,7 @@ public class ProvisionMetricsTest { final 
Map<String, Number> expectedMetrics = new HashMap<>(); expectedMetrics.put("hostedVespa.provisionedHosts", 1); + expectedMetrics.put("hostedVespa.parkedHosts", 0); expectedMetrics.put("hostedVespa.readyHosts", 0); expectedMetrics.put("hostedVespa.reservedHosts", 0); expectedMetrics.put("hostedVespa.activeHosts", 0); diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/v2/RestApiTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/v2/RestApiTest.java index fb7209fb308..c8993557cb3 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/v2/RestApiTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/v2/RestApiTest.java @@ -101,12 +101,12 @@ public class RestApiTest { new byte[0], Request.Method.PUT), "{\"message\":\"Moved host8.yahoo.com to active\"}"); - // PUT a node in failed ... - assertResponse(new Request("http://localhost:8080/nodes/v2/state/failed/host8.yahoo.com", + // PUT a node in parked ... + assertResponse(new Request("http://localhost:8080/nodes/v2/state/parked/host8.yahoo.com", new byte[0], Request.Method.PUT), - "{\"message\":\"Moved host8.yahoo.com to failed\"}"); + "{\"message\":\"Moved host8.yahoo.com to parked\"}"); assertResponseContains(new Request("http://localhost:8080/nodes/v2/node/host8.yahoo.com"), - "\"state\":\"failed\""); + "\"state\":\"parked\""); // ... 
and delete it assertResponse(new Request("http://localhost:8080/nodes/v2/node/host8.yahoo.com", new byte[0], Request.Method.DELETE), diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/v2/responses/states-recursive.json b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/v2/responses/states-recursive.json index 9dd85385f4c..02d8e473f27 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/v2/responses/states-recursive.json +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/v2/responses/states-recursive.json @@ -43,6 +43,11 @@ "nodes": [ @include(node5.json) ] + }, + "parked": { + "url": "http://localhost:8080/nodes/v2/state/parked", + "nodes": [ + ] } } } diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/v2/responses/states.json b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/v2/responses/states.json index b2d7354a6c9..4b2de7532dd 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/v2/responses/states.json +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/v2/responses/states.json @@ -20,6 +20,9 @@ }, "failed": { "url": "http://localhost:8080/nodes/v2/state/failed" + }, + "parked": { + "url": "http://localhost:8080/nodes/v2/state/parked" } } }
\ No newline at end of file |