diff options
author | Jon Bratseth <bratseth@yahoo-inc.com> | 2017-03-03 13:35:58 +0100 |
---|---|---|
committer | Jon Bratseth <bratseth@yahoo-inc.com> | 2017-03-03 13:35:58 +0100 |
commit | 565afee03968a0664ff12695f04897f6ef0a0669 (patch) | |
tree | 1f929b54bfaee8fdeca558ac921c60ed928de17b /node-repository | |
parent | ca866e60ef9c61b0725676a48514d5be44ab0266 (diff) |
Log reason when failing nodes
Diffstat (limited to 'node-repository')
10 files changed, 51 insertions, 48 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeRepository.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeRepository.java index 031170aa488..c57cbde79bb 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeRepository.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeRepository.java @@ -92,7 +92,7 @@ public class NodeRepository extends AbstractComponent { // read and write all nodes to make sure they are stored in the latest version of the serialized format for (Node.State state : Node.State.values()) - zkClient.writeTo(state, zkClient.getNodes(state)); + zkClient.writeTo(state, zkClient.getNodes(state), Optional.empty()); } // ---------------- Query API ---------------------------------------------------------------- @@ -264,7 +264,7 @@ public class NodeRepository extends AbstractComponent { if (node.state() != Node.State.dirty) throw new IllegalArgumentException("Can not set " + node + " ready. It is not dirty."); try (Mutex lock = lockUnallocated()) { - return zkClient.writeTo(Node.State.ready, nodes); + return zkClient.writeTo(Node.State.ready, nodes, Optional.empty()); } } @@ -277,11 +277,11 @@ public class NodeRepository extends AbstractComponent { } /** Reserve nodes. This method does <b>not</b> lock the node repository */ - public List<Node> reserve(List<Node> nodes) { return zkClient.writeTo(Node.State.reserved, nodes); } + public List<Node> reserve(List<Node> nodes) { return zkClient.writeTo(Node.State.reserved, nodes, Optional.empty()); } /** Activate nodes. This method does <b>not</b> lock the node repository */ public List<Node> activate(List<Node> nodes, NestedTransaction transaction) { - return zkClient.writeTo(Node.State.active, nodes, transaction); + return zkClient.writeTo(Node.State.active, nodes, Optional.empty(), transaction); } /** @@ -303,7 +303,8 @@ public class NodeRepository extends AbstractComponent { try (Mutex lock = lock(application)) { zkClient.writeTo(Node.State.inactive, zkClient.getNodes(application, Node.State.reserved, Node.State.active), - transaction); + Optional.empty(), transaction + ); } } @@ -313,12 +314,12 @@ public class NodeRepository extends AbstractComponent { * This method does <b>not</b> lock */ public List<Node> deactivate(List<Node> nodes, NestedTransaction transaction) { - return zkClient.writeTo(Node.State.inactive, nodes, transaction); + return zkClient.writeTo(Node.State.inactive, nodes, Optional.empty(), transaction); } /** Move nodes to the dirty state */ public List<Node> setDirty(List<Node> nodes) { - return performOn(NodeListFilter.from(nodes), node -> zkClient.writeTo(Node.State.dirty, node)); + return performOn(NodeListFilter.from(nodes), node -> zkClient.writeTo(Node.State.dirty, node, Optional.empty())); } /** @@ -342,8 +343,8 @@ public class NodeRepository extends AbstractComponent { * @return the node in its new state * @throws NotFoundException if the node is not found */ - public Node fail(String hostname) { - return move(hostname, Node.State.failed); + public Node fail(String hostname, String reason) { + return move(hostname, Node.State.failed, Optional.of(reason)); } /** @@ -351,8 +352,8 @@ public class NodeRepository extends AbstractComponent { * * @return List of all the failed nodes in their new state */ - public List<Node> failRecursively(String hostname) { - return moveRecursively(hostname, Node.State.failed); + public List<Node> failRecursively(String hostname, String reason) { + return moveRecursively(hostname, Node.State.failed, Optional.of(reason)); } /** @@ -362,7 +363,7 @@ public class NodeRepository extends AbstractComponent { * @throws NotFoundException if the node is not found */ public Node park(String hostname) { - return move(hostname, Node.State.parked); + return move(hostname, Node.State.parked, Optional.empty()); } /** @@ -371,7 +372,7 @@ public class NodeRepository extends AbstractComponent { * @return List of all the parked nodes in their new state */ public List<Node> parkRecursively(String hostname) { - return moveRecursively(hostname, Node.State.parked); + return moveRecursively(hostname, Node.State.parked, Optional.empty()); } /** @@ -381,30 +382,30 @@ public class NodeRepository extends AbstractComponent { * @throws NotFoundException if the node is not found */ public Node reactivate(String hostname) { - return move(hostname, Node.State.active); + return move(hostname, Node.State.active, Optional.empty()); } - private List<Node> moveRecursively(String hostname, Node.State toState) { + private List<Node> moveRecursively(String hostname, Node.State toState, Optional<String> reason) { List<Node> moved = getChildNodes(hostname).stream() - .map(child -> move(child, toState)) + .map(child -> move(child, toState, reason)) .collect(Collectors.toList()); - moved.add(move(hostname, toState)); + moved.add(move(hostname, toState, reason)); return moved; } - private Node move(String hostname, Node.State toState) { + private Node move(String hostname, Node.State toState, Optional<String> reason) { Node node = getNode(hostname).orElseThrow(() -> new NotFoundException("Could not move " + hostname + " to " + toState + ": Node not found")); - return move(node, toState); + return move(node, toState, reason); } - private Node move(Node node, Node.State toState) { + private Node move(Node node, Node.State toState, Optional<String> reason) { if (toState == Node.State.active && !node.allocation().isPresent()) { throw new IllegalArgumentException("Could not set " + node.hostname() + " active. It has no allocation."); } try (Mutex lock = lock(node)) { - return zkClient.writeTo(toState, node); + return zkClient.writeTo(toState, node, reason); } } @@ -444,7 +445,7 @@ public class NodeRepository extends AbstractComponent { * * @return the written node for convenience */ - public Node write(Node node) { return zkClient.writeTo(node.state(), node); } + public Node write(Node node) { return zkClient.writeTo(node.state(), node, Optional.empty()); } /** * Writes these nodes after they have changed some internal state but NOT changed their state field. @@ -461,7 +462,7 @@ public class NodeRepository extends AbstractComponent { if ( node.state() != state) throw new IllegalArgumentException("Multiple states: " + node.state() + " and " + state); } - return zkClient.writeTo(state, nodes); + return zkClient.writeTo(state, nodes, Optional.empty()); } /** diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DirtyExpirer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DirtyExpirer.java index 4d106098ab5..2178adf108d 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DirtyExpirer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DirtyExpirer.java @@ -33,7 +33,7 @@ public class DirtyExpirer extends Expirer { @Override protected void expire(List<Node> expired) { for (Node expiredNode : expired.stream().collect(Collectors.toList())) - nodeRepository.fail(expiredNode.hostname()); + nodeRepository.fail(expiredNode.hostname(), "Node is stuck in dirty"); } } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java index 5714faeda9a..b629e7f1517 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java @@ -75,16 +75,16 @@ public class NodeFailer extends Maintainer { for (Node node : readyNodesWhichAreDead( )) { // Docker hosts and nodes do not run Vespa services if (node.flavor().getType() == Flavor.Type.DOCKER_CONTAINER || node.type() == NodeType.host) continue; - nodeRepository().fail(node.hostname()); + nodeRepository().fail(node.hostname(), "Not receiving config requests from node"); } for (Node node : readyNodesWithHardwareFailure()) - nodeRepository().fail(node.hostname()); + nodeRepository().fail(node.hostname(), "Node has hardware failure"); // Active nodes for (Node node : determineActiveNodeDownStatus()) { Instant graceTimeEnd = node.history().event(History.Event.Type.down).get().at().plus(downTimeLimit); if (graceTimeEnd.isBefore(clock.instant()) && ! applicationSuspended(node) && failAllowedFor(node.type())) - failActive(node); + failActive(node, "Node is down"); } } @@ -215,7 +215,7 @@ public class NodeFailer extends Maintainer { * * @return whether node was successfully failed */ - private boolean failActive(Node node) { + private boolean failActive(Node node, String reason) { Optional<Deployment> deployment = deployer.deployFromLocalActive(node.allocation().get().owner(), Duration.ofMinutes(30)); if ( ! deployment.isPresent()) return false; // this will be done at another config server @@ -226,14 +226,14 @@ public class NodeFailer extends Maintainer { boolean allTenantNodesFailedOutSuccessfully = true; for (Node failingTenantNode : nodeRepository().getChildNodes(node.hostname())) { if (failingTenantNode.state() == Node.State.active) { - allTenantNodesFailedOutSuccessfully &= failActive(failingTenantNode); + allTenantNodesFailedOutSuccessfully &= failActive(failingTenantNode, reason); } else { - nodeRepository().fail(failingTenantNode.hostname()); + nodeRepository().fail(failingTenantNode.hostname(), reason); } } if (! allTenantNodesFailedOutSuccessfully) return false; - node = nodeRepository().fail(node.hostname()); + node = nodeRepository().fail(node.hostname(), reason); try { deployment.get().prepare(); deployment.get().activate(); diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/CuratorDatabaseClient.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/CuratorDatabaseClient.java index 31ad57a4a77..759bb89de94 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/CuratorDatabaseClient.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/CuratorDatabaseClient.java @@ -123,15 +123,15 @@ public class CuratorDatabaseClient { * @param nodes the list of nodes to write * @return the nodes in their persisted state */ - public List<Node> writeTo(Node.State toState, List<Node> nodes) { + public List<Node> writeTo(Node.State toState, List<Node> nodes, Optional<String> reason) { try (NestedTransaction nestedTransaction = new NestedTransaction()) { - List<Node> writtenNodes = writeTo(toState, nodes, nestedTransaction); + List<Node> writtenNodes = writeTo(toState, nodes, reason, nestedTransaction); nestedTransaction.commit(); return writtenNodes; } } - public Node writeTo(Node.State toState, Node node) { - return writeTo(toState, Collections.singletonList(node)).get(0); + public Node writeTo(Node.State toState, Node node, Optional<String> reason) { + return writeTo(toState, Collections.singletonList(node), reason).get(0); } /** @@ -140,10 +140,12 @@ public class CuratorDatabaseClient { * * @param toState the state to write the nodes to * @param nodes the list of nodes to write + * @param reason an optional reason to be logged, for humans * @param transaction the transaction to which write operations are added by this * @return the nodes in their state as it will be written if committed */ - public List<Node> writeTo(Node.State toState, List<Node> nodes, NestedTransaction transaction) { + public List<Node> writeTo(Node.State toState, List<Node> nodes, + Optional<String> reason, NestedTransaction transaction) { if (nodes.isEmpty()) return nodes; List<Node> writtenNodes = new ArrayList<>(nodes.size()); @@ -165,7 +167,7 @@ public class CuratorDatabaseClient { transaction.onCommitted(() -> { // schedule logging on commit of nodes which changed state for (Node node : nodes) { if (toState != node.state()) - log.log(LogLevel.INFO, "Moved to " + toState + ": " + node); + log.log(LogLevel.INFO, "Moved to " + toState + ": " + node + (reason.isPresent() ? ": " + reason.get() : "")); } }); return writtenNodes; diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/v2/NodesApiHandler.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/v2/NodesApiHandler.java index 707db02e6e5..0370ac4327a 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/v2/NodesApiHandler.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/v2/NodesApiHandler.java @@ -100,7 +100,7 @@ public class NodesApiHandler extends LoggingRequestHandler { return new MessageResponse("Moved " + lastElement(path) + " to ready"); } else if (path.startsWith("/nodes/v2/state/failed/")) { - List<Node> failedNodes = nodeRepository.failRecursively(lastElement(path)); + List<Node> failedNodes = nodeRepository.failRecursively(lastElement(path), "Failing for unit testing"); String failedHostnames = failedNodes.stream().map(Node::hostname).sorted().collect(Collectors.joining(", ")); return new MessageResponse("Moved " + failedHostnames + " to failed"); } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockNodeRepository.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockNodeRepository.java index 1d3eeca32f8..34507df749f 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockNodeRepository.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockNodeRepository.java @@ -88,7 +88,7 @@ public class MockNodeRepository extends NodeRepository { nodes.remove(7); nodes = setDirty(nodes); setReady(nodes); - fail("host5.yahoo.com"); + fail("host5.yahoo.com", "Failing to unit test"); setDirty("host55.yahoo.com"); ApplicationId app1 = ApplicationId.from(TenantName.from("tenant1"), ApplicationName.from("application1"), InstanceName.from("instance1")); diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/ApplicationMaintainerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/ApplicationMaintainerTest.java index 549ea90a4e2..c976c7f1ee2 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/ApplicationMaintainerTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/ApplicationMaintainerTest.java @@ -67,8 +67,8 @@ public class ApplicationMaintainerTest { fixture.activate(); // Fail and park some nodes - nodeRepository.fail(nodeRepository.getNodes(fixture.app1).get(3).hostname()); - nodeRepository.fail(nodeRepository.getNodes(fixture.app2).get(0).hostname()); + nodeRepository.fail(nodeRepository.getNodes(fixture.app1).get(3).hostname(), "Failing to unit test"); + nodeRepository.fail(nodeRepository.getNodes(fixture.app2).get(0).hostname(), "Failing to unit test"); nodeRepository.park(nodeRepository.getNodes(fixture.app2).get(4).hostname()); int failedInApp1 = 1; int failedOrParkedInApp2 = 2; diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirerTest.java index 02833d3e229..e73ca518beb 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirerTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirerTest.java @@ -121,9 +121,9 @@ public class FailedExpirerTest { assertEquals(3, nodeRepository.getNodes(NodeType.tenant, Node.State.active).size()); // Fail the nodes - nodeRepository.fail("node1"); - nodeRepository.fail("node2"); - nodeRepository.fail("node3"); + nodeRepository.fail("node1", "Failing to unit test"); + nodeRepository.fail("node2", "Failing to unit test"); + nodeRepository.fail("node3", "Failing to unit test"); assertEquals(3, nodeRepository.getNodes(NodeType.tenant, Node.State.failed).size()); // Failure times out diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/NodeTypeProvisioningTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/NodeTypeProvisioningTest.java index fd6d5e8e455..2b344930e07 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/NodeTypeProvisioningTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/NodeTypeProvisioningTest.java @@ -70,9 +70,9 @@ public class NodeTypeProvisioningTest { { // Remove 3 proxies then redeploy List<Node> nodes = tester.nodeRepository().getNodes(NodeType.proxy, Node.State.active); - tester.nodeRepository().fail(nodes.get(0).hostname()); - tester.nodeRepository().fail(nodes.get(1).hostname()); - tester.nodeRepository().fail(nodes.get(5).hostname()); + tester.nodeRepository().fail(nodes.get(0).hostname(), "Failing to unit test"); + tester.nodeRepository().fail(nodes.get(1).hostname(), "Failing to unit test"); + tester.nodeRepository().fail(nodes.get(5).hostname(), "Failing to unit test"); List<HostSpec> hosts = deployProxies(application, tester); assertEquals(10, hosts.size()); diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/ProvisioningTester.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/ProvisioningTester.java index 26b1b45b9c6..0b6e1c872bd 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/ProvisioningTester.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/ProvisioningTester.java @@ -173,7 +173,7 @@ public class ProvisioningTester implements AutoCloseable { public void fail(HostSpec host) { int beforeFailCount = nodeRepository.getNode(host.hostname(), Node.State.active).get().status().failCount(); - Node failedNode = nodeRepository.fail(host.hostname()); + Node failedNode = nodeRepository.fail(host.hostname(), "Failing to unit test"); assertTrue(nodeRepository.getNodes(NodeType.tenant, Node.State.failed).contains(failedNode)); assertEquals(beforeFailCount + 1, failedNode.status().failCount()); } |