summaryrefslogtreecommitdiffstats
path: root/node-repository
diff options
context:
space:
mode:
authorJon Bratseth <bratseth@yahoo-inc.com>2017-03-03 13:35:58 +0100
committerJon Bratseth <bratseth@yahoo-inc.com>2017-03-03 13:35:58 +0100
commit565afee03968a0664ff12695f04897f6ef0a0669 (patch)
tree1f929b54bfaee8fdeca558ac921c60ed928de17b /node-repository
parentca866e60ef9c61b0725676a48514d5be44ab0266 (diff)
Log reason when failing nodes
Diffstat (limited to 'node-repository')
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeRepository.java47
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DirtyExpirer.java2
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java14
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/CuratorDatabaseClient.java14
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/v2/NodesApiHandler.java2
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockNodeRepository.java2
-rw-r--r--node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/ApplicationMaintainerTest.java4
-rw-r--r--node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirerTest.java6
-rw-r--r--node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/NodeTypeProvisioningTest.java6
-rw-r--r--node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/ProvisioningTester.java2
10 files changed, 51 insertions, 48 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeRepository.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeRepository.java
index 031170aa488..c57cbde79bb 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeRepository.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeRepository.java
@@ -92,7 +92,7 @@ public class NodeRepository extends AbstractComponent {
// read and write all nodes to make sure they are stored in the latest version of the serialized format
for (Node.State state : Node.State.values())
- zkClient.writeTo(state, zkClient.getNodes(state));
+ zkClient.writeTo(state, zkClient.getNodes(state), Optional.empty());
}
// ---------------- Query API ----------------------------------------------------------------
@@ -264,7 +264,7 @@ public class NodeRepository extends AbstractComponent {
if (node.state() != Node.State.dirty)
throw new IllegalArgumentException("Can not set " + node + " ready. It is not dirty.");
try (Mutex lock = lockUnallocated()) {
- return zkClient.writeTo(Node.State.ready, nodes);
+ return zkClient.writeTo(Node.State.ready, nodes, Optional.empty());
}
}
@@ -277,11 +277,11 @@ public class NodeRepository extends AbstractComponent {
}
/** Reserve nodes. This method does <b>not</b> lock the node repository */
- public List<Node> reserve(List<Node> nodes) { return zkClient.writeTo(Node.State.reserved, nodes); }
+ public List<Node> reserve(List<Node> nodes) { return zkClient.writeTo(Node.State.reserved, nodes, Optional.empty()); }
/** Activate nodes. This method does <b>not</b> lock the node repository */
public List<Node> activate(List<Node> nodes, NestedTransaction transaction) {
- return zkClient.writeTo(Node.State.active, nodes, transaction);
+ return zkClient.writeTo(Node.State.active, nodes, Optional.empty(), transaction);
}
/**
@@ -303,7 +303,8 @@ public class NodeRepository extends AbstractComponent {
try (Mutex lock = lock(application)) {
zkClient.writeTo(Node.State.inactive,
zkClient.getNodes(application, Node.State.reserved, Node.State.active),
- transaction);
+ Optional.empty(), transaction
+ );
}
}
@@ -313,12 +314,12 @@ public class NodeRepository extends AbstractComponent {
* This method does <b>not</b> lock
*/
public List<Node> deactivate(List<Node> nodes, NestedTransaction transaction) {
- return zkClient.writeTo(Node.State.inactive, nodes, transaction);
+ return zkClient.writeTo(Node.State.inactive, nodes, Optional.empty(), transaction);
}
/** Move nodes to the dirty state */
public List<Node> setDirty(List<Node> nodes) {
- return performOn(NodeListFilter.from(nodes), node -> zkClient.writeTo(Node.State.dirty, node));
+ return performOn(NodeListFilter.from(nodes), node -> zkClient.writeTo(Node.State.dirty, node, Optional.empty()));
}
/**
@@ -342,8 +343,8 @@ public class NodeRepository extends AbstractComponent {
* @return the node in its new state
* @throws NotFoundException if the node is not found
*/
- public Node fail(String hostname) {
- return move(hostname, Node.State.failed);
+ public Node fail(String hostname, String reason) {
+ return move(hostname, Node.State.failed, Optional.of(reason));
}
/**
@@ -351,8 +352,8 @@ public class NodeRepository extends AbstractComponent {
*
* @return List of all the failed nodes in their new state
*/
- public List<Node> failRecursively(String hostname) {
- return moveRecursively(hostname, Node.State.failed);
+ public List<Node> failRecursively(String hostname, String reason) {
+ return moveRecursively(hostname, Node.State.failed, Optional.of(reason));
}
/**
@@ -362,7 +363,7 @@ public class NodeRepository extends AbstractComponent {
* @throws NotFoundException if the node is not found
*/
public Node park(String hostname) {
- return move(hostname, Node.State.parked);
+ return move(hostname, Node.State.parked, Optional.empty());
}
/**
@@ -371,7 +372,7 @@ public class NodeRepository extends AbstractComponent {
* @return List of all the parked nodes in their new state
*/
public List<Node> parkRecursively(String hostname) {
- return moveRecursively(hostname, Node.State.parked);
+ return moveRecursively(hostname, Node.State.parked, Optional.empty());
}
/**
@@ -381,30 +382,30 @@ public class NodeRepository extends AbstractComponent {
* @throws NotFoundException if the node is not found
*/
public Node reactivate(String hostname) {
- return move(hostname, Node.State.active);
+ return move(hostname, Node.State.active, Optional.empty());
}
- private List<Node> moveRecursively(String hostname, Node.State toState) {
+ private List<Node> moveRecursively(String hostname, Node.State toState, Optional<String> reason) {
List<Node> moved = getChildNodes(hostname).stream()
- .map(child -> move(child, toState))
+ .map(child -> move(child, toState, reason))
.collect(Collectors.toList());
- moved.add(move(hostname, toState));
+ moved.add(move(hostname, toState, reason));
return moved;
}
- private Node move(String hostname, Node.State toState) {
+ private Node move(String hostname, Node.State toState, Optional<String> reason) {
Node node = getNode(hostname).orElseThrow(() ->
new NotFoundException("Could not move " + hostname + " to " + toState + ": Node not found"));
- return move(node, toState);
+ return move(node, toState, reason);
}
- private Node move(Node node, Node.State toState) {
+ private Node move(Node node, Node.State toState, Optional<String> reason) {
if (toState == Node.State.active && !node.allocation().isPresent()) {
throw new IllegalArgumentException("Could not set " + node.hostname() + " active. It has no allocation.");
}
try (Mutex lock = lock(node)) {
- return zkClient.writeTo(toState, node);
+ return zkClient.writeTo(toState, node, reason);
}
}
@@ -444,7 +445,7 @@ public class NodeRepository extends AbstractComponent {
*
* @return the written node for convenience
*/
- public Node write(Node node) { return zkClient.writeTo(node.state(), node); }
+ public Node write(Node node) { return zkClient.writeTo(node.state(), node, Optional.empty()); }
/**
* Writes these nodes after they have changed some internal state but NOT changed their state field.
@@ -461,7 +462,7 @@ public class NodeRepository extends AbstractComponent {
if ( node.state() != state)
throw new IllegalArgumentException("Multiple states: " + node.state() + " and " + state);
}
- return zkClient.writeTo(state, nodes);
+ return zkClient.writeTo(state, nodes, Optional.empty());
}
/**
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DirtyExpirer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DirtyExpirer.java
index 4d106098ab5..2178adf108d 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DirtyExpirer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DirtyExpirer.java
@@ -33,7 +33,7 @@ public class DirtyExpirer extends Expirer {
@Override
protected void expire(List<Node> expired) {
for (Node expiredNode : expired.stream().collect(Collectors.toList()))
- nodeRepository.fail(expiredNode.hostname());
+ nodeRepository.fail(expiredNode.hostname(), "Node is stuck in dirty");
}
}
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
index 5714faeda9a..b629e7f1517 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
@@ -75,16 +75,16 @@ public class NodeFailer extends Maintainer {
for (Node node : readyNodesWhichAreDead( )) {
// Docker hosts and nodes do not run Vespa services
if (node.flavor().getType() == Flavor.Type.DOCKER_CONTAINER || node.type() == NodeType.host) continue;
- nodeRepository().fail(node.hostname());
+ nodeRepository().fail(node.hostname(), "Not receiving config requests from node");
}
for (Node node : readyNodesWithHardwareFailure())
- nodeRepository().fail(node.hostname());
+ nodeRepository().fail(node.hostname(), "Node has hardware failure");
// Active nodes
for (Node node : determineActiveNodeDownStatus()) {
Instant graceTimeEnd = node.history().event(History.Event.Type.down).get().at().plus(downTimeLimit);
if (graceTimeEnd.isBefore(clock.instant()) && ! applicationSuspended(node) && failAllowedFor(node.type()))
- failActive(node);
+ failActive(node, "Node is down");
}
}
@@ -215,7 +215,7 @@ public class NodeFailer extends Maintainer {
*
* @return whether node was successfully failed
*/
- private boolean failActive(Node node) {
+ private boolean failActive(Node node, String reason) {
Optional<Deployment> deployment =
deployer.deployFromLocalActive(node.allocation().get().owner(), Duration.ofMinutes(30));
if ( ! deployment.isPresent()) return false; // this will be done at another config server
@@ -226,14 +226,14 @@ public class NodeFailer extends Maintainer {
boolean allTenantNodesFailedOutSuccessfully = true;
for (Node failingTenantNode : nodeRepository().getChildNodes(node.hostname())) {
if (failingTenantNode.state() == Node.State.active) {
- allTenantNodesFailedOutSuccessfully &= failActive(failingTenantNode);
+ allTenantNodesFailedOutSuccessfully &= failActive(failingTenantNode, reason);
} else {
- nodeRepository().fail(failingTenantNode.hostname());
+ nodeRepository().fail(failingTenantNode.hostname(), reason);
}
}
if (! allTenantNodesFailedOutSuccessfully) return false;
- node = nodeRepository().fail(node.hostname());
+ node = nodeRepository().fail(node.hostname(), reason);
try {
deployment.get().prepare();
deployment.get().activate();
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/CuratorDatabaseClient.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/CuratorDatabaseClient.java
index 31ad57a4a77..759bb89de94 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/CuratorDatabaseClient.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/CuratorDatabaseClient.java
@@ -123,15 +123,15 @@ public class CuratorDatabaseClient {
* @param nodes the list of nodes to write
* @return the nodes in their persisted state
*/
- public List<Node> writeTo(Node.State toState, List<Node> nodes) {
+ public List<Node> writeTo(Node.State toState, List<Node> nodes, Optional<String> reason) {
try (NestedTransaction nestedTransaction = new NestedTransaction()) {
- List<Node> writtenNodes = writeTo(toState, nodes, nestedTransaction);
+ List<Node> writtenNodes = writeTo(toState, nodes, reason, nestedTransaction);
nestedTransaction.commit();
return writtenNodes;
}
}
- public Node writeTo(Node.State toState, Node node) {
- return writeTo(toState, Collections.singletonList(node)).get(0);
+ public Node writeTo(Node.State toState, Node node, Optional<String> reason) {
+ return writeTo(toState, Collections.singletonList(node), reason).get(0);
}
/**
@@ -140,10 +140,12 @@ public class CuratorDatabaseClient {
*
* @param toState the state to write the nodes to
* @param nodes the list of nodes to write
+ * @param reason an optional reason to be logged, for humans
* @param transaction the transaction to which write operations are added by this
* @return the nodes in their state as it will be written if committed
*/
- public List<Node> writeTo(Node.State toState, List<Node> nodes, NestedTransaction transaction) {
+ public List<Node> writeTo(Node.State toState, List<Node> nodes,
+ Optional<String> reason, NestedTransaction transaction) {
if (nodes.isEmpty()) return nodes;
List<Node> writtenNodes = new ArrayList<>(nodes.size());
@@ -165,7 +167,7 @@ public class CuratorDatabaseClient {
transaction.onCommitted(() -> { // schedule logging on commit of nodes which changed state
for (Node node : nodes) {
if (toState != node.state())
- log.log(LogLevel.INFO, "Moved to " + toState + ": " + node);
+ log.log(LogLevel.INFO, "Moved to " + toState + ": " + node + (reason.isPresent() ? ": " + reason.get() : ""));
}
});
return writtenNodes;
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/v2/NodesApiHandler.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/v2/NodesApiHandler.java
index 707db02e6e5..0370ac4327a 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/v2/NodesApiHandler.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/v2/NodesApiHandler.java
@@ -100,7 +100,7 @@ public class NodesApiHandler extends LoggingRequestHandler {
return new MessageResponse("Moved " + lastElement(path) + " to ready");
}
else if (path.startsWith("/nodes/v2/state/failed/")) {
- List<Node> failedNodes = nodeRepository.failRecursively(lastElement(path));
+ List<Node> failedNodes = nodeRepository.failRecursively(lastElement(path), "Failing for unit testing");
String failedHostnames = failedNodes.stream().map(Node::hostname).sorted().collect(Collectors.joining(", "));
return new MessageResponse("Moved " + failedHostnames + " to failed");
}
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockNodeRepository.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockNodeRepository.java
index 1d3eeca32f8..34507df749f 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockNodeRepository.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockNodeRepository.java
@@ -88,7 +88,7 @@ public class MockNodeRepository extends NodeRepository {
nodes.remove(7);
nodes = setDirty(nodes);
setReady(nodes);
- fail("host5.yahoo.com");
+ fail("host5.yahoo.com", "Failing to unit test");
setDirty("host55.yahoo.com");
ApplicationId app1 = ApplicationId.from(TenantName.from("tenant1"), ApplicationName.from("application1"), InstanceName.from("instance1"));
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/ApplicationMaintainerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/ApplicationMaintainerTest.java
index 549ea90a4e2..c976c7f1ee2 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/ApplicationMaintainerTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/ApplicationMaintainerTest.java
@@ -67,8 +67,8 @@ public class ApplicationMaintainerTest {
fixture.activate();
// Fail and park some nodes
- nodeRepository.fail(nodeRepository.getNodes(fixture.app1).get(3).hostname());
- nodeRepository.fail(nodeRepository.getNodes(fixture.app2).get(0).hostname());
+ nodeRepository.fail(nodeRepository.getNodes(fixture.app1).get(3).hostname(), "Failing to unit test");
+ nodeRepository.fail(nodeRepository.getNodes(fixture.app2).get(0).hostname(), "Failing to unit test");
nodeRepository.park(nodeRepository.getNodes(fixture.app2).get(4).hostname());
int failedInApp1 = 1;
int failedOrParkedInApp2 = 2;
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirerTest.java
index 02833d3e229..e73ca518beb 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirerTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirerTest.java
@@ -121,9 +121,9 @@ public class FailedExpirerTest {
assertEquals(3, nodeRepository.getNodes(NodeType.tenant, Node.State.active).size());
// Fail the nodes
- nodeRepository.fail("node1");
- nodeRepository.fail("node2");
- nodeRepository.fail("node3");
+ nodeRepository.fail("node1", "Failing to unit test");
+ nodeRepository.fail("node2", "Failing to unit test");
+ nodeRepository.fail("node3", "Failing to unit test");
assertEquals(3, nodeRepository.getNodes(NodeType.tenant, Node.State.failed).size());
// Failure times out
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/NodeTypeProvisioningTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/NodeTypeProvisioningTest.java
index fd6d5e8e455..2b344930e07 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/NodeTypeProvisioningTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/NodeTypeProvisioningTest.java
@@ -70,9 +70,9 @@ public class NodeTypeProvisioningTest {
{ // Remove 3 proxies then redeploy
List<Node> nodes = tester.nodeRepository().getNodes(NodeType.proxy, Node.State.active);
- tester.nodeRepository().fail(nodes.get(0).hostname());
- tester.nodeRepository().fail(nodes.get(1).hostname());
- tester.nodeRepository().fail(nodes.get(5).hostname());
+ tester.nodeRepository().fail(nodes.get(0).hostname(), "Failing to unit test");
+ tester.nodeRepository().fail(nodes.get(1).hostname(), "Failing to unit test");
+ tester.nodeRepository().fail(nodes.get(5).hostname(), "Failing to unit test");
List<HostSpec> hosts = deployProxies(application, tester);
assertEquals(10, hosts.size());
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/ProvisioningTester.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/ProvisioningTester.java
index 26b1b45b9c6..0b6e1c872bd 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/ProvisioningTester.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/ProvisioningTester.java
@@ -173,7 +173,7 @@ public class ProvisioningTester implements AutoCloseable {
public void fail(HostSpec host) {
int beforeFailCount = nodeRepository.getNode(host.hostname(), Node.State.active).get().status().failCount();
- Node failedNode = nodeRepository.fail(host.hostname());
+ Node failedNode = nodeRepository.fail(host.hostname(), "Failing to unit test");
assertTrue(nodeRepository.getNodes(NodeType.tenant, Node.State.failed).contains(failedNode));
assertEquals(beforeFailCount + 1, failedNode.status().failCount());
}