diff options
author | Jon Bratseth <bratseth@yahoo-inc.com> | 2016-09-02 10:04:06 +0200 |
---|---|---|
committer | Jon Bratseth <bratseth@yahoo-inc.com> | 2016-09-02 10:04:06 +0200 |
commit | e73f34f1fa1c6a778520c3f0a8ae70b1698bca5a (patch) | |
tree | a85707602338c79ec7d9d1791ebd4fe177ee0ad2 /node-repository | |
parent | 68912b05d894967282e6c063afc2c83435cab86e (diff) |
Persist node request times
Diffstat (limited to 'node-repository')
6 files changed, 77 insertions, 47 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/Node.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/Node.java index 0456f6071c6..f005aef55b0 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/Node.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/Node.java @@ -197,7 +197,7 @@ public final class Node { } /** Returns a copy of this node with the given history. */ - private Node setHistory(History history) { + public Node setHistory(History history) { return new Node(openStackId, hostname, parentHostname, configuration, status, state, allocation, history, type); } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java index 702d86419b7..8866698ca3d 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java @@ -48,6 +48,9 @@ public class NodeFailer extends Maintainer { private final Duration downTimeLimit; private final Clock clock; private final Orchestrator orchestrator; + + private final Duration nodeRequestInterval = Duration.ofMinutes(10); + private final Instant constructionTime; public NodeFailer(Deployer deployer, HostLivenessTracker hostLivenessTracker, ServiceMonitor serviceMonitor, NodeRepository nodeRepository, @@ -60,6 +63,7 @@ public class NodeFailer extends Maintainer { this.downTimeLimit = downTimeLimit; this.clock = clock; this.orchestrator = orchestrator; + constructionTime = clock.instant(); } private static Duration min(Duration d1, Duration d2) { @@ -69,10 +73,11 @@ public class NodeFailer extends Maintainer { @Override protected void maintain() { // Ready nodes - for (Node node : readyNodesWithHardwareFailure()) - nodeRepository().fail(node.hostname()); + 
updateNodeLivenessEventsForReadyNodes(); for (Node node : readyNodesWhichAreDead()) nodeRepository().fail(node.hostname()); + for (Node node : readyNodesWithHardwareFailure()) + nodeRepository().fail(node.hostname()); // Active nodes for (Node node : determineActiveNodeDownStatus()) { @@ -80,26 +85,38 @@ public class NodeFailer extends Maintainer { if (graceTimeEnd.isBefore(clock.instant()) && ! applicationSuspended(node)) failActive(node); } - - } - - private List<Node> readyNodesWithHardwareFailure() { - return nodeRepository().getNodes(Node.Type.tenant, Node.State.ready).stream() - .filter(node -> node.status().hardwareFailure().isPresent()) - .collect(Collectors.toList()); } - private List<Node> readyNodesWhichAreDead() { + private void updateNodeLivenessEventsForReadyNodes() { + // Update node last request events through ZooKeeper to collect request to all config servers. + // We do this here ("lazily") to avoid writing to zk for each config request. + try (Mutex lock = nodeRepository().lockUnallocated()) { + for (Node node : nodeRepository().getNodes(Node.Type.tenant, Node.State.ready)) { + Optional<Instant> lastLocalRequest = hostLivenessTracker.lastRequestFrom(node.hostname()); + if ( ! lastLocalRequest.isPresent()) continue; + + Optional<History.Event> recordedRequest = node.history().event(History.Event.Type.requested); + if ( ! 
recordedRequest.isPresent() || recordedRequest.get().at().isBefore(lastLocalRequest.get())) { + History updatedHistory = node.history().record(new History.Event(History.Event.Type.requested, + lastLocalRequest.get())); + nodeRepository().write(node.setHistory(updatedHistory)); + } + } + } + } + + private List<Node> readyNodesWhichAreDead() { + // Allow requests some time to be registered in case all config servers have been down + if (constructionTime.isAfter(clock.instant().minus(nodeRequestInterval).minus(nodeRequestInterval) )) + return Collections.emptyList(); + // Nodes are taken as dead if they have not made a config request since this instant. // Add 10 minutes to the down time limit to allow nodes to make a request that infrequently. - Instant oldestAcceptableRequestTime = clock.instant().minus(downTimeLimit).minus(Duration.ofMinutes(10)); + Instant oldestAcceptableRequestTime = clock.instant().minus(downTimeLimit).minus(nodeRequestInterval); - if ( ! hostLivenessTracker.remembersRequestsSince().isBefore(oldestAcceptableRequestTime)) - return Collections.emptyList(); // we haven't tracked long enough to tell if nodes are dead - return nodeRepository().getNodes(Node.Type.tenant, Node.State.ready).stream() .filter(node -> wasMadeReadyBefore(oldestAcceptableRequestTime, node)) - .filter(node -> ! hasRecordedResponseAfter(oldestAcceptableRequestTime, node)) + .filter(node -> ! hasRecordedRequestAfter(oldestAcceptableRequestTime, node)) .collect(Collectors.toList()); } @@ -108,13 +125,19 @@ public class NodeFailer extends Maintainer { if ( ! readiedEvent.isPresent()) return false; return readiedEvent.get().at().isBefore(instant); } - - private boolean hasRecordedResponseAfter(Instant instant, Node node) { - Optional<Instant> lastResponse = hostLivenessTracker.lastRequestFrom(node.hostname()); - if ( ! 
lastResponse.isPresent()) return false; - return lastResponse.get().isAfter(instant); + + private boolean hasRecordedRequestAfter(Instant instant, Node node) { + Optional<History.Event> lastRequest = node.history().event(History.Event.Type.requested); + if ( ! lastRequest.isPresent()) return false; + return lastRequest.get().at().isAfter(instant); } - + + private List<Node> readyNodesWithHardwareFailure() { + return nodeRepository().getNodes(Node.Type.tenant, Node.State.ready).stream() + .filter(node -> node.status().hardwareFailure().isPresent()) + .collect(Collectors.toList()); + } + private boolean applicationSuspended(Node node) { try { return orchestrator.getApplicationInstanceStatus(node.allocation().get().owner()) diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/History.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/History.java index 42134d082b7..488a3e6bc77 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/History.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/History.java @@ -79,6 +79,16 @@ public class History { /** Returns the empty history */ public static History empty() { return new History(Collections.emptyList()); } + @Override + public String toString() { + if (events.isEmpty()) return "history: (empty)"; + StringBuilder b = new StringBuilder("history: "); + for (Event e : events.values()) + b.append(e).append(", "); + b.setLength(b.length() -2); // remove last comma + return b.toString(); + } + /** An event which may happen to a node */ public static class Event { @@ -96,10 +106,19 @@ public class History { /** Returns the instant this even took place */ public Instant at() { return at; } - public enum Type { readied, reserved, activated, retired, deactivated, failed, deallocated, down } + public enum Type { + // State move events + readied, reserved, activated, deactivated, failed, deallocated, + // An active node was 
retired + retired, + // An active node went down according to the service monitor + down, + // A node made a config request, indicating it is live + requested + } @Override - public String toString() { return type + " event at " + at; } + public String toString() { return "'" + type + "' event at " + at; } } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/CuratorDatabaseClient.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/CuratorDatabaseClient.java index bf442d4f164..0f0f8454a6f 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/CuratorDatabaseClient.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/CuratorDatabaseClient.java @@ -170,8 +170,8 @@ public class CuratorDatabaseClient { private History newNodeHistory(Node node, Node.State toState) { History history = node.history(); - // wipe history to avoid expiring based on events under the previous allocation - if (toState == Node.State.ready) + // wipe history when a node *becomes* ready to avoid expiring based on events under the previous allocation + if (node.state() != Node.State.ready && toState == Node.State.ready) history = History.empty(); return history.recordStateTransition(node.state(), toState, clock.instant()); diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/NodeSerializer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/NodeSerializer.java index 84012dd63b7..73d6174c2df 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/NodeSerializer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/NodeSerializer.java @@ -242,6 +242,7 @@ public class NodeSerializer { case "failed" : return History.Event.Type.failed; case "deallocated" : return History.Event.Type.deallocated; case "down" : return History.Event.Type.down; + case 
"requested" : return History.Event.Type.requested; } throw new IllegalArgumentException("Unknown node event type '" + eventTypeString + "'"); } @@ -255,6 +256,7 @@ public class NodeSerializer { case failed : return "failed"; case deallocated : return "deallocated"; case down : return "down"; + case requested: return "requested"; } throw new IllegalArgumentException("Serialized form of '" + nodeEventType + "' not defined"); } diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java index d1f37dccd16..d613b21052a 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java @@ -119,7 +119,11 @@ public class NodeFailerTest { serviceMonitor = new ServiceMonitorStub(apps, nodeRepository); orchestrator = new OrchestratorMock(); - failer = new NodeFailer(deployer, hostLivenessTracker, serviceMonitor, nodeRepository, DOWNTIME_LIMIT_ONE_HOUR, clock, orchestrator); + failer = createFailer(); + } + + private NodeFailer createFailer() { + return new NodeFailer(deployer, hostLivenessTracker, serviceMonitor, nodeRepository, DOWNTIME_LIMIT_ONE_HOUR, clock, orchestrator); } @Test @@ -199,7 +203,7 @@ public class NodeFailerTest { allNodesMakeAConfigRequestExcept(); // the system goes down and do not have updated information when coming back clock.advance(Duration.ofMinutes(120)); - hostLivenessTracker.setConstructedNow(); + failer = createFailer(); serviceMonitor.setStatusIsKnown(false); failer.run(); // due to this, nothing is failed @@ -264,15 +268,7 @@ public class NodeFailerTest { assertEquals( 2, nodeRepository.getNodes(Node.Type.tenant, Node.State.ready).size()); assertEquals( 2, nodeRepository.getNodes(Node.Type.tenant, Node.State.failed).size()); - // Another ready node die but we 
restart so we don't have enough information - clock.advance(Duration.ofMinutes(180)); - hostLivenessTracker.setConstructedNow(); - allNodesMakeAConfigRequestExcept(ready.get(0), ready.get(2), ready.get(3)); - failer.run(); - assertEquals( 2, nodeRepository.getNodes(Node.Type.tenant, Node.State.ready).size()); - assertEquals( 2, nodeRepository.getNodes(Node.Type.tenant, Node.State.failed).size()); - - // Now we get enough information + // Another ready node die clock.advance(Duration.ofMinutes(180)); allNodesMakeAConfigRequestExcept(ready.get(0), ready.get(2), ready.get(3)); failer.run(); @@ -331,22 +327,12 @@ public class NodeFailerTest { private static class TestHostLivenessTracker implements HostLivenessTracker { private final Clock clock; - private Instant constructionTime; private final Map<String, Instant> lastRequestFromHost = new HashMap<>(); public TestHostLivenessTracker(Clock clock) { this.clock = clock; - this.constructionTime = clock.instant(); } - public void setConstructedNow() { - constructionTime = clock.instant(); - lastRequestFromHost.clear(); - } - - @Override - public Instant remembersRequestsSince() { return constructionTime; } - @Override public void receivedRequestFrom(String hostname) { lastRequestFromHost.put(hostname, clock.instant()); |