summaryrefslogtreecommitdiffstats
path: root/node-repository
diff options
context:
space:
mode:
authorJon Bratseth <bratseth@yahoo-inc.com>2016-09-02 10:04:06 +0200
committerJon Bratseth <bratseth@yahoo-inc.com>2016-09-02 10:04:06 +0200
commite73f34f1fa1c6a778520c3f0a8ae70b1698bca5a (patch)
treea85707602338c79ec7d9d1791ebd4fe177ee0ad2 /node-repository
parent68912b05d894967282e6c063afc2c83435cab86e (diff)
Persist node request times
Diffstat (limited to 'node-repository')
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/Node.java2
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java65
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/History.java23
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/CuratorDatabaseClient.java4
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/NodeSerializer.java2
-rw-r--r--node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java28
6 files changed, 77 insertions, 47 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/Node.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/Node.java
index 0456f6071c6..f005aef55b0 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/Node.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/Node.java
@@ -197,7 +197,7 @@ public final class Node {
}
/** Returns a copy of this node with the given history. */
- private Node setHistory(History history) {
+ public Node setHistory(History history) {
return new Node(openStackId, hostname, parentHostname, configuration, status, state, allocation, history, type);
}
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
index 702d86419b7..8866698ca3d 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
@@ -48,6 +48,9 @@ public class NodeFailer extends Maintainer {
private final Duration downTimeLimit;
private final Clock clock;
private final Orchestrator orchestrator;
+
+ private final Duration nodeRequestInterval = Duration.ofMinutes(10);
+ private final Instant constructionTime;
public NodeFailer(Deployer deployer, HostLivenessTracker hostLivenessTracker,
ServiceMonitor serviceMonitor, NodeRepository nodeRepository,
@@ -60,6 +63,7 @@ public class NodeFailer extends Maintainer {
this.downTimeLimit = downTimeLimit;
this.clock = clock;
this.orchestrator = orchestrator;
+ constructionTime = clock.instant();
}
private static Duration min(Duration d1, Duration d2) {
@@ -69,10 +73,11 @@ public class NodeFailer extends Maintainer {
@Override
protected void maintain() {
// Ready nodes
- for (Node node : readyNodesWithHardwareFailure())
- nodeRepository().fail(node.hostname());
+ updateNodeLivenessEventsForReadyNodes();
for (Node node : readyNodesWhichAreDead())
nodeRepository().fail(node.hostname());
+ for (Node node : readyNodesWithHardwareFailure())
+ nodeRepository().fail(node.hostname());
// Active nodes
for (Node node : determineActiveNodeDownStatus()) {
@@ -80,26 +85,38 @@ public class NodeFailer extends Maintainer {
if (graceTimeEnd.isBefore(clock.instant()) && ! applicationSuspended(node))
failActive(node);
}
-
- }
-
- private List<Node> readyNodesWithHardwareFailure() {
- return nodeRepository().getNodes(Node.Type.tenant, Node.State.ready).stream()
- .filter(node -> node.status().hardwareFailure().isPresent())
- .collect(Collectors.toList());
}
- private List<Node> readyNodesWhichAreDead() {
+ private void updateNodeLivenessEventsForReadyNodes() {
+ // Update node last request events through ZooKeeper to collect the requests made to all config servers.
+ // We do this here ("lazily") to avoid writing to zk for each config request.
+ try (Mutex lock = nodeRepository().lockUnallocated()) {
+ for (Node node : nodeRepository().getNodes(Node.Type.tenant, Node.State.ready)) {
+ Optional<Instant> lastLocalRequest = hostLivenessTracker.lastRequestFrom(node.hostname());
+ if ( ! lastLocalRequest.isPresent()) continue;
+
+ Optional<History.Event> recordedRequest = node.history().event(History.Event.Type.requested);
+ if ( ! recordedRequest.isPresent() || recordedRequest.get().at().isBefore(lastLocalRequest.get())) {
+ History updatedHistory = node.history().record(new History.Event(History.Event.Type.requested,
+ lastLocalRequest.get()));
+ nodeRepository().write(node.setHistory(updatedHistory));
+ }
+ }
+ }
+ }
+
+ private List<Node> readyNodesWhichAreDead() {
+ // Allow requests some time to be registered in case all config servers have been down
+ if (constructionTime.isAfter(clock.instant().minus(nodeRequestInterval).minus(nodeRequestInterval) ))
+ return Collections.emptyList();
+
// Nodes are taken as dead if they have not made a config request since this instant.
// Add 10 minutes to the down time limit to allow nodes to make a request that infrequently.
- Instant oldestAcceptableRequestTime = clock.instant().minus(downTimeLimit).minus(Duration.ofMinutes(10));
+ Instant oldestAcceptableRequestTime = clock.instant().minus(downTimeLimit).minus(nodeRequestInterval);
- if ( ! hostLivenessTracker.remembersRequestsSince().isBefore(oldestAcceptableRequestTime))
- return Collections.emptyList(); // we haven't tracked long enough to tell if nodes are dead
-
return nodeRepository().getNodes(Node.Type.tenant, Node.State.ready).stream()
.filter(node -> wasMadeReadyBefore(oldestAcceptableRequestTime, node))
- .filter(node -> ! hasRecordedResponseAfter(oldestAcceptableRequestTime, node))
+ .filter(node -> ! hasRecordedRequestAfter(oldestAcceptableRequestTime, node))
.collect(Collectors.toList());
}
@@ -108,13 +125,19 @@ public class NodeFailer extends Maintainer {
if ( ! readiedEvent.isPresent()) return false;
return readiedEvent.get().at().isBefore(instant);
}
-
- private boolean hasRecordedResponseAfter(Instant instant, Node node) {
- Optional<Instant> lastResponse = hostLivenessTracker.lastRequestFrom(node.hostname());
- if ( ! lastResponse.isPresent()) return false;
- return lastResponse.get().isAfter(instant);
+
+ private boolean hasRecordedRequestAfter(Instant instant, Node node) {
+ Optional<History.Event> lastRequest = node.history().event(History.Event.Type.requested);
+ if ( ! lastRequest.isPresent()) return false;
+ return lastRequest.get().at().isAfter(instant);
}
-
+
+ private List<Node> readyNodesWithHardwareFailure() {
+ return nodeRepository().getNodes(Node.Type.tenant, Node.State.ready).stream()
+ .filter(node -> node.status().hardwareFailure().isPresent())
+ .collect(Collectors.toList());
+ }
+
private boolean applicationSuspended(Node node) {
try {
return orchestrator.getApplicationInstanceStatus(node.allocation().get().owner())
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/History.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/History.java
index 42134d082b7..488a3e6bc77 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/History.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/History.java
@@ -79,6 +79,16 @@ public class History {
/** Returns the empty history */
public static History empty() { return new History(Collections.emptyList()); }
+ @Override
+ public String toString() {
+ if (events.isEmpty()) return "history: (empty)";
+ StringBuilder b = new StringBuilder("history: ");
+ for (Event e : events.values())
+ b.append(e).append(", ");
+ b.setLength(b.length() -2); // remove last comma
+ return b.toString();
+ }
+
/** An event which may happen to a node */
public static class Event {
@@ -96,10 +106,19 @@ public class History {
/** Returns the instant this even took place */
public Instant at() { return at; }
- public enum Type { readied, reserved, activated, retired, deactivated, failed, deallocated, down }
+ public enum Type {
+ // State move events
+ readied, reserved, activated, deactivated, failed, deallocated,
+ // An active node was retired
+ retired,
+ // An active node went down according to the service monitor
+ down,
+ // A node made a config request, indicating it is live
+ requested
+ }
@Override
- public String toString() { return type + " event at " + at; }
+ public String toString() { return "'" + type + "' event at " + at; }
}
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/CuratorDatabaseClient.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/CuratorDatabaseClient.java
index bf442d4f164..0f0f8454a6f 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/CuratorDatabaseClient.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/CuratorDatabaseClient.java
@@ -170,8 +170,8 @@ public class CuratorDatabaseClient {
private History newNodeHistory(Node node, Node.State toState) {
History history = node.history();
- // wipe history to avoid expiring based on events under the previous allocation
- if (toState == Node.State.ready)
+ // wipe history when a node *becomes* ready to avoid expiring based on events under the previous allocation
+ if (node.state() != Node.State.ready && toState == Node.State.ready)
history = History.empty();
return history.recordStateTransition(node.state(), toState, clock.instant());
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/NodeSerializer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/NodeSerializer.java
index 84012dd63b7..73d6174c2df 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/NodeSerializer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/NodeSerializer.java
@@ -242,6 +242,7 @@ public class NodeSerializer {
case "failed" : return History.Event.Type.failed;
case "deallocated" : return History.Event.Type.deallocated;
case "down" : return History.Event.Type.down;
+ case "requested" : return History.Event.Type.requested;
}
throw new IllegalArgumentException("Unknown node event type '" + eventTypeString + "'");
}
@@ -255,6 +256,7 @@ public class NodeSerializer {
case failed : return "failed";
case deallocated : return "deallocated";
case down : return "down";
+ case requested: return "requested";
}
throw new IllegalArgumentException("Serialized form of '" + nodeEventType + "' not defined");
}
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
index d1f37dccd16..d613b21052a 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
@@ -119,7 +119,11 @@ public class NodeFailerTest {
serviceMonitor = new ServiceMonitorStub(apps, nodeRepository);
orchestrator = new OrchestratorMock();
- failer = new NodeFailer(deployer, hostLivenessTracker, serviceMonitor, nodeRepository, DOWNTIME_LIMIT_ONE_HOUR, clock, orchestrator);
+ failer = createFailer();
+ }
+
+ private NodeFailer createFailer() {
+ return new NodeFailer(deployer, hostLivenessTracker, serviceMonitor, nodeRepository, DOWNTIME_LIMIT_ONE_HOUR, clock, orchestrator);
}
@Test
@@ -199,7 +203,7 @@ public class NodeFailerTest {
allNodesMakeAConfigRequestExcept();
// the system goes down and do not have updated information when coming back
clock.advance(Duration.ofMinutes(120));
- hostLivenessTracker.setConstructedNow();
+ failer = createFailer();
serviceMonitor.setStatusIsKnown(false);
failer.run();
// due to this, nothing is failed
@@ -264,15 +268,7 @@ public class NodeFailerTest {
assertEquals( 2, nodeRepository.getNodes(Node.Type.tenant, Node.State.ready).size());
assertEquals( 2, nodeRepository.getNodes(Node.Type.tenant, Node.State.failed).size());
- // Another ready node die but we restart so we don't have enough information
- clock.advance(Duration.ofMinutes(180));
- hostLivenessTracker.setConstructedNow();
- allNodesMakeAConfigRequestExcept(ready.get(0), ready.get(2), ready.get(3));
- failer.run();
- assertEquals( 2, nodeRepository.getNodes(Node.Type.tenant, Node.State.ready).size());
- assertEquals( 2, nodeRepository.getNodes(Node.Type.tenant, Node.State.failed).size());
-
- // Now we get enough information
+ // Another ready node dies
clock.advance(Duration.ofMinutes(180));
allNodesMakeAConfigRequestExcept(ready.get(0), ready.get(2), ready.get(3));
failer.run();
@@ -331,22 +327,12 @@ public class NodeFailerTest {
private static class TestHostLivenessTracker implements HostLivenessTracker {
private final Clock clock;
- private Instant constructionTime;
private final Map<String, Instant> lastRequestFromHost = new HashMap<>();
public TestHostLivenessTracker(Clock clock) {
this.clock = clock;
- this.constructionTime = clock.instant();
}
- public void setConstructedNow() {
- constructionTime = clock.instant();
- lastRequestFromHost.clear();
- }
-
- @Override
- public Instant remembersRequestsSince() { return constructionTime; }
-
@Override
public void receivedRequestFrom(String hostname) {
lastRequestFromHost.put(hostname, clock.instant());