diff options
9 files changed, 103 insertions, 25 deletions
diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ClusterStateGenerator.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ClusterStateGenerator.java index 26fdddb01e5..74f3cc276a5 100644 --- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ClusterStateGenerator.java +++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ClusterStateGenerator.java @@ -132,7 +132,7 @@ public class ClusterStateGenerator { final Map<Node, NodeStateReason> nodeStateReasons = new HashMap<>(); for (final NodeInfo nodeInfo : cluster.getNodeInfo()) { - final NodeState nodeState = computeEffectiveNodeState(nodeInfo, params); + final NodeState nodeState = computeEffectiveNodeState(nodeInfo, params, nodeStateReasons); workingState.setNodeState(nodeInfo.getNode(), nodeState); } @@ -159,7 +159,10 @@ public class ClusterStateGenerator { baseline.setDescription(wanted.getDescription()); } - private static NodeState computeEffectiveNodeState(final NodeInfo nodeInfo, final Params params) { + private static NodeState computeEffectiveNodeState(final NodeInfo nodeInfo, + final Params params, + Map<Node, NodeStateReason> nodeStateReasons) + { final NodeState reported = nodeInfo.getReportedState(); final NodeState wanted = nodeInfo.getWantedState(); final NodeState baseline = reported.clone(); @@ -171,7 +174,7 @@ public class ClusterStateGenerator { baseline.setStartTimestamp(0); } if (nodeInfo.isStorage()) { - applyStorageSpecificStateTransforms(nodeInfo, params, reported, wanted, baseline); + applyStorageSpecificStateTransforms(nodeInfo, params, reported, wanted, baseline, nodeStateReasons); } if (baseline.above(wanted)) { applyWantedStateToBaselineState(baseline, wanted); @@ -181,7 +184,8 @@ public class ClusterStateGenerator { } private static void applyStorageSpecificStateTransforms(NodeInfo nodeInfo, Params params, NodeState reported, - NodeState wanted, NodeState baseline) + NodeState wanted, NodeState baseline, + Map<Node, NodeStateReason> nodeStateReasons) { if (reported.getState() == State.INITIALIZING) { if (timedOutWithoutNewInitProgress(reported, nodeInfo, params) @@ -195,7 +199,7 @@ public class ClusterStateGenerator { } } // TODO ensure that maintenance cannot override Down for any other cases - if (withinTemporalMaintenancePeriod(nodeInfo, baseline, params) && wanted.getState() != State.DOWN) { + if (withinTemporalMaintenancePeriod(nodeInfo, baseline, nodeStateReasons, params) && wanted.getState() != State.DOWN) { baseline.setState(State.MAINTENANCE); } } @@ -244,13 +248,18 @@ public class ClusterStateGenerator { */ private static boolean withinTemporalMaintenancePeriod(final NodeInfo nodeInfo, final NodeState baseline, + Map<Node, NodeStateReason> nodeStateReasons, final Params params) { final Integer transitionTime = params.transitionTimes.get(nodeInfo.getNode().getType()); if (transitionTime == 0 || !baseline.getState().oneOf("sd")) { return false; } - return nodeInfo.getTransitionTime() + transitionTime > params.currentTimeInMillis; + if (nodeInfo.getTransitionTime() + transitionTime > params.currentTimeInMillis) { + return true; + } + nodeStateReasons.put(nodeInfo.getNode(), NodeStateReason.NODE_NOT_BACK_UP_WITHIN_GRACE_PERIOD); + return false; } private static void takeDownGroupsWithTooLowAvailability(final ClusterState workingState, diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/EventDiffCalculator.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/EventDiffCalculator.java index cadc065dd51..2f065a9ba75 100644 --- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/EventDiffCalculator.java +++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/EventDiffCalculator.java @@ -31,6 +31,7 @@ public class EventDiffCalculator { ClusterStateBundle fromState; ClusterStateBundle toState; long currentTime; + long maxMaintenanceGracePeriodTimeMs; public Params cluster(ContentCluster cluster) { this.cluster = cluster; @@ -48,6 +49,10 @@ public class EventDiffCalculator { this.currentTime = time; return this; } + public Params maxMaintenanceGracePeriodTimeMs(long timeMs) { + this.maxMaintenanceGracePeriodTimeMs = timeMs; + return this; + } } public static Params params() { return new Params(); } @@ -58,17 +63,20 @@ public class EventDiffCalculator { final AnnotatedClusterState fromState; final AnnotatedClusterState toState; final long currentTime; + final long maxMaintenanceGracePeriodTimeMs; PerStateParams(ContentCluster cluster, Optional<String> bucketSpace, AnnotatedClusterState fromState, AnnotatedClusterState toState, - long currentTime) { + long currentTime, + long maxMaintenanceGracePeriodTimeMs) { this.cluster = cluster; this.bucketSpace = bucketSpace; this.fromState = fromState; this.toState = toState; this.currentTime = currentTime; + this.maxMaintenanceGracePeriodTimeMs = maxMaintenanceGracePeriodTimeMs; } } @@ -86,7 +94,8 @@ public class EventDiffCalculator { Optional.empty(), params.fromState.getBaselineAnnotatedState(), params.toState.getBaselineAnnotatedState(), - params.currentTime); + params.currentTime, + params.maxMaintenanceGracePeriodTimeMs); } private static void emitWholeClusterDiffEvent(final PerStateParams params, final List<Event> events) { @@ -137,11 +146,10 @@ public class EventDiffCalculator { final NodeState nodeTo = toState.getNodeState(n); if (!nodeTo.equals(nodeFrom)) { final NodeInfo info = cluster.getNodeInfo(n); - events.add(createNodeEvent(info, String.format("Altered node state in cluster state from '%s' to '%s'", - nodeFrom.toString(true), nodeTo.toString(true)), params)); - NodeStateReason prevReason = params.fromState.getNodeStateReasons().get(n); NodeStateReason currReason = params.toState.getNodeStateReasons().get(n); + // Add specific reason events for node edge _before_ the actual transition event itself. + // This makes the timeline of events more obvious. if (isGroupDownEdge(prevReason, currReason)) { events.add(createNodeEvent(info, "Group node availability is below configured threshold", params)); } else if (isGroupUpEdge(prevReason, currReason)) { @@ -150,7 +158,12 @@ public class EventDiffCalculator { events.add(createNodeEvent(info, "Node may have merges pending", params)); } else if (isMayHaveMergesPendingDownEdge(prevReason, currReason)) { events.add(createNodeEvent(info, "Node no longer has merges pending", params)); + } else if (isMaintenanceGracePeriodExceededDownEdge(prevReason, currReason, nodeFrom, nodeTo)) { + events.add(createNodeEvent(info, String.format("Exceeded implicit maintenance mode grace period of " + + "%d milliseconds. Marking node down.", params.maxMaintenanceGracePeriodTimeMs), params)); } + events.add(createNodeEvent(info, String.format("Altered node state in cluster state from '%s' to '%s'", + nodeFrom.toString(true), nodeTo.toString(true)), params)); } } @@ -178,6 +191,14 @@ public class EventDiffCalculator { return prevReason == NodeStateReason.MAY_HAVE_MERGES_PENDING && currReason != NodeStateReason.MAY_HAVE_MERGES_PENDING; } + private static boolean isMaintenanceGracePeriodExceededDownEdge(NodeStateReason prevReason, NodeStateReason currReason, + NodeState prevState, NodeState currState) { + return (prevReason != NodeStateReason.NODE_NOT_BACK_UP_WITHIN_GRACE_PERIOD && + currReason == NodeStateReason.NODE_NOT_BACK_UP_WITHIN_GRACE_PERIOD && + prevState.getState() == State.MAINTENANCE && + currState.getState() == State.DOWN); + } + private static boolean clusterHasTransitionedToUpState(ClusterState prevState, ClusterState currentState) { return prevState.getClusterState() != State.UP && currentState.getClusterState() == State.UP; } @@ -207,7 +228,8 @@ public class EventDiffCalculator { Optional.of(bucketSpace), fromDerivedState, toDerivedState, - params.currentTime); + params.currentTime, + params.maxMaintenanceGracePeriodTimeMs); } } diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetController.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetController.java index 364184331a8..b4965fbff7b 100644 --- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetController.java +++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetController.java @@ -928,7 +928,8 @@ public class FleetController implements NodeStateOrHostInfoChangeHandler, NodeAd .cluster(cluster) .fromState(fromState) .toState(toState) - .currentTimeMs(timeNowMs)); + .currentTimeMs(timeNowMs) + .maxMaintenanceGracePeriodTimeMs(options.storageNodeMaxTransitionTimeMs())); for (Event event : deltaEvents) { eventLog.add(event, isMaster); } diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetControllerOptions.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetControllerOptions.java index f49b626d347..5e9e91e1cb6 100644 --- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetControllerOptions.java +++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetControllerOptions.java @@ -157,6 +157,10 @@ public class FleetControllerOptions implements Cloneable { this.maxDeferredTaskVersionWaitTime = maxDeferredTaskVersionWaitTime; } + public long storageNodeMaxTransitionTimeMs() { + return maxTransitionTime.getOrDefault(NodeType.STORAGE, 10_000); + } + public FleetControllerOptions clone() { try { // TODO: This should deep clone diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeStateReason.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeStateReason.java index 7a6be664ec8..3f550724cef 100644 --- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeStateReason.java +++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeStateReason.java @@ -6,8 +6,8 @@ public enum NodeStateReason { // FIXME some of these reasons may be unnecessary as they are reported implicitly by reported/wanted state changes NODE_TOO_UNSTABLE, WITHIN_MAINTENANCE_GRACE_PERIOD, + NODE_NOT_BACK_UP_WITHIN_GRACE_PERIOD, FORCED_INTO_MAINTENANCE, GROUP_IS_DOWN, MAY_HAVE_MERGES_PENDING - } diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/StateChangeHandler.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/StateChangeHandler.java index 3f38ea6c018..3c19f70d1e2 100644 --- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/StateChangeHandler.java +++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/StateChangeHandler.java @@ -305,10 +305,6 @@ public class StateChangeHandler { if (nodeStillUnavailableAfterTransitionTimeExceeded( currentTime, node, currentStateInSystem, lastReportedState)) { - eventLog.add(NodeEvent.forBaseline(node, String.format( - "%d milliseconds without contact. Marking node down.", - currentTime - node.getTransitionTime()), - NodeEvent.Type.CURRENT, currentTime), isMaster); triggeredAnyTimers = true; } diff --git a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/ClusterStateGeneratorTest.java b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/ClusterStateGeneratorTest.java index 2f91e569b89..08329c874b5 100644 --- a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/ClusterStateGeneratorTest.java +++ b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/ClusterStateGeneratorTest.java @@ -12,7 +12,9 @@ import java.util.Optional; import static com.yahoo.vespa.clustercontroller.core.matchers.HasStateReasonForNode.hasStateReasonForNode; import static com.yahoo.vespa.clustercontroller.core.ClusterFixture.storageNode; +import static com.yahoo.vespa.clustercontroller.core.ClusterFixture.distributorNode; +import static org.hamcrest.CoreMatchers.not; import static org.hamcrest.core.IsEqual.equalTo; import static org.hamcrest.core.Is.is; import static org.junit.Assert.assertThat; @@ -340,6 +342,8 @@ public class ClusterStateGeneratorTest { final AnnotatedClusterState state = ClusterStateGenerator.generatedStateFrom(params); assertThat(state.toString(), equalTo("distributor:5 storage:5 .1.s:d")); + assertThat(state.getNodeStateReasons(), + hasStateReasonForNode(storageNode(1), NodeStateReason.NODE_NOT_BACK_UP_WITHIN_GRACE_PERIOD)); } @Test @@ -355,6 +359,8 @@ public class ClusterStateGeneratorTest { final AnnotatedClusterState state = ClusterStateGenerator.generatedStateFrom(params); assertThat(state.toString(), equalTo("distributor:5 .2.s:d storage:5")); + assertThat(state.getNodeStateReasons(), + not(hasStateReasonForNode(distributorNode(1), NodeStateReason.NODE_NOT_BACK_UP_WITHIN_GRACE_PERIOD))); } @Test diff --git a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/EventDiffCalculatorTest.java b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/EventDiffCalculatorTest.java index e7c4bbfcaa8..ab8d73be99d 100644 --- a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/EventDiffCalculatorTest.java +++ b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/EventDiffCalculatorTest.java @@ -32,6 +32,7 @@ public class EventDiffCalculatorTest { Map<String, AnnotatedClusterState.Builder> derivedBefore = new HashMap<>(); Map<String, AnnotatedClusterState.Builder> derivedAfter = new HashMap<>(); long currentTimeMs = 0; + long maxMaintenanceGracePeriodTimeMs = 10_000; EventFixture(int nodeCount) { this.clusterFixture = ClusterFixture.forFlatCluster(nodeCount); @@ -65,6 +66,10 @@ public class EventDiffCalculatorTest { this.currentTimeMs = timeMs; return this; } + EventFixture maxMaintenanceGracePeriodTimeMs(long timeMs) { + this.maxMaintenanceGracePeriodTimeMs = timeMs; + return this; + } EventFixture derivedClusterStateBefore(String bucketSpace, String stateStr) { getBuilder(derivedBefore, bucketSpace).clusterState(stateStr); return this; @@ -91,7 +96,8 @@ public class EventDiffCalculatorTest { .cluster(clusterFixture.cluster()) .fromState(ClusterStateBundle.of(baselineBefore.build(), toDerivedStates(derivedBefore))) .toState(ClusterStateBundle.of(baselineAfter.build(), toDerivedStates(derivedAfter))) - .currentTimeMs(currentTimeMs)); + .currentTimeMs(currentTimeMs) + .maxMaintenanceGracePeriodTimeMs(maxMaintenanceGracePeriodTimeMs)); } private static Map<String, AnnotatedClusterState> toDerivedStates(Map<String, AnnotatedClusterState.Builder> derivedBuilders) { @@ -407,4 +413,35 @@ public class EventDiffCalculatorTest { nodeEventWithDescription("Altered node state in cluster state from 'U' to 'M'")))); } + @Test + public void storage_node_passed_maintenance_grace_period_emits_event() { + final EventFixture fixture = EventFixture.createForNodes(3) + .clusterStateBefore("distributor:3 storage:3 .0.s:m") + .clusterStateAfter("distributor:3 storage:3 .0.s:d") + .maxMaintenanceGracePeriodTimeMs(123_456) + .storageNodeReasonAfter(0, NodeStateReason.NODE_NOT_BACK_UP_WITHIN_GRACE_PERIOD); + + final List<Event> events = fixture.computeEventDiff(); + // Down edge event + event explaining why the node went down + assertThat(events.size(), equalTo(2)); + assertThat(events, hasItem(allOf( + eventForNode(storageNode(0)), + nodeEventWithDescription("Exceeded implicit maintenance mode grace period of 123456 milliseconds. Marking node down."), + nodeEventForBaseline()))); + } + + @Test + public void storage_node_maintenance_grace_period_event_only_emitted_on_maintenance_to_down_edge() { + final EventFixture fixture = EventFixture.createForNodes(3) + .clusterStateBefore("distributor:3 storage:3 .0.s:u") + .clusterStateAfter("distributor:3 storage:3 .0.s:d") + .maxMaintenanceGracePeriodTimeMs(123_456) + .storageNodeReasonAfter(0, NodeStateReason.NODE_NOT_BACK_UP_WITHIN_GRACE_PERIOD); + final List<Event> events = fixture.computeEventDiff(); + assertThat(events.size(), equalTo(1)); + assertThat(events, hasItem(allOf( + eventForNode(storageNode(0)), + nodeEventForBaseline()))); + } + } diff --git a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/StateChangeTest.java b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/StateChangeTest.java index 016cce9580e..2c8220c0dba 100644 --- a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/StateChangeTest.java +++ b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/StateChangeTest.java @@ -247,7 +247,7 @@ public class StateChangeTest extends FleetControllerTest { "Event: storage.0: Failed to get node state: D: Closed at other end\n" + "Event: storage.0: Stopped or possibly crashed after 1000 ms, which is before stable state time period. Premature crash count is now 1.\n" + "Event: storage.0: Altered node state in cluster state from 'U' to 'M: Closed at other end'\n" + - "Event: storage.0: 5001 milliseconds without contact. Marking node down.\n" + + "Event: storage.0: Exceeded implicit maintenance mode grace period of 5000 milliseconds. Marking node down.\n" + "Event: storage.0: Altered node state in cluster state from 'M: Closed at other end' to 'D: Closed at other end'\n" + "Event: storage.0: Now reporting state U, t 12345679\n" + "Event: storage.0: Altered node state in cluster state from 'D: Closed at other end' to 'U, t 12345679'\n"); @@ -326,7 +326,7 @@ public class StateChangeTest extends FleetControllerTest { "Event: storage.0: Altered node state in cluster state from 'D: Node not seen in slobrok.' to 'U'\n" + "Event: storage.0: Failed to get node state: D: controlled shutdown\n" + "Event: storage.0: Altered node state in cluster state from 'U' to 'M: controlled shutdown'\n" + - "Event: storage.0: 5001 milliseconds without contact. Marking node down.\n" + + "Event: storage.0: Exceeded implicit maintenance mode grace period of 5000 milliseconds. Marking node down.\n" + "Event: storage.0: Altered node state in cluster state from 'M: controlled shutdown' to 'D: controlled shutdown'\n" + "Event: storage.0: Now reporting state U\n" + "Event: storage.0: Altered node state in cluster state from 'D: controlled shutdown' to 'U'\n"); @@ -569,7 +569,7 @@ public class StateChangeTest extends FleetControllerTest { "Event: storage.6: Altered node state in cluster state from 'D: Node not seen in slobrok.' to 'U'\n" + "Event: storage.6: Failed to get node state: D: Connection error: Closed at other end\n" + "Event: storage.6: Altered node state in cluster state from 'U' to 'M: Connection error: Closed at other end'\n" + - "Event: storage.6: 100000 milliseconds without contact. Marking node down.\n" + + "Event: storage.6: Exceeded implicit maintenance mode grace period of 5000 milliseconds. Marking node down.\n" + "Event: storage.6: Altered node state in cluster state from 'M: Connection error: Closed at other end' to 'D: Connection error: Closed at other end'\n" + "Event: storage.6: Now reporting state I, i 0.00100 (ls)\n" + "Event: storage.6: Now reporting state I, i 0.100 (read)\n" + @@ -650,7 +650,7 @@ public class StateChangeTest extends FleetControllerTest { "Event: storage.6: Altered node state in cluster state from 'D: Node not seen in slobrok.' to 'U'\n" + "Event: storage.6: Failed to get node state: D: Connection error: Closed at other end\n" + "Event: storage.6: Altered node state in cluster state from 'U' to 'M: Connection error: Closed at other end'\n" + - "Event: storage.6: 1000000 milliseconds without contact. Marking node down.\n" + + "Event: storage.6: Exceeded implicit maintenance mode grace period of 5000 milliseconds. Marking node down.\n" + "Event: storage.6: Altered node state in cluster state from 'M: Connection error: Closed at other end' to 'D: Connection error: Closed at other end'\n" + "Event: storage.6: Now reporting state I, i 0.100 (read)\n" + "Event: storage.6: Altered node state in cluster state from 'D: Connection error: Closed at other end' to 'I, i 0.100 (read)'\n" + @@ -1213,8 +1213,11 @@ public class StateChangeTest extends FleetControllerTest { "Event: storage.2: Altered node state in cluster state from 'D: Node not seen in slobrok.' to 'U'\n" + "Event: storage.2: Failed to get node state: D: foo\n" + "Event: storage.2: Stopped or possibly crashed after 500 ms, which is before stable state time period. Premature crash count is now 1.\n" + - "Event: storage.2: Altered node state in cluster state from 'U' to 'M: foo'\n" + - "Event: storage.2: 5000 milliseconds without contact. Marking node down.\n"); + "Event: storage.2: Altered node state in cluster state from 'U' to 'M: foo'\n"); + // Note: even though max transition time has passed, events are now emitted only on cluster state + // publish edges. These are currently suppressed when the cluster state is down, as all cluster down + // states are considered similar to other cluster down states. This is not necessarily optimal, but + // if the cluster is down there are bigger problems than not having some debug events logged. } @Test |