diff options
author | Jon Marius Venstad <jonmv@users.noreply.github.com> | 2021-12-20 15:52:27 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2021-12-20 15:52:27 +0100 |
commit | 8d2ffbf6d8b82589d2530ba6efec52a95d518bbf (patch) | |
tree | 270af539e171ef4a5ae8509fb8cfa2eeca8a59d9 | |
parent | ac8c1d6b5b1571674d5377a91c0503fc381d67b8 (diff) | |
parent | 7c79c44034636e4b39411682715714971984bddf (diff) |
Merge pull request #20596 from vespa-engine/vekterli/dont-trigger-random-policy-send-for-expected-transient-errors
Don't trigger implicit ContentPolicy random send fallback on expected transient errors [run-systemtest]
5 files changed, 175 insertions, 27 deletions
diff --git a/documentapi/abi-spec.json b/documentapi/abi-spec.json index 88ec090d324..2e68d4803cb 100644 --- a/documentapi/abi-spec.json +++ b/documentapi/abi-spec.json @@ -1565,6 +1565,20 @@ "protected final java.util.Random randomizer" ] }, + "com.yahoo.documentapi.messagebus.protocol.ContentPolicy$InstabilityChecker": { + "superClass": "java.lang.Object", + "interfaces": [], + "attributes": [ + "public", + "interface", + "abstract" + ], + "methods": [ + "public abstract boolean tooManyFailures(int)", + "public abstract void addFailure(java.lang.Integer)" + ], + "fields": [] + }, "com.yahoo.documentapi.messagebus.protocol.ContentPolicy$Parameters": { "superClass": "java.lang.Object", "interfaces": [], @@ -1576,7 +1590,8 @@ "public java.lang.String getClusterName()", "public com.yahoo.documentapi.messagebus.protocol.ContentPolicy$SlobrokHostPatternGenerator createPatternGenerator()", "public com.yahoo.documentapi.messagebus.protocol.ContentPolicy$HostFetcher createHostFetcher(com.yahoo.documentapi.messagebus.protocol.SlobrokPolicy, int)", - "public com.yahoo.vdslib.distribution.Distribution createDistribution(com.yahoo.documentapi.messagebus.protocol.SlobrokPolicy)" + "public com.yahoo.vdslib.distribution.Distribution createDistribution(com.yahoo.documentapi.messagebus.protocol.SlobrokPolicy)", + "public com.yahoo.documentapi.messagebus.protocol.ContentPolicy$InstabilityChecker createInstabilityChecker()" ], "fields": [ "protected final java.lang.String clusterName", @@ -1585,6 +1600,21 @@ "protected final com.yahoo.documentapi.messagebus.protocol.ContentPolicy$SlobrokHostPatternGenerator slobrokHostPatternGenerator" ] }, + "com.yahoo.documentapi.messagebus.protocol.ContentPolicy$PerNodeCountingInstabilityChecker": { + "superClass": "java.lang.Object", + "interfaces": [ + "com.yahoo.documentapi.messagebus.protocol.ContentPolicy$InstabilityChecker" + ], + "attributes": [ + "public" + ], + "methods": [ + "public void <init>(int)", + "public boolean tooManyFailures(int)", + "public void addFailure(java.lang.Integer)" + ], + "fields": [] + }, "com.yahoo.documentapi.messagebus.protocol.ContentPolicy$SlobrokHostFetcher": { "superClass": "com.yahoo.documentapi.messagebus.protocol.ContentPolicy$HostFetcher", "interfaces": [], diff --git a/documentapi/src/main/java/com/yahoo/documentapi/messagebus/protocol/ContentPolicy.java b/documentapi/src/main/java/com/yahoo/documentapi/messagebus/protocol/ContentPolicy.java index 5eaec70ca59..21e621883fe 100644 --- a/documentapi/src/main/java/com/yahoo/documentapi/messagebus/protocol/ContentPolicy.java +++ b/documentapi/src/main/java/com/yahoo/documentapi/messagebus/protocol/ContentPolicy.java @@ -232,6 +232,47 @@ public class ContentPolicy extends SlobrokPolicy { } + /** + * Tracks "instability" across nodes based on number of failures received versus some + * implementation-specific limit. + * + * Implementations must be thread-safe. + * + * TODO should ideally be protected, but there's a package mismatch between policy classes and its tests + */ + public interface InstabilityChecker { + boolean tooManyFailures(int nodeIndex); + void addFailure(Integer calculatedDistributor); + } + + /** Class that tracks a failure of a given type per node. */ + public static class PerNodeCountingInstabilityChecker implements InstabilityChecker { + private final List<Integer> nodeFailures = new CopyOnWriteArrayList<>(); + private final int failureLimit; + + public PerNodeCountingInstabilityChecker(int failureLimit) { + this.failureLimit = failureLimit; + } + + @Override + public boolean tooManyFailures(int nodeIndex) { + if (nodeFailures.size() > nodeIndex && nodeFailures.get(nodeIndex) > failureLimit) { + nodeFailures.set(nodeIndex, 0); + return true; + } else { + return false; + } + } + + @Override + public void addFailure(Integer calculatedDistributor) { + while (nodeFailures.size() <= calculatedDistributor) { + nodeFailures.add(0); + } + nodeFailures.set(calculatedDistributor, nodeFailures.get(calculatedDistributor) + 1); + } + } + /** Class parsing the semicolon separated parameter string and exposes the appropriate value to the policy. */ public static class Parameters { @@ -271,6 +312,9 @@ public class ContentPolicy extends SlobrokPolicy { return distributionConfig == null ? new Distribution(getDistributionConfigId()) : new Distribution(distributionConfig.cluster(clusterName)); } + public InstabilityChecker createInstabilityChecker() { + return new PerNodeCountingInstabilityChecker(getAttemptRandomOnFailuresLimit()); + } /** * When we have gotten this amount of failures from a node (Any kind of failures). We try to send to a random other node, just to see if the @@ -324,27 +368,6 @@ public class ContentPolicy extends SlobrokPolicy { /** Class handling the logic of picking a distributor */ public static class DistributorSelectionLogic { - /** Class that tracks a failure of a given type per node. */ - static class InstabilityChecker { - private final List<Integer> nodeFailures = new CopyOnWriteArrayList<>(); - private final int failureLimit; - - InstabilityChecker(int failureLimit) { this.failureLimit = failureLimit; } - - boolean tooManyFailures(int nodeIndex) { - if (nodeFailures.size() > nodeIndex && nodeFailures.get(nodeIndex) > failureLimit) { - nodeFailures.set(nodeIndex, 0); - return true; - } else { - return false; - } - } - - void addFailure(Integer calculatedDistributor) { - while (nodeFailures.size() <= calculatedDistributor) nodeFailures.add(0); - nodeFailures.set(calculatedDistributor, nodeFailures.get(calculatedDistributor) + 1); - } - } /** Message context class. Contains data we want to inspect about a request at reply time. */ private static class MessageContext { final Integer calculatedDistributor; @@ -375,7 +398,7 @@ public class ContentPolicy extends SlobrokPolicy { try { hostFetcher = params.createHostFetcher(policy, params.getRequiredUpPercentageToSendToKnownGoodNodes()); distribution = params.createDistribution(policy); - persistentFailureChecker = new InstabilityChecker(params.getAttemptRandomOnFailuresLimit()); + persistentFailureChecker = params.createInstabilityChecker(); maxOldClusterVersionBeforeSendingRandom = params.maxOldClusterStatesSeenBeforeThrowingCachedState(); } catch (Throwable e) { destroy(); @@ -556,10 +579,37 @@ public class ContentPolicy extends SlobrokPolicy { } } + /** + * Returns whether a given error Reply should be counted towards potentially ignoring the cached + * cluster state and triggering a random send (and thus likely WrongDistributionReply with the + * current cluster state). Certain error codes may be used frequently by the content layer for + * purposes that do _not_ indicate that a change in cluster state may have happened, and should + * therefore not be counted for this purpose: + * - ERROR_TEST_AND_SET_CONDITION_FAILED: may happen for any mutating operation that has an + * associated TaS condition. Technically an APP_FATAL_ERROR since resending doesn't make sense. + * - ERROR_BUSY: may happen for concurrent mutations and if distributors are in the process of + * changing bucket ownership and the grace period hasn't passed yet. + */ + private static boolean shouldCountAsErrorForRandomSendTrigger(Reply reply) { + if (reply.getNumErrors() != 1) { + return !reply.hasErrors(); // For simplicity, count any reply with > 1 error. + } + var error = reply.getError(0); + switch (error.getCode()) { + // TODO this feels like a layering violation, but we use DocumentProtocol directly in other places in this policy anyway... + case DocumentProtocol.ERROR_TEST_AND_SET_CONDITION_FAILED: + case DocumentProtocol.ERROR_BUSY: + return false; + default: return true; + } + } + void handleErrorReply(Reply reply, Object untypedContext) { MessageContext messageContext = (MessageContext) untypedContext; if (messageContext.calculatedDistributor != null) { - persistentFailureChecker.addFailure(messageContext.calculatedDistributor); + if (shouldCountAsErrorForRandomSendTrigger(reply)) { + persistentFailureChecker.addFailure(messageContext.calculatedDistributor); + } if (reply.getTrace().shouldTrace(1)) { reply.getTrace().trace(1, "Failed with " + messageContext.toString()); } diff --git a/documentapi/src/test/java/com/yahoo/documentapi/messagebus/protocol/test/storagepolicy/ContentPolicyTest.java b/documentapi/src/test/java/com/yahoo/documentapi/messagebus/protocol/test/storagepolicy/ContentPolicyTest.java index 42625cf193d..c160f67aa9b 100644 --- a/documentapi/src/test/java/com/yahoo/documentapi/messagebus/protocol/test/storagepolicy/ContentPolicyTest.java +++ b/documentapi/src/test/java/com/yahoo/documentapi/messagebus/protocol/test/storagepolicy/ContentPolicyTest.java @@ -1,9 +1,13 @@ // Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.documentapi.messagebus.protocol.test.storagepolicy; +import com.yahoo.documentapi.messagebus.protocol.DocumentProtocol; +import com.yahoo.messagebus.routing.RoutingNode; import org.junit.Ignore; import org.junit.Test; +import static org.junit.Assert.assertEquals; + public class ContentPolicyTest extends Simulator { /** @@ -129,6 +133,46 @@ public class ContentPolicyTest extends Simulator { // Note that we use extra requests here as with only 200 requests there was a pretty good chance of not going to any down node on random anyhow. } + private void setUpSingleNodeFixturesWithInitializedPolicy() { + setClusterNodes(new int[]{ 0 }); + // Seed policy with initial, correct cluster state. + { + RoutingNode target = select(); + replyWrongDistribution(target, "foo", null, "version:1234 distributor:1 storage:1"); + } + } + + @Test + public void transient_errors_expected_during_normal_feed_are_not_counted_as_errors_that_may_trigger_random_send() { + setUpSingleNodeFixturesWithInitializedPolicy(); + var checker = policyFactory.getLastParameters().instabilityChecker; + assertEquals(0, checker.recordedFailures); // WrongDistributionReply not counted as regular error + { + frame.setMessage(createMessage("id:ns:testdoc:n=2:foo")); + RoutingNode target = select(); + replyError(target, new com.yahoo.messagebus.Error(DocumentProtocol.ERROR_BUSY, "Get busy livin' or get busy resendin'")); + } + assertEquals(0, checker.recordedFailures); // BUSY not counted as error + { + frame.setMessage(createMessage("id:ns:testdoc:n=3:foo")); + RoutingNode target = select(); + replyError(target, new com.yahoo.messagebus.Error(DocumentProtocol.ERROR_TEST_AND_SET_CONDITION_FAILED, "oh no")); + } + assertEquals(0, checker.recordedFailures); // TaS failures not counted as error + } + + @Test + public void other_errors_during_feed_are_counted_as_errors_that_may_trigger_random_send() { + setUpSingleNodeFixturesWithInitializedPolicy(); + var checker = policyFactory.getLastParameters().instabilityChecker; + { + frame.setMessage(createMessage("id:ns:testdoc:n=1:foo")); + RoutingNode target = select(); + replyError(target, new com.yahoo.messagebus.Error(DocumentProtocol.ERROR_ABORTED, "shop's closing, go home")); + } + assertEquals(1, checker.recordedFailures); + } + // Left to test? // Cluster state down - Not overwrite last good nodes to send random to? diff --git a/documentapi/src/test/java/com/yahoo/documentapi/messagebus/protocol/test/storagepolicy/ContentPolicyTestEnvironment.java b/documentapi/src/test/java/com/yahoo/documentapi/messagebus/protocol/test/storagepolicy/ContentPolicyTestEnvironment.java index f0c54505bb1..b6893d69325 100644 --- a/documentapi/src/test/java/com/yahoo/documentapi/messagebus/protocol/test/storagepolicy/ContentPolicyTestEnvironment.java +++ b/documentapi/src/test/java/com/yahoo/documentapi/messagebus/protocol/test/storagepolicy/ContentPolicyTestEnvironment.java @@ -145,14 +145,37 @@ public abstract class ContentPolicyTestEnvironment { } } + public static class TestWrappingInstabilityChecker implements ContentPolicy.InstabilityChecker { + + public int recordedFailures = 0; + private final ContentPolicy.InstabilityChecker fwdChecker; + + TestWrappingInstabilityChecker(ContentPolicy.InstabilityChecker fwdChecker) { + this.fwdChecker = fwdChecker; + } + + @Override + public boolean tooManyFailures(int nodeIndex) { + return fwdChecker.tooManyFailures(nodeIndex); + } + + @Override + public void addFailure(Integer calculatedDistributor) { + ++recordedFailures; + fwdChecker.addFailure(calculatedDistributor); + } + } + public static class TestParameters extends ContentPolicy.Parameters { private final TestHostFetcher hostFetcher; private final Distribution distribution; + public final TestWrappingInstabilityChecker instabilityChecker; public TestParameters(String parameters, Set<Integer> nodes) { super(SlobrokPolicy.parse(parameters)); hostFetcher = new TestHostFetcher(getClusterName(), nodes); distribution = new Distribution(Distribution.getDefaultDistributionConfig(2, 10)); + instabilityChecker = new TestWrappingInstabilityChecker(new ContentPolicy.PerNodeCountingInstabilityChecker(5)); } @Override @@ -160,6 +183,9 @@ public abstract class ContentPolicyTestEnvironment { @Override public Distribution createDistribution(SlobrokPolicy policy) { return distribution; } + + @Override + public ContentPolicy.InstabilityChecker createInstabilityChecker() { return instabilityChecker; } } public static class ContentPolicyTestFactory implements RoutingPolicyFactory { @@ -182,8 +208,6 @@ public abstract class ContentPolicyTestEnvironment { } } public TestParameters getLastParameters() { return parameterInstances.getLast(); } - public void destroy() { - } } private int findPreferredAvailableNodeForTestBucket() { diff --git a/documentapi/src/test/java/com/yahoo/documentapi/messagebus/protocol/test/storagepolicy/Simulator.java b/documentapi/src/test/java/com/yahoo/documentapi/messagebus/protocol/test/storagepolicy/Simulator.java index 651a1c15c0b..d40c87e0bd6 100644 --- a/documentapi/src/test/java/com/yahoo/documentapi/messagebus/protocol/test/storagepolicy/Simulator.java +++ b/documentapi/src/test/java/com/yahoo/documentapi/messagebus/protocol/test/storagepolicy/Simulator.java @@ -188,7 +188,7 @@ public abstract class Simulator extends ContentPolicyTestEnvironment { if (badNode != null && randomizer.nextDouble() < badNode.getFailureRate()) { ++failed[half]; switch (badNode.getFailureType()) { - case TRANSIENT_ERROR: replyError(target, new com.yahoo.messagebus.Error(DocumentProtocol.ERROR_BUSY, "Transient error")); break; + case TRANSIENT_ERROR: replyError(target, new com.yahoo.messagebus.Error(DocumentProtocol.ERROR_ABORTED, "Transient error")); break; case FATAL_ERROR: replyError(target, new com.yahoo.messagebus.Error(DocumentProtocol.ERROR_UNPARSEABLE, "Fatal error")); break; case OLD_CLUSTER_STATE: case RESET_CLUSTER_STATE: |