From 9def3f7dafd38cc6fcf5fd703de69d83cd9d4ce5 Mon Sep 17 00:00:00 2001 From: Harald Musum Date: Fri, 30 Apr 2021 09:53:07 +0200 Subject: Change delay when there are transient config errors Delay next config request a random amount of time between 0 and error delay. This is to avoid thundering herd issues. Delay when getting fatal errors are unchanged. --- .../subscription/impl/JRTConfigRequester.java | 27 ++++----- .../java/com/yahoo/vespa/config/TimingValues.java | 16 +++++- .../subscription/impl/JRTConfigRequesterTest.java | 66 +++++++++------------- 3 files changed, 50 insertions(+), 59 deletions(-) diff --git a/config/src/main/java/com/yahoo/config/subscription/impl/JRTConfigRequester.java b/config/src/main/java/com/yahoo/config/subscription/impl/JRTConfigRequester.java index 6878397a403..d8c10d01b37 100644 --- a/config/src/main/java/com/yahoo/config/subscription/impl/JRTConfigRequester.java +++ b/config/src/main/java/com/yahoo/config/subscription/impl/JRTConfigRequester.java @@ -45,7 +45,6 @@ public class JRTConfigRequester implements RequestWaiter { private static final int TRACELEVEL = 6; private final TimingValues timingValues; private boolean fatalFailures = false; - private boolean transientFailures = false; private final ScheduledThreadPoolExecutor scheduler; private Instant noApplicationWarningLogged = Instant.MIN; private static final Duration delayBetweenWarnings = Duration.ofSeconds(60); @@ -128,7 +127,7 @@ public class JRTConfigRequester implements RequestWaiter { trace.trace(TRACELEVEL, "JRTConfigRequester.doHandle()"); log.log(FINEST, () -> trace.toString()); if (validResponse) { - handleOKRequest(jrtReq, sub, connection); + handleOKRequest(jrtReq, sub); } else { logWhenErrorResponse(jrtReq, connection); handleFailedRequest(jrtReq, sub, connection); @@ -165,7 +164,7 @@ public class JRTConfigRequester implements RequestWaiter { } ErrorType errorType = ErrorType.getErrorType(jrtReq.errorCode()); connectionPool.setError(connection, jrtReq.errorCode()); - long delay = calculateFailedRequestDelay(errorType, transientFailures, fatalFailures, timingValues, configured); + long delay = calculateFailedRequestDelay(errorType, fatalFailures, timingValues, configured); if (errorType == ErrorType.TRANSIENT) { handleTransientlyFailed(jrtReq, sub, delay, connection); } else { @@ -173,17 +172,18 @@ public class JRTConfigRequester implements RequestWaiter { } } - static long calculateFailedRequestDelay(ErrorType errorType, boolean transientFailures, boolean fatalFailures, - TimingValues timingValues, boolean configured) { + static long calculateFailedRequestDelay(ErrorType errorType, + boolean fatalFailures, + TimingValues timingValues, + boolean configured) { long delay = (configured ? timingValues.getConfiguredErrorDelay(): timingValues.getUnconfiguredDelay()); switch (errorType) { case TRANSIENT: - if (transientFailures) - delay = delay * 2; + delay = timingValues.getRandomTransientDelay(delay); break; case FATAL: - delay = timingValues.getFixedDelay() + (delay * (fatalFailures ? 1 : 0)); + delay = timingValues.getFixedDelay() + (fatalFailures ? delay : 0); delay = timingValues.getPlusMinusFractionRandom(delay, randomFraction); break; default: @@ -196,7 +196,7 @@ public class JRTConfigRequester implements RequestWaiter { JRTConfigSubscription sub, long delay, Connection connection) { - transientFailures = true; + fatalFailures = false; log.log(INFO, "Connection to " + connection.getAddress() + " failed or timed out, clients will keep existing config, will keep trying."); if (sub.getState() != ConfigSubscription.State.OPEN) return; @@ -229,11 +229,8 @@ public class JRTConfigRequester implements RequestWaiter { scheduleNextRequest(jrtReq, sub, delay, calculateErrorTimeout()); } - private void handleOKRequest(JRTClientConfigRequest jrtReq, - JRTConfigSubscription sub, - Connection connection) { + private void handleOKRequest(JRTClientConfigRequest jrtReq, JRTConfigSubscription sub) { fatalFailures = false; - transientFailures = false; noApplicationWarningLogged = Instant.MIN; sub.setLastCallBackOKTS(Instant.now()); log.log(FINE, () -> "OK response received in handleOkRequest: " + jrtReq); @@ -303,10 +300,6 @@ public class JRTConfigRequester implements RequestWaiter { } } - boolean getTransientFailures() { - return transientFailures; - } - boolean getFatalFailures() { return fatalFailures; } diff --git a/config/src/main/java/com/yahoo/vespa/config/TimingValues.java b/config/src/main/java/com/yahoo/vespa/config/TimingValues.java index fb7ef03e918..235928a7d0b 100644 --- a/config/src/main/java/com/yahoo/vespa/config/TimingValues.java +++ b/config/src/main/java/com/yahoo/vespa/config/TimingValues.java @@ -160,12 +160,22 @@ public class TimingValues { /** * Returns a number +/- a random component * - * @param val input + * @param value input * @param fraction for instance 0.1 for +/- 10% * @return a number */ - public long getPlusMinusFractionRandom(long val, float fraction) { - return Math.round(val - (val * fraction) + (rand.nextFloat() * 2L * val * fraction)); + public long getPlusMinusFractionRandom(long value, float fraction) { + return Math.round(value - (value * fraction) + (rand.nextFloat() * 2L * value * fraction)); + } + + /** + * Returns a number between 0 and maxValue + * + * @param maxValue max maxValue + * @return a number + */ + public long getRandomTransientDelay(long maxValue) { + return Math.round(rand.nextFloat() * maxValue); } @Override diff --git a/config/src/test/java/com/yahoo/config/subscription/impl/JRTConfigRequesterTest.java b/config/src/test/java/com/yahoo/config/subscription/impl/JRTConfigRequesterTest.java index 658404f1772..bf516bee8a9 100644 --- a/config/src/test/java/com/yahoo/config/subscription/impl/JRTConfigRequesterTest.java +++ b/config/src/test/java/com/yahoo/config/subscription/impl/JRTConfigRequesterTest.java @@ -17,6 +17,7 @@ import java.util.Arrays; import java.util.List; import java.util.Random; +import static com.yahoo.config.subscription.impl.JRTConfigRequester.calculateFailedRequestDelay; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertNotNull; @@ -37,67 +38,54 @@ public class JRTConfigRequesterTest { // transientFailures and fatalFailures are not set until after delay has been calculated, // so false is the case for the first failure - boolean transientFailures = false; boolean fatalFailures = false; boolean configured = false; // First time failure, not configured - long delay = JRTConfigRequester.calculateFailedRequestDelay(ErrorType.TRANSIENT, - transientFailures, fatalFailures, timingValues, configured); - assertEquals(timingValues.getUnconfiguredDelay(), delay); - transientFailures = true; - delay = JRTConfigRequester.calculateFailedRequestDelay(ErrorType.TRANSIENT, - transientFailures, fatalFailures, timingValues, configured); - assertEquals(timingValues.getUnconfiguredDelay() * 2, delay); - transientFailures = false; - - - delay = JRTConfigRequester.calculateFailedRequestDelay(ErrorType.FATAL, - transientFailures, fatalFailures, timingValues, configured); + long delay = calculateFailedRequestDelay(ErrorType.TRANSIENT, fatalFailures, timingValues, configured); + assertTransientDelay(timingValues.getUnconfiguredDelay(), delay); + delay = calculateFailedRequestDelay(ErrorType.TRANSIENT, fatalFailures, timingValues, configured); + assertTransientDelay(timingValues.getUnconfiguredDelay(), delay); + + + delay = calculateFailedRequestDelay(ErrorType.FATAL, fatalFailures, timingValues, configured); assertTrue("delay=" + delay, delay > (1 - JRTConfigRequester.randomFraction) * timingValues.getFixedDelay()); assertTrue("delay=" + delay,delay < (1 + JRTConfigRequester.randomFraction) * timingValues.getFixedDelay()); - assertEquals(5462, delay); + assertEquals(4481, delay); // First time failure, configured configured = true; - delay = JRTConfigRequester.calculateFailedRequestDelay(ErrorType.TRANSIENT, - transientFailures, fatalFailures, timingValues, configured); - assertEquals(timingValues.getConfiguredErrorDelay(), delay); + delay = calculateFailedRequestDelay(ErrorType.TRANSIENT, fatalFailures, timingValues, configured); + assertTransientDelay(timingValues.getConfiguredErrorDelay(), delay); - delay = JRTConfigRequester.calculateFailedRequestDelay(ErrorType.FATAL, - transientFailures, fatalFailures, timingValues, configured); + delay = calculateFailedRequestDelay(ErrorType.FATAL, fatalFailures, timingValues, configured); assertTrue(delay > (1 - JRTConfigRequester.randomFraction) * timingValues.getFixedDelay()); assertTrue(delay < (1 + JRTConfigRequester.randomFraction) * timingValues.getFixedDelay()); - assertEquals(5663, delay); + assertEquals(5275, delay); // nth time failure, not configured fatalFailures = true; configured = false; - delay = JRTConfigRequester.calculateFailedRequestDelay(ErrorType.TRANSIENT, - transientFailures, fatalFailures, timingValues, configured); - assertEquals(timingValues.getUnconfiguredDelay(), delay); - delay = JRTConfigRequester.calculateFailedRequestDelay(ErrorType.FATAL, - transientFailures, fatalFailures, timingValues, configured); + delay = calculateFailedRequestDelay(ErrorType.TRANSIENT, fatalFailures, timingValues, configured); + assertTransientDelay(timingValues.getUnconfiguredDelay(), delay); + delay = calculateFailedRequestDelay(ErrorType.FATAL, fatalFailures, timingValues, configured); final long l = timingValues.getFixedDelay() + timingValues.getUnconfiguredDelay(); assertTrue(delay > (1 - JRTConfigRequester.randomFraction) * l); assertTrue(delay < (1 + JRTConfigRequester.randomFraction) * l); - assertEquals(5377, delay); + assertEquals(6121, delay); // nth time failure, configured fatalFailures = true; - transientFailures = true; configured = true; - delay = JRTConfigRequester.calculateFailedRequestDelay(ErrorType.TRANSIENT, - transientFailures, fatalFailures, timingValues, configured); - assertEquals(timingValues.getConfiguredErrorDelay() * 2, delay); - delay = JRTConfigRequester.calculateFailedRequestDelay(ErrorType.FATAL, - transientFailures, fatalFailures, timingValues, configured); + delay = calculateFailedRequestDelay(ErrorType.TRANSIENT, fatalFailures, timingValues, configured); + assertTransientDelay(timingValues.getConfiguredErrorDelay(), delay); + delay = calculateFailedRequestDelay(ErrorType.FATAL, fatalFailures, timingValues, configured); final long l1 = timingValues.getFixedDelay() + timingValues.getConfiguredErrorDelay(); assertTrue(delay > (1 - JRTConfigRequester.randomFraction) * l1); assertTrue(delay < (1 + JRTConfigRequester.randomFraction) * l1); - assertEquals(20851L, delay); + assertEquals(20780, delay); } @Test @@ -132,7 +120,6 @@ public class JRTConfigRequesterTest { assertTrue(receivedRequest.validateParameters()); assertEquals(timingValues.getSubscribeTimeout(), receivedRequest.getTimeout()); assertFalse(requester.getFatalFailures()); - assertFalse(requester.getTransientFailures()); } @Test @@ -145,7 +132,6 @@ public class JRTConfigRequesterTest { requester.request(createSubscription(subscriber, timingValues)); waitUntilResponse(connection); assertTrue(requester.getFatalFailures()); - assertFalse(requester.getTransientFailures());; } @Test @@ -160,7 +146,6 @@ public class JRTConfigRequesterTest { requester.request(sub); waitUntilResponse(connection); assertTrue(requester.getFatalFailures()); - assertFalse(requester.getTransientFailures());; } @Test @@ -173,7 +158,6 @@ public class JRTConfigRequesterTest { requester.request(createSubscription(subscriber, timingValues)); waitUntilResponse(connection); assertFalse(requester.getFatalFailures()); - assertTrue(requester.getTransientFailures()); } @Test @@ -188,7 +172,6 @@ public class JRTConfigRequesterTest { requester.request(sub); waitUntilResponse(connection); assertFalse(requester.getFatalFailures()); - assertTrue(requester.getTransientFailures()); } @Test @@ -204,7 +187,6 @@ public class JRTConfigRequesterTest { requester.request(sub); waitUntilResponse(connection); assertTrue(requester.getFatalFailures()); - assertFalse(requester.getTransientFailures());; } @Test @@ -338,4 +320,10 @@ public class JRTConfigRequesterTest { requester2.close(); } + private void assertTransientDelay(long maxDelay, long delay) { + long minDelay = 0; + assertTrue("delay=" + delay + ", minDelay=" + minDelay + ",maxDelay=" + maxDelay, + delay >= minDelay && delay <= maxDelay); + } + } -- cgit v1.2.3 From abfa970111fe7c8dfe397ccc191948e47aa9b742 Mon Sep 17 00:00:00 2001 From: Harald Musum Date: Fri, 30 Apr 2021 10:20:02 +0200 Subject: Remove redundant checks for closed subscription --- .../yahoo/config/subscription/impl/JRTConfigRequester.java | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/config/src/main/java/com/yahoo/config/subscription/impl/JRTConfigRequester.java b/config/src/main/java/com/yahoo/config/subscription/impl/JRTConfigRequester.java index d8c10d01b37..d3562a47ea1 100644 --- a/config/src/main/java/com/yahoo/config/subscription/impl/JRTConfigRequester.java +++ b/config/src/main/java/com/yahoo/config/subscription/impl/JRTConfigRequester.java @@ -120,9 +120,10 @@ public class JRTConfigRequester implements RequestWaiter { } private void doHandle(JRTConfigSubscription sub, JRTClientConfigRequest jrtReq, Connection connection) { + if (subscriptionIsClosed(sub)) return; // Avoid error messages etc. after closing + boolean validResponse = jrtReq.validateResponse(); log.log(FINE, () -> "Request callback " + (validResponse ? "valid" : "invalid") + ". Req: " + jrtReq + "\nSpec: " + connection); - if (sub.getState() == ConfigSubscription.State.CLOSED) return; // Avoid error messages etc. after closing Trace trace = jrtReq.getResponseTrace(); trace.trace(TRACELEVEL, "JRTConfigRequester.doHandle()"); log.log(FINEST, () -> trace.toString()); @@ -199,7 +200,6 @@ public class JRTConfigRequester implements RequestWaiter { fatalFailures = false; log.log(INFO, "Connection to " + connection.getAddress() + " failed or timed out, clients will keep existing config, will keep trying."); - if (sub.getState() != ConfigSubscription.State.OPEN) return; scheduleNextRequest(jrtReq, sub, delay, calculateErrorTimeout()); } @@ -216,9 +216,7 @@ public class JRTConfigRequester implements RequestWaiter { * @param sub a config subscription * @param delay delay before sending a new request */ - private void handleFatallyFailed(JRTClientConfigRequest jrtReq, - JRTConfigSubscription sub, long delay) { - if (sub.getState() != ConfigSubscription.State.OPEN) return; + private void handleFatallyFailed(JRTClientConfigRequest jrtReq, JRTConfigSubscription sub, long delay) { fatalFailures = true; // The logging depends on whether we are configured or not. Level logLevel = sub.getConfigState().getConfig() == null ? Level.FINE : Level.INFO; @@ -242,10 +240,13 @@ public class JRTConfigRequester implements RequestWaiter { sub.setException(new ConfigurationRuntimeException("Could not put returned request on queue of subscription " + sub)); } } - if (sub.getState() != ConfigSubscription.State.OPEN) return; scheduleNextRequest(jrtReq, sub, calculateSuccessDelay(), calculateSuccessTimeout()); } + private boolean subscriptionIsClosed(JRTConfigSubscription sub) { + return sub.getState() == ConfigSubscription.State.CLOSED; + } + private long calculateSuccessTimeout() { return timingValues.getPlusMinusFractionRandom(timingValues.getSuccessTimeout(), randomFraction); } -- cgit v1.2.3