From a12ef29bb2649094d9f8647255c45ce7e0ca4ff7 Mon Sep 17 00:00:00 2001 From: HÃ¥kon Hallingstad Date: Tue, 2 Aug 2022 12:11:34 +0200 Subject: Make some ConvergenceException increment the unhandled_exceptions metric --- .../admin/configserver/ConnectionException.java | 2 +- .../node/admin/configserver/HttpException.java | 9 ++----- .../noderepository/NodeRepositoryException.java | 6 +---- .../orchestrator/OrchestratorException.java | 7 +++++- .../orchestrator/OrchestratorImpl.java | 6 ++--- .../OrchestratorNotFoundException.java | 2 +- .../node/admin/maintenance/StorageMaintainer.java | 6 ++--- .../admin/maintenance/coredump/CoreCollector.java | 8 +++--- .../maintenance/coredump/CoredumpHandler.java | 2 +- .../node/admin/nodeadmin/ConvergenceException.java | 29 +++++++++++++++++++--- .../admin/nodeadmin/NodeAdminStateUpdater.java | 4 +-- .../hosted/node/admin/nodeagent/NodeAgentImpl.java | 10 +++++--- .../node/admin/task/util/network/IPAddresses.java | 4 +-- .../node/admin/nodeagent/NodeAgentImplTest.java | 4 +-- 14 files changed, 60 insertions(+), 39 deletions(-) diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/configserver/ConnectionException.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/configserver/ConnectionException.java index da93083a6de..c78d60e9950 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/configserver/ConnectionException.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/configserver/ConnectionException.java @@ -15,7 +15,7 @@ import java.net.SocketTimeoutException; public class ConnectionException extends ConvergenceException { private ConnectionException(String message, Throwable cause) { - super(message, cause); + super(message, cause, true); } /** diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/configserver/HttpException.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/configserver/HttpException.java index 4de7582fb7b..53e15b6c647 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/configserver/HttpException.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/configserver/HttpException.java @@ -14,20 +14,15 @@ public class HttpException extends ConvergenceException { private final boolean isRetryable; private HttpException(int statusCode, String message, boolean isRetryable) { - super("HTTP status code " + statusCode + ": " + message); + super("HTTP status code " + statusCode + ": " + message, null, !isRetryable); this.isRetryable = isRetryable; } private HttpException(Response.Status status, String message, boolean isRetryable) { - super(status.toString() + " (" + status.getStatusCode() + "): " + message); + super(status.toString() + " (" + status.getStatusCode() + "): " + message, null, !isRetryable); this.isRetryable = isRetryable; } - private HttpException(String message) { - super(message); - this.isRetryable = false; - } - boolean isRetryable() { return isRetryable; } diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/configserver/noderepository/NodeRepositoryException.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/configserver/noderepository/NodeRepositoryException.java index a0b0499bb1e..636813a2169 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/configserver/noderepository/NodeRepositoryException.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/configserver/noderepository/NodeRepositoryException.java @@ -5,10 +5,6 @@ import com.yahoo.vespa.hosted.node.admin.nodeadmin.ConvergenceException; public class NodeRepositoryException extends ConvergenceException { public NodeRepositoryException(String message) { - super(message); - } - - public NodeRepositoryException(String message, Exception exception) { - super(message, exception); + super(message, null, true); } } diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/configserver/orchestrator/OrchestratorException.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/configserver/orchestrator/OrchestratorException.java index 9b8a749c33d..917b65b606c 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/configserver/orchestrator/OrchestratorException.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/configserver/orchestrator/OrchestratorException.java @@ -5,7 +5,12 @@ import com.yahoo.vespa.hosted.node.admin.nodeadmin.ConvergenceException; @SuppressWarnings("serial") public class OrchestratorException extends ConvergenceException { + /** Creates a transient convergence exception. */ public OrchestratorException(String message) { - super(message); + this(message, true); + } + + protected OrchestratorException(String message, boolean isError) { + super(message, null, isError); } } diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/configserver/orchestrator/OrchestratorImpl.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/configserver/orchestrator/OrchestratorImpl.java index 281d43e0afc..858bce27ed8 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/configserver/orchestrator/OrchestratorImpl.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/configserver/orchestrator/OrchestratorImpl.java @@ -59,7 +59,7 @@ public class OrchestratorImpl implements Orchestrator { } catch (HttpException e) { throw new OrchestratorException("Failed to suspend " + hostName + ": " + e.toString()); } catch (ConnectionException e) { - throw new ConvergenceException("Failed to suspend " + hostName + ": " + e.getMessage()); + throw ConvergenceException.ofTransient("Failed to suspend " + hostName + ": " + e.getMessage()); } catch (RuntimeException e) { throw new RuntimeException("Got error on suspend", e); } @@ -105,7 +105,7 @@ public class OrchestratorImpl implements Orchestrator { } catch (HttpException e) { throw new OrchestratorException("Failed to batch suspend for " + parentHostName + ": " + e.toString()); } catch (ConnectionException e) { - throw new ConvergenceException("Failed to batch suspend for " + parentHostName + ": " + e.getMessage()); + throw ConvergenceException.ofTransient("Failed to batch suspend for " + parentHostName + ": " + e.getMessage()); } catch (RuntimeException e) { throw new RuntimeException("Got error on batch suspend for " + parentHostName + ", with nodes " + hostNames, e); } @@ -126,7 +126,7 @@ public class OrchestratorImpl implements Orchestrator { } catch (HttpException e) { throw new OrchestratorException("Failed to resume " + hostName + ": " + e.toString()); } catch (ConnectionException e) { - throw new ConvergenceException("Failed to resume " + hostName + ": " + e.getMessage()); + throw ConvergenceException.ofTransient("Failed to resume " + hostName + ": " + e.getMessage()); } catch (RuntimeException e) { throw new RuntimeException("Got error on resume", e); } diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/configserver/orchestrator/OrchestratorNotFoundException.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/configserver/orchestrator/OrchestratorNotFoundException.java index 3b016aac03f..a6a54807e56 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/configserver/orchestrator/OrchestratorNotFoundException.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/configserver/orchestrator/OrchestratorNotFoundException.java @@ -4,6 +4,6 @@ package com.yahoo.vespa.hosted.node.admin.configserver.orchestrator; @SuppressWarnings("serial") public class OrchestratorNotFoundException extends OrchestratorException { public OrchestratorNotFoundException(String message) { - super(message); + super(message, true); } } diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/StorageMaintainer.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/StorageMaintainer.java index c893f7ffee4..062a7ce018d 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/StorageMaintainer.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/StorageMaintainer.java @@ -120,7 +120,7 @@ public class StorageMaintainer { String[] results = output.split("\t"); if (results.length != 2) { - throw new ConvergenceException("Result from disk usage command not as expected: " + output); + throw ConvergenceException.ofError("Result from disk usage command not as expected: " + output); } return DiskSize.of(Long.parseLong(results[0]), DiskSize.Unit.kiB); @@ -226,11 +226,11 @@ public class StorageMaintainer { String output = uncheck(() -> Files.readAllLines(Paths.get("/proc/cpuinfo")).stream() .filter(line -> line.startsWith("microcode")) .findFirst() - .orElseThrow(() -> new ConvergenceException("No microcode information found in /proc/cpuinfo"))); + .orElseThrow(() -> ConvergenceException.ofError("No microcode information found in /proc/cpuinfo"))); String[] results = output.split(":"); if (results.length != 2) { - throw new ConvergenceException("Result from detect microcode command not as expected: " + output); + throw ConvergenceException.ofError("Result from detect microcode command not as expected: " + output); } return results[1].trim(); diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/coredump/CoreCollector.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/coredump/CoreCollector.java index 45dbfd07209..0933f22dee3 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/coredump/CoreCollector.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/coredump/CoreCollector.java @@ -51,7 +51,7 @@ public class CoreCollector { Matcher matcher = CORE_GENERATOR_PATH_PATTERN.matcher(result.getOutput()); if (! matcher.find()) { - throw new ConvergenceException(String.format("Failed to extract binary path from GDB, result: %s, command: %s", + throw ConvergenceException.ofError(String.format("Failed to extract binary path from GDB, result: %s, command: %s", asString(result), Arrays.toString(wrappedCommand))); } return matcher.group("path").split(" ")[0]; @@ -62,7 +62,7 @@ public class CoreCollector { try { CommandResult result = container.executeCommandInContainer(context, context.users().root(), command); if (result.getExitCode() != 0) { - throw new ConvergenceException("file command failed with " + asString(result)); + throw ConvergenceException.ofError("file command failed with " + asString(result)); } Matcher execfnMatcher = EXECFN_PATH_PATTERN.matcher(result.getOutput()); @@ -89,7 +89,7 @@ public class CoreCollector { CommandResult result = container.executeCommandInContainer(context, context.users().root(), command); if (result.getExitCode() != 0) - throw new ConvergenceException("Failed to read backtrace " + asString(result) + ", Command: " + Arrays.toString(command)); + throw ConvergenceException.ofError("Failed to read backtrace " + asString(result) + ", Command: " + Arrays.toString(command)); return List.of(result.getOutput().split("\n")); } @@ -99,7 +99,7 @@ public class CoreCollector { CommandResult result = container.executeCommandInContainer(context, context.users().root(), command); if (result.getExitCode() != 0) - throw new ConvergenceException("Failed to read jstack " + asString(result) + ", Command: " + Arrays.toString(command)); + throw ConvergenceException.ofError("Failed to read jstack " + asString(result) + ", Command: " + Arrays.toString(command)); return List.of(result.getOutput().split("\n")); } diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/coredump/CoredumpHandler.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/coredump/CoredumpHandler.java index 6295765a95f..ece494a34d7 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/coredump/CoredumpHandler.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/coredump/CoredumpHandler.java @@ -95,7 +95,7 @@ public class CoredumpHandler { .map(FileFinder.FileAttributes::filename) .toList(); if (!pendingCores.isEmpty()) - throw new ConvergenceException(String.format("Cannot process %s coredumps: Still being written", + throw ConvergenceException.ofError(String.format("Cannot process %s coredumps: Still being written", pendingCores.size() < 5 ? pendingCores : pendingCores.size())); } diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/ConvergenceException.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/ConvergenceException.java index 16a5eb022ad..c1b86fc7fe2 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/ConvergenceException.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/ConvergenceException.java @@ -8,11 +8,34 @@ package com.yahoo.vespa.hosted.node.admin.nodeadmin; */ @SuppressWarnings("serial") public class ConvergenceException extends RuntimeException { - public ConvergenceException(String message) { - super(message); + /** Create an exception that will NOT increment the monitored unhandled_exceptions metric. */ + public static ConvergenceException ofTransient(String message) { return ofTransient(message, null); } + + /** Create an exception that will NOT increment the monitored unhandled_exceptions metric. */ + public static ConvergenceException ofTransient(String message, Throwable t) { return new ConvergenceException(message, t, false); } + + /** Create an exception that increments the monitored unhandled_exceptions metric. */ + public static ConvergenceException ofError(String message) { return ofError(message, null); } + + /** Create an exception that increments the monitored unhandled_exceptions metric. */ + public static ConvergenceException ofError(String message, Throwable t) { return new ConvergenceException(message, t, true); } + + /** Create an exception with the same transient/error as the cause. */ + public static ConvergenceException ofNested(String message, ConvergenceException cause) { return new ConvergenceException(message, cause, cause.isError); } + + private final boolean isError; + + /** @param isError whether the exception should increment the monitored unhandled_exception metric. */ + protected ConvergenceException(String message, boolean isError) { + this(message, null, isError); } - public ConvergenceException(String message, Throwable t) { + /** @param isError whether the exception should increment the monitored unhandled_exception metric. */ + protected ConvergenceException(String message, Throwable t, boolean isError) { super(message, t); + this.isError = isError; } + + /** Whether the exception signals an error someone may want to look at, or whether it is expected to be transient (false). */ + public boolean isError() { return isError; } } diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdater.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdater.java index dda404797d9..314844dc6eb 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdater.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdater.java @@ -110,7 +110,7 @@ public class NodeAdminStateUpdater { if (hostIsActiveInNR) orchestrator.resume(hostHostname); - throw new ConvergenceException("Timed out trying to freeze all nodes: will force an unfrozen tick"); + throw ConvergenceException.ofTransient("Timed out trying to freeze all nodes: will force an unfrozen tick"); } boolean wantFrozen = wantedState != RESUMED; @@ -118,7 +118,7 @@ public class NodeAdminStateUpdater { currentState = TRANSITIONING; if (!nodeAdmin.setFrozen(wantFrozen)) - throw new ConvergenceException("NodeAdmin is not yet " + (wantFrozen ? "frozen" : "unfrozen")); + throw ConvergenceException.ofTransient("NodeAdmin is not yet " + (wantFrozen ? "frozen" : "unfrozen")); switch (wantedState) { case RESUMED: diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java index 09bc58bdaa2..6408132fe0b 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java @@ -244,7 +244,7 @@ public class NodeAgentImpl implements NodeAgent { hasResumedNode = false; context.log(logger, "Container successfully started, new containerState is " + containerState); return containerOperations.getContainer(context).orElseThrow(() -> - new ConvergenceException("Did not find container that was just started")); + ConvergenceException.ofError("Did not find container that was just started")); } private Optional removeContainerIfNeededUpdateContainerState( @@ -399,7 +399,7 @@ public class NodeAgentImpl implements NodeAgent { // Only update CPU resources containerOperations.updateContainer(context, existingContainer.id(), wantedContainerResources.withMemoryBytes(existingContainer.resources().memoryBytes())); return containerOperations.getContainer(context).orElseThrow(() -> - new ConvergenceException("Did not find container that was just updated")); + ConvergenceException.ofError("Did not find container that was just updated")); } private ContainerResources getContainerResources(NodeAgentContext context) { @@ -435,6 +435,8 @@ public class NodeAgentImpl implements NodeAgent { context.log(logger, Level.INFO, "Converged"); } catch (ConvergenceException e) { context.log(logger, e.getMessage()); + if (e.isError()) + numberOfUnhandledException++; } catch (Throwable e) { numberOfUnhandledException++; context.log(logger, Level.SEVERE, "Unhandled exception, ignoring", e); @@ -501,7 +503,7 @@ public class NodeAgentImpl implements NodeAgent { Duration timeLeft = Duration.between(clock.instant(), firstSuccessfulHealthCheckInstant.get().plus(warmUpDuration(context))); if (!container.get().resources().equalsCpu(getContainerResources(context))) - throw new ConvergenceException("Refusing to resume until warm up period ends (" + + throw ConvergenceException.ofTransient("Refusing to resume until warm up period ends (" + (timeLeft.isNegative() ? "next tick" : "in " + timeLeft) + ")"); } serviceDumper.processServiceDumpRequest(context); @@ -536,7 +538,7 @@ public class NodeAgentImpl implements NodeAgent { nodeRepository.setNodeState(context.hostname().value(), NodeState.ready); break; default: - throw new ConvergenceException("UNKNOWN STATE " + node.state().name()); + throw ConvergenceException.ofError("UNKNOWN STATE " + node.state().name()); } } diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/task/util/network/IPAddresses.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/task/util/network/IPAddresses.java index 07e172c5117..148d80c9803 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/task/util/network/IPAddresses.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/task/util/network/IPAddresses.java @@ -67,7 +67,7 @@ public interface IPAddresses { if (ipv6addresses.size() <= 1) return ipv6addresses.stream().findFirst(); String addresses = ipv6addresses.stream().map(InetAddresses::toAddrString).collect(Collectors.joining(",")); - throw new ConvergenceException( + throw ConvergenceException.ofError( String.format( "Multiple IPv6 addresses found: %s. Perhaps a missing DNS entry or multiple AAAA records in DNS?", addresses)); @@ -103,7 +103,7 @@ public interface IPAddresses { if (siteLocalIPv4Addresses.size() == 1) return Optional.of(siteLocalIPv4Addresses.get(0)); String addresses = ipv4Addresses.stream().map(InetAddresses::toAddrString).collect(Collectors.joining(",")); - throw new ConvergenceException( + throw ConvergenceException.ofError( String.format( "Multiple IPv4 addresses found: %s. Perhaps a missing DNS entry or multiple A records in DNS?", addresses)); diff --git a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java index b13a3a184e5..e4c2f595164 100644 --- a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java +++ b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java @@ -334,7 +334,7 @@ public class NodeAgentImplTest { when(nodeRepository.getOptionalNode(hostName)).thenReturn(Optional.of(node)); when(containerOperations.pullImageAsyncIfNeeded(any(), eq(dockerImage), any())).thenReturn(false); - doThrow(new ConvergenceException("Connection refused")).doNothing() + doThrow(ConvergenceException.ofTransient("Connection refused")).doNothing() .when(healthChecker).verifyHealth(eq(context)); try { @@ -640,7 +640,7 @@ public class NodeAgentImplTest { InOrder inOrder = inOrder(orchestrator, containerOperations); - ConvergenceException healthCheckException = new ConvergenceException("Not yet up"); + ConvergenceException healthCheckException = ConvergenceException.ofTransient("Not yet up"); doThrow(healthCheckException).when(healthChecker).verifyHealth(any()); for (int i = 0; i < 3; i++) { try { -- cgit v1.2.3