summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorValerij Fredriksen <freva@users.noreply.github.com>2022-08-02 14:26:11 +0200
committerGitHub <noreply@github.com>2022-08-02 14:26:11 +0200
commitf4e21aefac14215826661298d1a980c90238af72 (patch)
treea68373b8ba70049c8987ffd9d1b86f7648d0b51a
parent52c0b28f9467e235279924cabba9f04b07f270e4 (diff)
parenta12ef29bb2649094d9f8647255c45ce7e0ca4ff7 (diff)
Merge pull request #23564 from vespa-engine/hakonhall/make-some-convergenceexception-increment-the-unhandledexceptions-metric
Make some ConvergenceException increment the unhandled_exceptions metric
-rw-r--r--node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/configserver/ConnectionException.java2
-rw-r--r--node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/configserver/HttpException.java9
-rw-r--r--node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/configserver/noderepository/NodeRepositoryException.java6
-rw-r--r--node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/configserver/orchestrator/OrchestratorException.java7
-rw-r--r--node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/configserver/orchestrator/OrchestratorImpl.java6
-rw-r--r--node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/configserver/orchestrator/OrchestratorNotFoundException.java2
-rw-r--r--node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/StorageMaintainer.java6
-rw-r--r--node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/coredump/CoreCollector.java8
-rw-r--r--node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/coredump/CoredumpHandler.java2
-rw-r--r--node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/ConvergenceException.java29
-rw-r--r--node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdater.java4
-rw-r--r--node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java10
-rw-r--r--node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/task/util/network/IPAddresses.java4
-rw-r--r--node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java4
14 files changed, 60 insertions, 39 deletions
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/configserver/ConnectionException.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/configserver/ConnectionException.java
index da93083a6de..c78d60e9950 100644
--- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/configserver/ConnectionException.java
+++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/configserver/ConnectionException.java
@@ -15,7 +15,7 @@ import java.net.SocketTimeoutException;
public class ConnectionException extends ConvergenceException {
private ConnectionException(String message, Throwable cause) {
- super(message, cause);
+ super(message, cause, true);
}
/**
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/configserver/HttpException.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/configserver/HttpException.java
index 4de7582fb7b..53e15b6c647 100644
--- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/configserver/HttpException.java
+++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/configserver/HttpException.java
@@ -14,20 +14,15 @@ public class HttpException extends ConvergenceException {
private final boolean isRetryable;
private HttpException(int statusCode, String message, boolean isRetryable) {
- super("HTTP status code " + statusCode + ": " + message);
+ super("HTTP status code " + statusCode + ": " + message, null, !isRetryable);
this.isRetryable = isRetryable;
}
private HttpException(Response.Status status, String message, boolean isRetryable) {
- super(status.toString() + " (" + status.getStatusCode() + "): " + message);
+ super(status.toString() + " (" + status.getStatusCode() + "): " + message, null, !isRetryable);
this.isRetryable = isRetryable;
}
- private HttpException(String message) {
- super(message);
- this.isRetryable = false;
- }
-
boolean isRetryable() {
return isRetryable;
}
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/configserver/noderepository/NodeRepositoryException.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/configserver/noderepository/NodeRepositoryException.java
index a0b0499bb1e..636813a2169 100644
--- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/configserver/noderepository/NodeRepositoryException.java
+++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/configserver/noderepository/NodeRepositoryException.java
@@ -5,10 +5,6 @@ import com.yahoo.vespa.hosted.node.admin.nodeadmin.ConvergenceException;
public class NodeRepositoryException extends ConvergenceException {
public NodeRepositoryException(String message) {
- super(message);
- }
-
- public NodeRepositoryException(String message, Exception exception) {
- super(message, exception);
+ super(message, null, true);
}
}
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/configserver/orchestrator/OrchestratorException.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/configserver/orchestrator/OrchestratorException.java
index 9b8a749c33d..917b65b606c 100644
--- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/configserver/orchestrator/OrchestratorException.java
+++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/configserver/orchestrator/OrchestratorException.java
@@ -5,7 +5,12 @@ import com.yahoo.vespa.hosted.node.admin.nodeadmin.ConvergenceException;
@SuppressWarnings("serial")
public class OrchestratorException extends ConvergenceException {
+ /** Creates a transient convergence exception. */
public OrchestratorException(String message) {
- super(message);
+ this(message, true);
+ }
+
+ protected OrchestratorException(String message, boolean isError) {
+ super(message, null, isError);
}
}
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/configserver/orchestrator/OrchestratorImpl.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/configserver/orchestrator/OrchestratorImpl.java
index 281d43e0afc..858bce27ed8 100644
--- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/configserver/orchestrator/OrchestratorImpl.java
+++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/configserver/orchestrator/OrchestratorImpl.java
@@ -59,7 +59,7 @@ public class OrchestratorImpl implements Orchestrator {
} catch (HttpException e) {
throw new OrchestratorException("Failed to suspend " + hostName + ": " + e.toString());
} catch (ConnectionException e) {
- throw new ConvergenceException("Failed to suspend " + hostName + ": " + e.getMessage());
+ throw ConvergenceException.ofTransient("Failed to suspend " + hostName + ": " + e.getMessage());
} catch (RuntimeException e) {
throw new RuntimeException("Got error on suspend", e);
}
@@ -105,7 +105,7 @@ public class OrchestratorImpl implements Orchestrator {
} catch (HttpException e) {
throw new OrchestratorException("Failed to batch suspend for " + parentHostName + ": " + e.toString());
} catch (ConnectionException e) {
- throw new ConvergenceException("Failed to batch suspend for " + parentHostName + ": " + e.getMessage());
+ throw ConvergenceException.ofTransient("Failed to batch suspend for " + parentHostName + ": " + e.getMessage());
} catch (RuntimeException e) {
throw new RuntimeException("Got error on batch suspend for " + parentHostName + ", with nodes " + hostNames, e);
}
@@ -126,7 +126,7 @@ public class OrchestratorImpl implements Orchestrator {
} catch (HttpException e) {
throw new OrchestratorException("Failed to resume " + hostName + ": " + e.toString());
} catch (ConnectionException e) {
- throw new ConvergenceException("Failed to resume " + hostName + ": " + e.getMessage());
+ throw ConvergenceException.ofTransient("Failed to resume " + hostName + ": " + e.getMessage());
} catch (RuntimeException e) {
throw new RuntimeException("Got error on resume", e);
}
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/configserver/orchestrator/OrchestratorNotFoundException.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/configserver/orchestrator/OrchestratorNotFoundException.java
index 3b016aac03f..a6a54807e56 100644
--- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/configserver/orchestrator/OrchestratorNotFoundException.java
+++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/configserver/orchestrator/OrchestratorNotFoundException.java
@@ -4,6 +4,6 @@ package com.yahoo.vespa.hosted.node.admin.configserver.orchestrator;
@SuppressWarnings("serial")
public class OrchestratorNotFoundException extends OrchestratorException {
public OrchestratorNotFoundException(String message) {
- super(message);
+ super(message, true);
}
}
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/StorageMaintainer.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/StorageMaintainer.java
index c893f7ffee4..062a7ce018d 100644
--- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/StorageMaintainer.java
+++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/StorageMaintainer.java
@@ -120,7 +120,7 @@ public class StorageMaintainer {
String[] results = output.split("\t");
if (results.length != 2) {
- throw new ConvergenceException("Result from disk usage command not as expected: " + output);
+ throw ConvergenceException.ofError("Result from disk usage command not as expected: " + output);
}
return DiskSize.of(Long.parseLong(results[0]), DiskSize.Unit.kiB);
@@ -226,11 +226,11 @@ public class StorageMaintainer {
String output = uncheck(() -> Files.readAllLines(Paths.get("/proc/cpuinfo")).stream()
.filter(line -> line.startsWith("microcode"))
.findFirst()
- .orElseThrow(() -> new ConvergenceException("No microcode information found in /proc/cpuinfo")));
+ .orElseThrow(() -> ConvergenceException.ofError("No microcode information found in /proc/cpuinfo")));
String[] results = output.split(":");
if (results.length != 2) {
- throw new ConvergenceException("Result from detect microcode command not as expected: " + output);
+ throw ConvergenceException.ofError("Result from detect microcode command not as expected: " + output);
}
return results[1].trim();
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/coredump/CoreCollector.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/coredump/CoreCollector.java
index 45dbfd07209..0933f22dee3 100644
--- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/coredump/CoreCollector.java
+++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/coredump/CoreCollector.java
@@ -51,7 +51,7 @@ public class CoreCollector {
Matcher matcher = CORE_GENERATOR_PATH_PATTERN.matcher(result.getOutput());
if (! matcher.find()) {
- throw new ConvergenceException(String.format("Failed to extract binary path from GDB, result: %s, command: %s",
+ throw ConvergenceException.ofError(String.format("Failed to extract binary path from GDB, result: %s, command: %s",
asString(result), Arrays.toString(wrappedCommand)));
}
return matcher.group("path").split(" ")[0];
@@ -62,7 +62,7 @@ public class CoreCollector {
try {
CommandResult result = container.executeCommandInContainer(context, context.users().root(), command);
if (result.getExitCode() != 0) {
- throw new ConvergenceException("file command failed with " + asString(result));
+ throw ConvergenceException.ofError("file command failed with " + asString(result));
}
Matcher execfnMatcher = EXECFN_PATH_PATTERN.matcher(result.getOutput());
@@ -89,7 +89,7 @@ public class CoreCollector {
CommandResult result = container.executeCommandInContainer(context, context.users().root(), command);
if (result.getExitCode() != 0)
- throw new ConvergenceException("Failed to read backtrace " + asString(result) + ", Command: " + Arrays.toString(command));
+ throw ConvergenceException.ofError("Failed to read backtrace " + asString(result) + ", Command: " + Arrays.toString(command));
return List.of(result.getOutput().split("\n"));
}
@@ -99,7 +99,7 @@ public class CoreCollector {
CommandResult result = container.executeCommandInContainer(context, context.users().root(), command);
if (result.getExitCode() != 0)
- throw new ConvergenceException("Failed to read jstack " + asString(result) + ", Command: " + Arrays.toString(command));
+ throw ConvergenceException.ofError("Failed to read jstack " + asString(result) + ", Command: " + Arrays.toString(command));
return List.of(result.getOutput().split("\n"));
}
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/coredump/CoredumpHandler.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/coredump/CoredumpHandler.java
index 6295765a95f..ece494a34d7 100644
--- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/coredump/CoredumpHandler.java
+++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/coredump/CoredumpHandler.java
@@ -95,7 +95,7 @@ public class CoredumpHandler {
.map(FileFinder.FileAttributes::filename)
.toList();
if (!pendingCores.isEmpty())
- throw new ConvergenceException(String.format("Cannot process %s coredumps: Still being written",
+ throw ConvergenceException.ofError(String.format("Cannot process %s coredumps: Still being written",
pendingCores.size() < 5 ? pendingCores : pendingCores.size()));
}
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/ConvergenceException.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/ConvergenceException.java
index 16a5eb022ad..c1b86fc7fe2 100644
--- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/ConvergenceException.java
+++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/ConvergenceException.java
@@ -8,11 +8,34 @@ package com.yahoo.vespa.hosted.node.admin.nodeadmin;
*/
@SuppressWarnings("serial")
public class ConvergenceException extends RuntimeException {
- public ConvergenceException(String message) {
- super(message);
+ /** Create an exception that will NOT increment the monitored unhandled_exceptions metric. */
+ public static ConvergenceException ofTransient(String message) { return ofTransient(message, null); }
+
+ /** Create an exception that will NOT increment the monitored unhandled_exceptions metric. */
+ public static ConvergenceException ofTransient(String message, Throwable t) { return new ConvergenceException(message, t, false); }
+
+ /** Create an exception that increments the monitored unhandled_exceptions metric. */
+ public static ConvergenceException ofError(String message) { return ofError(message, null); }
+
+ /** Create an exception that increments the monitored unhandled_exceptions metric. */
+ public static ConvergenceException ofError(String message, Throwable t) { return new ConvergenceException(message, t, true); }
+
+ /** Create an exception with the same transient/error as the cause. */
+ public static ConvergenceException ofNested(String message, ConvergenceException cause) { return new ConvergenceException(message, cause, cause.isError); }
+
+ private final boolean isError;
+
+ /** @param isError whether the exception should increment the monitored unhandled_exception metric. */
+ protected ConvergenceException(String message, boolean isError) {
+ this(message, null, isError);
}
- public ConvergenceException(String message, Throwable t) {
+ /** @param isError whether the exception should increment the monitored unhandled_exception metric. */
+ protected ConvergenceException(String message, Throwable t, boolean isError) {
super(message, t);
+ this.isError = isError;
}
+
+ /** Whether the exception signals an error someone may want to look at, or whether it is expected to be transient (false). */
+ public boolean isError() { return isError; }
}
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdater.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdater.java
index dda404797d9..314844dc6eb 100644
--- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdater.java
+++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdater.java
@@ -110,7 +110,7 @@ public class NodeAdminStateUpdater {
if (hostIsActiveInNR) orchestrator.resume(hostHostname);
- throw new ConvergenceException("Timed out trying to freeze all nodes: will force an unfrozen tick");
+ throw ConvergenceException.ofTransient("Timed out trying to freeze all nodes: will force an unfrozen tick");
}
boolean wantFrozen = wantedState != RESUMED;
@@ -118,7 +118,7 @@ public class NodeAdminStateUpdater {
currentState = TRANSITIONING;
if (!nodeAdmin.setFrozen(wantFrozen))
- throw new ConvergenceException("NodeAdmin is not yet " + (wantFrozen ? "frozen" : "unfrozen"));
+ throw ConvergenceException.ofTransient("NodeAdmin is not yet " + (wantFrozen ? "frozen" : "unfrozen"));
switch (wantedState) {
case RESUMED:
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java
index 09bc58bdaa2..6408132fe0b 100644
--- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java
+++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java
@@ -244,7 +244,7 @@ public class NodeAgentImpl implements NodeAgent {
hasResumedNode = false;
context.log(logger, "Container successfully started, new containerState is " + containerState);
return containerOperations.getContainer(context).orElseThrow(() ->
- new ConvergenceException("Did not find container that was just started"));
+ ConvergenceException.ofError("Did not find container that was just started"));
}
private Optional<Container> removeContainerIfNeededUpdateContainerState(
@@ -399,7 +399,7 @@ public class NodeAgentImpl implements NodeAgent {
// Only update CPU resources
containerOperations.updateContainer(context, existingContainer.id(), wantedContainerResources.withMemoryBytes(existingContainer.resources().memoryBytes()));
return containerOperations.getContainer(context).orElseThrow(() ->
- new ConvergenceException("Did not find container that was just updated"));
+ ConvergenceException.ofError("Did not find container that was just updated"));
}
private ContainerResources getContainerResources(NodeAgentContext context) {
@@ -435,6 +435,8 @@ public class NodeAgentImpl implements NodeAgent {
context.log(logger, Level.INFO, "Converged");
} catch (ConvergenceException e) {
context.log(logger, e.getMessage());
+ if (e.isError())
+ numberOfUnhandledException++;
} catch (Throwable e) {
numberOfUnhandledException++;
context.log(logger, Level.SEVERE, "Unhandled exception, ignoring", e);
@@ -501,7 +503,7 @@ public class NodeAgentImpl implements NodeAgent {
Duration timeLeft = Duration.between(clock.instant(), firstSuccessfulHealthCheckInstant.get().plus(warmUpDuration(context)));
if (!container.get().resources().equalsCpu(getContainerResources(context)))
- throw new ConvergenceException("Refusing to resume until warm up period ends (" +
+ throw ConvergenceException.ofTransient("Refusing to resume until warm up period ends (" +
(timeLeft.isNegative() ? "next tick" : "in " + timeLeft) + ")");
}
serviceDumper.processServiceDumpRequest(context);
@@ -536,7 +538,7 @@ public class NodeAgentImpl implements NodeAgent {
nodeRepository.setNodeState(context.hostname().value(), NodeState.ready);
break;
default:
- throw new ConvergenceException("UNKNOWN STATE " + node.state().name());
+ throw ConvergenceException.ofError("UNKNOWN STATE " + node.state().name());
}
}
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/task/util/network/IPAddresses.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/task/util/network/IPAddresses.java
index 07e172c5117..148d80c9803 100644
--- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/task/util/network/IPAddresses.java
+++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/task/util/network/IPAddresses.java
@@ -67,7 +67,7 @@ public interface IPAddresses {
if (ipv6addresses.size() <= 1) return ipv6addresses.stream().findFirst();
String addresses = ipv6addresses.stream().map(InetAddresses::toAddrString).collect(Collectors.joining(","));
- throw new ConvergenceException(
+ throw ConvergenceException.ofError(
String.format(
"Multiple IPv6 addresses found: %s. Perhaps a missing DNS entry or multiple AAAA records in DNS?",
addresses));
@@ -103,7 +103,7 @@ public interface IPAddresses {
if (siteLocalIPv4Addresses.size() == 1) return Optional.of(siteLocalIPv4Addresses.get(0));
String addresses = ipv4Addresses.stream().map(InetAddresses::toAddrString).collect(Collectors.joining(","));
- throw new ConvergenceException(
+ throw ConvergenceException.ofError(
String.format(
"Multiple IPv4 addresses found: %s. Perhaps a missing DNS entry or multiple A records in DNS?",
addresses));
diff --git a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java
index b13a3a184e5..e4c2f595164 100644
--- a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java
+++ b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java
@@ -334,7 +334,7 @@ public class NodeAgentImplTest {
when(nodeRepository.getOptionalNode(hostName)).thenReturn(Optional.of(node));
when(containerOperations.pullImageAsyncIfNeeded(any(), eq(dockerImage), any())).thenReturn(false);
- doThrow(new ConvergenceException("Connection refused")).doNothing()
+ doThrow(ConvergenceException.ofTransient("Connection refused")).doNothing()
.when(healthChecker).verifyHealth(eq(context));
try {
@@ -640,7 +640,7 @@ public class NodeAgentImplTest {
InOrder inOrder = inOrder(orchestrator, containerOperations);
- ConvergenceException healthCheckException = new ConvergenceException("Not yet up");
+ ConvergenceException healthCheckException = ConvergenceException.ofTransient("Not yet up");
doThrow(healthCheckException).when(healthChecker).verifyHealth(any());
for (int i = 0; i < 3; i++) {
try {