aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorErlend <erlendniko@hotmail.com>2022-08-03 11:28:48 +0200
committerErlend <erlendniko@hotmail.com>2022-08-03 11:28:48 +0200
commit2cc3d2edc707c5c5043ffc2654659afbe25f9130 (patch)
tree50afa622d2864b0160696c4d813d0edc811637f5
parent3cdafb828b28283e03c7fb1299165eba31e74675 (diff)
parent8af644212e91df05d2a33540ebb4e90b149b1624 (diff)
Merge remote-tracking branch 'upstream/master'
-rw-r--r--controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ControllerMaintenance.java2
-rw-r--r--controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/EndpointCertificateMaintainer.java26
-rw-r--r--controller-server/src/main/java/com/yahoo/vespa/hosted/controller/restapi/billing/BillingApiHandlerV2.java9
-rw-r--r--controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/EndpointCertificateMaintainerTest.java28
-rw-r--r--controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/billing/BillingApiHandlerV2Test.java2
-rw-r--r--node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/configserver/ConnectionException.java2
-rw-r--r--node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/configserver/HttpException.java9
-rw-r--r--node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/configserver/noderepository/NodeRepositoryException.java6
-rw-r--r--node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/configserver/orchestrator/OrchestratorException.java7
-rw-r--r--node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/configserver/orchestrator/OrchestratorImpl.java6
-rw-r--r--node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/configserver/orchestrator/OrchestratorNotFoundException.java2
-rw-r--r--node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/StorageMaintainer.java6
-rw-r--r--node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/coredump/CoreCollector.java8
-rw-r--r--node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/coredump/CoredumpHandler.java2
-rw-r--r--node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/ConvergenceException.java29
-rw-r--r--node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdater.java4
-rw-r--r--node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java10
-rw-r--r--node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/task/util/network/IPAddresses.java4
-rw-r--r--node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java4
19 files changed, 118 insertions, 48 deletions
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ControllerMaintenance.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ControllerMaintenance.java
index 9793cded918..a259ed2fdef 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ControllerMaintenance.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ControllerMaintenance.java
@@ -156,7 +156,7 @@ public class ControllerMaintenance extends AbstractComponent {
this.containerImageExpirer = duration(12, HOURS);
this.hostInfoUpdater = duration(12, HOURS);
this.reindexingTriggerer = duration(1, HOURS);
- this.endpointCertificateMaintainer = duration(12, HOURS);
+ this.endpointCertificateMaintainer = duration(1, HOURS);
this.trafficFractionUpdater = duration(5, MINUTES);
this.archiveUriUpdater = duration(5, MINUTES);
this.archiveAccessMaintainer = duration(10, MINUTES);
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/EndpointCertificateMaintainer.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/EndpointCertificateMaintainer.java
index f3256237284..2e2680cd34a 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/EndpointCertificateMaintainer.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/EndpointCertificateMaintainer.java
@@ -7,7 +7,6 @@ import com.yahoo.config.provision.ApplicationId;
import com.yahoo.container.jdisc.secretstore.SecretNotFoundException;
import com.yahoo.container.jdisc.secretstore.SecretStore;
import com.yahoo.transaction.Mutex;
-import com.yahoo.vespa.curator.Lock;
import com.yahoo.vespa.hosted.controller.Controller;
import com.yahoo.vespa.hosted.controller.Instance;
import com.yahoo.vespa.hosted.controller.api.integration.certificates.EndpointCertificateDetails;
@@ -15,6 +14,7 @@ import com.yahoo.vespa.hosted.controller.api.integration.certificates.EndpointCe
import com.yahoo.vespa.hosted.controller.api.integration.certificates.EndpointCertificateProvider;
import com.yahoo.vespa.hosted.controller.api.integration.certificates.EndpointCertificateRequestMetadata;
import com.yahoo.vespa.hosted.controller.api.integration.deployment.JobType;
+import com.yahoo.vespa.hosted.controller.application.Deployment;
import com.yahoo.vespa.hosted.controller.application.TenantAndApplicationId;
import com.yahoo.vespa.hosted.controller.deployment.DeploymentTrigger;
import com.yahoo.vespa.hosted.controller.persistence.CuratorDb;
@@ -23,6 +23,8 @@ import java.time.Clock;
import java.time.Duration;
import java.time.Instant;
import java.time.temporal.ChronoUnit;
+import java.util.ArrayList;
+import java.util.Comparator;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
@@ -48,6 +50,7 @@ public class EndpointCertificateMaintainer extends ControllerMaintainer {
private final CuratorDb curator;
private final SecretStore secretStore;
private final EndpointCertificateProvider endpointCertificateProvider;
+ final Comparator<EligibleJob> oldestFirst = Comparator.comparing(e -> e.deployment.at());
@Inject
public EndpointCertificateMaintainer(Controller controller, Duration interval) {
@@ -92,11 +95,14 @@ public class EndpointCertificateMaintainer extends ControllerMaintainer {
}));
}
+ record EligibleJob(Deployment deployment, ApplicationId applicationId, JobType job) {}
/**
- * If it's been four days since the cert has been refreshed, re-trigger all prod deployment jobs.
+ * If it's been four days since the cert has been refreshed, re-trigger prod deployment jobs (one at a time).
*/
private void deployRefreshedCertificates() {
var now = clock.instant();
+ var eligibleJobs = new ArrayList<EligibleJob>();
+
curator.readAllEndpointCertificateMetadata().forEach((applicationId, endpointCertificateMetadata) ->
endpointCertificateMetadata.lastRefreshed().ifPresent(lastRefreshTime -> {
Instant refreshTime = Instant.ofEpochSecond(lastRefreshTime);
@@ -105,13 +111,19 @@ public class EndpointCertificateMaintainer extends ControllerMaintainer {
.ifPresent(instance -> instance.productionDeployments().forEach((zone, deployment) -> {
if (deployment.at().isBefore(refreshTime)) {
JobType job = JobType.deploymentTo(zone);
- deploymentTrigger.reTrigger(applicationId, job, "re-triggered by EndpointCertificateMaintainer");
- log.info("Re-triggering deployment job " + job.jobName() + " for instance " +
- applicationId.serializedForm() + " to roll out refreshed endpoint certificate");
+ eligibleJobs.add(new EligibleJob(deployment, applicationId, job));
}
}));
}
}));
+
+ eligibleJobs.stream()
+ .min(oldestFirst)
+ .ifPresent(e -> {
+ deploymentTrigger.reTrigger(e.applicationId, e.job, "re-triggered by EndpointCertificateMaintainer");
+ log.info("Re-triggering deployment job " + e.job.jobName() + " for instance " +
+ e.applicationId.serializedForm() + " to roll out refreshed endpoint certificate");
+ });
}
private OptionalInt latestVersionInSecretStore(EndpointCertificateMetadata originalCertificateMetadata) {
@@ -156,8 +168,8 @@ public class EndpointCertificateMaintainer extends ControllerMaintainer {
List<EndpointCertificateRequestMetadata> endpointCertificateMetadata = endpointCertificateProvider.listCertificates();
Map<ApplicationId, EndpointCertificateMetadata> storedEndpointCertificateMetadata = curator.readAllEndpointCertificateMetadata();
- List<String> leafRequestIds = storedEndpointCertificateMetadata.values().stream().flatMap(m -> m.leafRequestId().stream()).collect(Collectors.toList());
- List<String> rootRequestIds = storedEndpointCertificateMetadata.values().stream().map(EndpointCertificateMetadata::rootRequestId).collect(Collectors.toList());
+ List<String> leafRequestIds = storedEndpointCertificateMetadata.values().stream().flatMap(m -> m.leafRequestId().stream()).toList();
+ List<String> rootRequestIds = storedEndpointCertificateMetadata.values().stream().map(EndpointCertificateMetadata::rootRequestId).toList();
for (var providerCertificateMetadata : endpointCertificateMetadata) {
if (!rootRequestIds.contains(providerCertificateMetadata.requestId()) && !leafRequestIds.contains(providerCertificateMetadata.requestId())) {
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/restapi/billing/BillingApiHandlerV2.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/restapi/billing/BillingApiHandlerV2.java
index 44a8b636ae0..4532e0c2c18 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/restapi/billing/BillingApiHandlerV2.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/restapi/billing/BillingApiHandlerV2.java
@@ -14,6 +14,7 @@ import com.yahoo.slime.Cursor;
import com.yahoo.slime.Inspector;
import com.yahoo.slime.Slime;
import com.yahoo.slime.Type;
+import com.yahoo.vespa.hosted.controller.Application;
import com.yahoo.vespa.hosted.controller.ApplicationController;
import com.yahoo.vespa.hosted.controller.Controller;
import com.yahoo.vespa.hosted.controller.TenantController;
@@ -23,8 +24,10 @@ import com.yahoo.vespa.hosted.controller.api.integration.billing.CollectionMetho
import com.yahoo.vespa.hosted.controller.api.integration.billing.Plan;
import com.yahoo.vespa.hosted.controller.api.integration.billing.PlanId;
import com.yahoo.vespa.hosted.controller.api.integration.billing.PlanRegistry;
+import com.yahoo.vespa.hosted.controller.api.integration.billing.Quota;
import com.yahoo.vespa.hosted.controller.api.role.Role;
import com.yahoo.vespa.hosted.controller.api.role.SecurityContext;
+import com.yahoo.vespa.hosted.controller.application.QuotaUsage;
import com.yahoo.vespa.hosted.controller.tenant.CloudTenant;
import com.yahoo.vespa.hosted.controller.tenant.Tenant;
@@ -200,11 +203,13 @@ public class BillingApiHandlerV2 extends RestApiRequestHandler<BillingApiHandler
var response = new Slime();
var tenantsResponse = response.setObject().setArray("tenants");
+
tenants.asList().stream().sorted(Comparator.comparing(Tenant::name)).forEach(tenant -> {
var usage = Optional.ofNullable(usagePerTenant.get(tenant.name()));
var tenantResponse = tenantsResponse.addObject();
tenantResponse.setString("tenant", tenant.name().value());
toSlime(tenantResponse.setObject("plan"), planFor(tenant.name()));
+ toSlime(tenantResponse.setObject("quota"), billing.getQuota(tenant.name()));
tenantResponse.setString("collection", billing.getCollectionMethod(tenant.name()).name());
tenantResponse.setString("lastBill", usage.map(Bill::getStartDate).map(DateTimeFormatter.ISO_DATE::format).orElse(null));
tenantResponse.setString("unbilled", usage.map(Bill::sum).map(BigDecimal::toPlainString).orElse("0.00"));
@@ -357,6 +362,10 @@ public class BillingApiHandlerV2 extends RestApiRequestHandler<BillingApiHandler
cursor.setString("name", plan.displayName());
}
+ private void toSlime(Cursor cursor, Quota quota) {
+ cursor.setDouble("budget", quota.budget().map(BigDecimal::doubleValue).orElse(-1.0));
+ }
+
private Plan planFor(TenantName tenant) {
var planId = billing.getPlan(tenant);
return planRegistry.plan(planId)
diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/EndpointCertificateMaintainerTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/EndpointCertificateMaintainerTest.java
index c103894b1a3..47a1b44d196 100644
--- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/EndpointCertificateMaintainerTest.java
+++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/EndpointCertificateMaintainerTest.java
@@ -1,19 +1,32 @@
// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.vespa.hosted.controller.maintenance;
+import com.yahoo.component.Version;
import com.yahoo.config.provision.ApplicationId;
+import com.yahoo.config.provision.zone.ZoneId;
import com.yahoo.vespa.hosted.controller.ControllerTester;
import com.yahoo.vespa.hosted.controller.api.integration.certificates.EndpointCertificateMetadata;
import com.yahoo.vespa.hosted.controller.api.integration.certificates.EndpointCertificateMock;
+import com.yahoo.vespa.hosted.controller.api.integration.deployment.JobType;
+import com.yahoo.vespa.hosted.controller.api.integration.deployment.RevisionId;
+import com.yahoo.vespa.hosted.controller.application.Deployment;
+import com.yahoo.vespa.hosted.controller.application.DeploymentActivity;
+import com.yahoo.vespa.hosted.controller.application.DeploymentMetrics;
+import com.yahoo.vespa.hosted.controller.application.QuotaUsage;
import com.yahoo.vespa.hosted.controller.deployment.ApplicationPackageBuilder;
import com.yahoo.vespa.hosted.controller.deployment.DeploymentContext;
import com.yahoo.vespa.hosted.controller.deployment.DeploymentTester;
import com.yahoo.vespa.hosted.controller.integration.SecretStoreMock;
+import org.jetbrains.annotations.NotNull;
import org.junit.jupiter.api.Test;
import java.time.Duration;
+import java.time.Instant;
+import java.time.temporal.ChronoUnit;
import java.util.List;
import java.util.Optional;
+import java.util.OptionalDouble;
+import java.util.stream.Stream;
import static com.yahoo.vespa.hosted.controller.deployment.DeploymentContext.productionUsWest1;
import static com.yahoo.vespa.hosted.controller.deployment.DeploymentContext.stagingTest;
@@ -121,6 +134,21 @@ public class EndpointCertificateMaintainerTest {
}
@Test
+ void testEligibleSorting() {
+ EndpointCertificateMaintainer.EligibleJob oldestDeployment = makeDeploymentAtAge(5);
+ assertEquals(
+ oldestDeployment,
+ Stream.of(makeDeploymentAtAge(2), oldestDeployment, makeDeploymentAtAge(4)).min(maintainer.oldestFirst).get());
+ }
+
+ @NotNull
+ private EndpointCertificateMaintainer.EligibleJob makeDeploymentAtAge(int ageInDays) {
+ var deployment = new Deployment(ZoneId.defaultId(), RevisionId.forProduction(1), Version.emptyVersion,
+ Instant.now().minus(ageInDays, ChronoUnit.DAYS), DeploymentMetrics.none, DeploymentActivity.none, QuotaUsage.none, OptionalDouble.empty());
+ return new EndpointCertificateMaintainer.EligibleJob(deployment, ApplicationId.defaultId(), JobType.prod("somewhere"));
+ }
+
+ @Test
void unmaintained_cert_is_deleted() {
EndpointCertificateMock endpointCertificateProvider = (EndpointCertificateMock) tester.controller().serviceRegistry().endpointCertificateProvider();
diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/billing/BillingApiHandlerV2Test.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/billing/BillingApiHandlerV2Test.java
index 7c3f0e51b92..c62a9f1399f 100644
--- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/billing/BillingApiHandlerV2Test.java
+++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/billing/BillingApiHandlerV2Test.java
@@ -124,7 +124,7 @@ public class BillingApiHandlerV2Test extends ControllerContainerCloudTest {
var accountantRequest = request("/billing/v2/accountant").roles(Role.hostedAccountant());
tester.assertResponse(accountantRequest, """
- {"tenants":[{"tenant":"tenant1","plan":{"id":"trial","name":"Free Trial - for testing purposes"},"collection":"AUTO","lastBill":null,"unbilled":"0.00"}]}""");
+ {"tenants":[{"tenant":"tenant1","plan":{"id":"trial","name":"Free Trial - for testing purposes"},"quota":{"budget":-1.0},"collection":"AUTO","lastBill":null,"unbilled":"0.00"}]}""");
}
@Test
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/configserver/ConnectionException.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/configserver/ConnectionException.java
index da93083a6de..c78d60e9950 100644
--- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/configserver/ConnectionException.java
+++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/configserver/ConnectionException.java
@@ -15,7 +15,7 @@ import java.net.SocketTimeoutException;
public class ConnectionException extends ConvergenceException {
private ConnectionException(String message, Throwable cause) {
- super(message, cause);
+ super(message, cause, true);
}
/**
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/configserver/HttpException.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/configserver/HttpException.java
index 4de7582fb7b..53e15b6c647 100644
--- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/configserver/HttpException.java
+++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/configserver/HttpException.java
@@ -14,20 +14,15 @@ public class HttpException extends ConvergenceException {
private final boolean isRetryable;
private HttpException(int statusCode, String message, boolean isRetryable) {
- super("HTTP status code " + statusCode + ": " + message);
+ super("HTTP status code " + statusCode + ": " + message, null, !isRetryable);
this.isRetryable = isRetryable;
}
private HttpException(Response.Status status, String message, boolean isRetryable) {
- super(status.toString() + " (" + status.getStatusCode() + "): " + message);
+ super(status.toString() + " (" + status.getStatusCode() + "): " + message, null, !isRetryable);
this.isRetryable = isRetryable;
}
- private HttpException(String message) {
- super(message);
- this.isRetryable = false;
- }
-
boolean isRetryable() {
return isRetryable;
}
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/configserver/noderepository/NodeRepositoryException.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/configserver/noderepository/NodeRepositoryException.java
index a0b0499bb1e..636813a2169 100644
--- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/configserver/noderepository/NodeRepositoryException.java
+++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/configserver/noderepository/NodeRepositoryException.java
@@ -5,10 +5,6 @@ import com.yahoo.vespa.hosted.node.admin.nodeadmin.ConvergenceException;
public class NodeRepositoryException extends ConvergenceException {
public NodeRepositoryException(String message) {
- super(message);
- }
-
- public NodeRepositoryException(String message, Exception exception) {
- super(message, exception);
+ super(message, null, true);
}
}
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/configserver/orchestrator/OrchestratorException.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/configserver/orchestrator/OrchestratorException.java
index 9b8a749c33d..917b65b606c 100644
--- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/configserver/orchestrator/OrchestratorException.java
+++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/configserver/orchestrator/OrchestratorException.java
@@ -5,7 +5,12 @@ import com.yahoo.vespa.hosted.node.admin.nodeadmin.ConvergenceException;
@SuppressWarnings("serial")
public class OrchestratorException extends ConvergenceException {
+ /** Creates a transient convergence exception. */
public OrchestratorException(String message) {
- super(message);
+ this(message, true);
+ }
+
+ protected OrchestratorException(String message, boolean isError) {
+ super(message, null, isError);
}
}
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/configserver/orchestrator/OrchestratorImpl.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/configserver/orchestrator/OrchestratorImpl.java
index 281d43e0afc..858bce27ed8 100644
--- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/configserver/orchestrator/OrchestratorImpl.java
+++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/configserver/orchestrator/OrchestratorImpl.java
@@ -59,7 +59,7 @@ public class OrchestratorImpl implements Orchestrator {
} catch (HttpException e) {
throw new OrchestratorException("Failed to suspend " + hostName + ": " + e.toString());
} catch (ConnectionException e) {
- throw new ConvergenceException("Failed to suspend " + hostName + ": " + e.getMessage());
+ throw ConvergenceException.ofTransient("Failed to suspend " + hostName + ": " + e.getMessage());
} catch (RuntimeException e) {
throw new RuntimeException("Got error on suspend", e);
}
@@ -105,7 +105,7 @@ public class OrchestratorImpl implements Orchestrator {
} catch (HttpException e) {
throw new OrchestratorException("Failed to batch suspend for " + parentHostName + ": " + e.toString());
} catch (ConnectionException e) {
- throw new ConvergenceException("Failed to batch suspend for " + parentHostName + ": " + e.getMessage());
+ throw ConvergenceException.ofTransient("Failed to batch suspend for " + parentHostName + ": " + e.getMessage());
} catch (RuntimeException e) {
throw new RuntimeException("Got error on batch suspend for " + parentHostName + ", with nodes " + hostNames, e);
}
@@ -126,7 +126,7 @@ public class OrchestratorImpl implements Orchestrator {
} catch (HttpException e) {
throw new OrchestratorException("Failed to resume " + hostName + ": " + e.toString());
} catch (ConnectionException e) {
- throw new ConvergenceException("Failed to resume " + hostName + ": " + e.getMessage());
+ throw ConvergenceException.ofTransient("Failed to resume " + hostName + ": " + e.getMessage());
} catch (RuntimeException e) {
throw new RuntimeException("Got error on resume", e);
}
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/configserver/orchestrator/OrchestratorNotFoundException.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/configserver/orchestrator/OrchestratorNotFoundException.java
index 3b016aac03f..a6a54807e56 100644
--- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/configserver/orchestrator/OrchestratorNotFoundException.java
+++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/configserver/orchestrator/OrchestratorNotFoundException.java
@@ -4,6 +4,6 @@ package com.yahoo.vespa.hosted.node.admin.configserver.orchestrator;
@SuppressWarnings("serial")
public class OrchestratorNotFoundException extends OrchestratorException {
public OrchestratorNotFoundException(String message) {
- super(message);
+ super(message, true);
}
}
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/StorageMaintainer.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/StorageMaintainer.java
index c893f7ffee4..062a7ce018d 100644
--- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/StorageMaintainer.java
+++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/StorageMaintainer.java
@@ -120,7 +120,7 @@ public class StorageMaintainer {
String[] results = output.split("\t");
if (results.length != 2) {
- throw new ConvergenceException("Result from disk usage command not as expected: " + output);
+ throw ConvergenceException.ofError("Result from disk usage command not as expected: " + output);
}
return DiskSize.of(Long.parseLong(results[0]), DiskSize.Unit.kiB);
@@ -226,11 +226,11 @@ public class StorageMaintainer {
String output = uncheck(() -> Files.readAllLines(Paths.get("/proc/cpuinfo")).stream()
.filter(line -> line.startsWith("microcode"))
.findFirst()
- .orElseThrow(() -> new ConvergenceException("No microcode information found in /proc/cpuinfo")));
+ .orElseThrow(() -> ConvergenceException.ofError("No microcode information found in /proc/cpuinfo")));
String[] results = output.split(":");
if (results.length != 2) {
- throw new ConvergenceException("Result from detect microcode command not as expected: " + output);
+ throw ConvergenceException.ofError("Result from detect microcode command not as expected: " + output);
}
return results[1].trim();
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/coredump/CoreCollector.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/coredump/CoreCollector.java
index 45dbfd07209..0933f22dee3 100644
--- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/coredump/CoreCollector.java
+++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/coredump/CoreCollector.java
@@ -51,7 +51,7 @@ public class CoreCollector {
Matcher matcher = CORE_GENERATOR_PATH_PATTERN.matcher(result.getOutput());
if (! matcher.find()) {
- throw new ConvergenceException(String.format("Failed to extract binary path from GDB, result: %s, command: %s",
+ throw ConvergenceException.ofError(String.format("Failed to extract binary path from GDB, result: %s, command: %s",
asString(result), Arrays.toString(wrappedCommand)));
}
return matcher.group("path").split(" ")[0];
@@ -62,7 +62,7 @@ public class CoreCollector {
try {
CommandResult result = container.executeCommandInContainer(context, context.users().root(), command);
if (result.getExitCode() != 0) {
- throw new ConvergenceException("file command failed with " + asString(result));
+ throw ConvergenceException.ofError("file command failed with " + asString(result));
}
Matcher execfnMatcher = EXECFN_PATH_PATTERN.matcher(result.getOutput());
@@ -89,7 +89,7 @@ public class CoreCollector {
CommandResult result = container.executeCommandInContainer(context, context.users().root(), command);
if (result.getExitCode() != 0)
- throw new ConvergenceException("Failed to read backtrace " + asString(result) + ", Command: " + Arrays.toString(command));
+ throw ConvergenceException.ofError("Failed to read backtrace " + asString(result) + ", Command: " + Arrays.toString(command));
return List.of(result.getOutput().split("\n"));
}
@@ -99,7 +99,7 @@ public class CoreCollector {
CommandResult result = container.executeCommandInContainer(context, context.users().root(), command);
if (result.getExitCode() != 0)
- throw new ConvergenceException("Failed to read jstack " + asString(result) + ", Command: " + Arrays.toString(command));
+ throw ConvergenceException.ofError("Failed to read jstack " + asString(result) + ", Command: " + Arrays.toString(command));
return List.of(result.getOutput().split("\n"));
}
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/coredump/CoredumpHandler.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/coredump/CoredumpHandler.java
index 6295765a95f..ece494a34d7 100644
--- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/coredump/CoredumpHandler.java
+++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/coredump/CoredumpHandler.java
@@ -95,7 +95,7 @@ public class CoredumpHandler {
.map(FileFinder.FileAttributes::filename)
.toList();
if (!pendingCores.isEmpty())
- throw new ConvergenceException(String.format("Cannot process %s coredumps: Still being written",
+ throw ConvergenceException.ofError(String.format("Cannot process %s coredumps: Still being written",
pendingCores.size() < 5 ? pendingCores : pendingCores.size()));
}
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/ConvergenceException.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/ConvergenceException.java
index 16a5eb022ad..c1b86fc7fe2 100644
--- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/ConvergenceException.java
+++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/ConvergenceException.java
@@ -8,11 +8,34 @@ package com.yahoo.vespa.hosted.node.admin.nodeadmin;
*/
@SuppressWarnings("serial")
public class ConvergenceException extends RuntimeException {
- public ConvergenceException(String message) {
- super(message);
+ /** Create an exception that will NOT increment the monitored unhandled_exceptions metric. */
+ public static ConvergenceException ofTransient(String message) { return ofTransient(message, null); }
+
+ /** Create an exception that will NOT increment the monitored unhandled_exceptions metric. */
+ public static ConvergenceException ofTransient(String message, Throwable t) { return new ConvergenceException(message, t, false); }
+
+ /** Create an exception that increments the monitored unhandled_exceptions metric. */
+ public static ConvergenceException ofError(String message) { return ofError(message, null); }
+
+ /** Create an exception that increments the monitored unhandled_exceptions metric. */
+ public static ConvergenceException ofError(String message, Throwable t) { return new ConvergenceException(message, t, true); }
+
+ /** Create an exception with the same transient/error as the cause. */
+ public static ConvergenceException ofNested(String message, ConvergenceException cause) { return new ConvergenceException(message, cause, cause.isError); }
+
+ private final boolean isError;
+
+ /** @param isError whether the exception should increment the monitored unhandled_exception metric. */
+ protected ConvergenceException(String message, boolean isError) {
+ this(message, null, isError);
}
- public ConvergenceException(String message, Throwable t) {
+ /** @param isError whether the exception should increment the monitored unhandled_exception metric. */
+ protected ConvergenceException(String message, Throwable t, boolean isError) {
super(message, t);
+ this.isError = isError;
}
+
+ /** Whether the exception signals an error someone may want to look at, or whether it is expected to be transient (false). */
+ public boolean isError() { return isError; }
}
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdater.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdater.java
index dda404797d9..314844dc6eb 100644
--- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdater.java
+++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdater.java
@@ -110,7 +110,7 @@ public class NodeAdminStateUpdater {
if (hostIsActiveInNR) orchestrator.resume(hostHostname);
- throw new ConvergenceException("Timed out trying to freeze all nodes: will force an unfrozen tick");
+ throw ConvergenceException.ofTransient("Timed out trying to freeze all nodes: will force an unfrozen tick");
}
boolean wantFrozen = wantedState != RESUMED;
@@ -118,7 +118,7 @@ public class NodeAdminStateUpdater {
currentState = TRANSITIONING;
if (!nodeAdmin.setFrozen(wantFrozen))
- throw new ConvergenceException("NodeAdmin is not yet " + (wantFrozen ? "frozen" : "unfrozen"));
+ throw ConvergenceException.ofTransient("NodeAdmin is not yet " + (wantFrozen ? "frozen" : "unfrozen"));
switch (wantedState) {
case RESUMED:
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java
index 09bc58bdaa2..6408132fe0b 100644
--- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java
+++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java
@@ -244,7 +244,7 @@ public class NodeAgentImpl implements NodeAgent {
hasResumedNode = false;
context.log(logger, "Container successfully started, new containerState is " + containerState);
return containerOperations.getContainer(context).orElseThrow(() ->
- new ConvergenceException("Did not find container that was just started"));
+ ConvergenceException.ofError("Did not find container that was just started"));
}
private Optional<Container> removeContainerIfNeededUpdateContainerState(
@@ -399,7 +399,7 @@ public class NodeAgentImpl implements NodeAgent {
// Only update CPU resources
containerOperations.updateContainer(context, existingContainer.id(), wantedContainerResources.withMemoryBytes(existingContainer.resources().memoryBytes()));
return containerOperations.getContainer(context).orElseThrow(() ->
- new ConvergenceException("Did not find container that was just updated"));
+ ConvergenceException.ofError("Did not find container that was just updated"));
}
private ContainerResources getContainerResources(NodeAgentContext context) {
@@ -435,6 +435,8 @@ public class NodeAgentImpl implements NodeAgent {
context.log(logger, Level.INFO, "Converged");
} catch (ConvergenceException e) {
context.log(logger, e.getMessage());
+ if (e.isError())
+ numberOfUnhandledException++;
} catch (Throwable e) {
numberOfUnhandledException++;
context.log(logger, Level.SEVERE, "Unhandled exception, ignoring", e);
@@ -501,7 +503,7 @@ public class NodeAgentImpl implements NodeAgent {
Duration timeLeft = Duration.between(clock.instant(), firstSuccessfulHealthCheckInstant.get().plus(warmUpDuration(context)));
if (!container.get().resources().equalsCpu(getContainerResources(context)))
- throw new ConvergenceException("Refusing to resume until warm up period ends (" +
+ throw ConvergenceException.ofTransient("Refusing to resume until warm up period ends (" +
(timeLeft.isNegative() ? "next tick" : "in " + timeLeft) + ")");
}
serviceDumper.processServiceDumpRequest(context);
@@ -536,7 +538,7 @@ public class NodeAgentImpl implements NodeAgent {
nodeRepository.setNodeState(context.hostname().value(), NodeState.ready);
break;
default:
- throw new ConvergenceException("UNKNOWN STATE " + node.state().name());
+ throw ConvergenceException.ofError("UNKNOWN STATE " + node.state().name());
}
}
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/task/util/network/IPAddresses.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/task/util/network/IPAddresses.java
index 07e172c5117..148d80c9803 100644
--- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/task/util/network/IPAddresses.java
+++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/task/util/network/IPAddresses.java
@@ -67,7 +67,7 @@ public interface IPAddresses {
if (ipv6addresses.size() <= 1) return ipv6addresses.stream().findFirst();
String addresses = ipv6addresses.stream().map(InetAddresses::toAddrString).collect(Collectors.joining(","));
- throw new ConvergenceException(
+ throw ConvergenceException.ofError(
String.format(
"Multiple IPv6 addresses found: %s. Perhaps a missing DNS entry or multiple AAAA records in DNS?",
addresses));
@@ -103,7 +103,7 @@ public interface IPAddresses {
if (siteLocalIPv4Addresses.size() == 1) return Optional.of(siteLocalIPv4Addresses.get(0));
String addresses = ipv4Addresses.stream().map(InetAddresses::toAddrString).collect(Collectors.joining(","));
- throw new ConvergenceException(
+ throw ConvergenceException.ofError(
String.format(
"Multiple IPv4 addresses found: %s. Perhaps a missing DNS entry or multiple A records in DNS?",
addresses));
diff --git a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java
index b13a3a184e5..e4c2f595164 100644
--- a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java
+++ b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java
@@ -334,7 +334,7 @@ public class NodeAgentImplTest {
when(nodeRepository.getOptionalNode(hostName)).thenReturn(Optional.of(node));
when(containerOperations.pullImageAsyncIfNeeded(any(), eq(dockerImage), any())).thenReturn(false);
- doThrow(new ConvergenceException("Connection refused")).doNothing()
+ doThrow(ConvergenceException.ofTransient("Connection refused")).doNothing()
.when(healthChecker).verifyHealth(eq(context));
try {
@@ -640,7 +640,7 @@ public class NodeAgentImplTest {
InOrder inOrder = inOrder(orchestrator, containerOperations);
- ConvergenceException healthCheckException = new ConvergenceException("Not yet up");
+ ConvergenceException healthCheckException = ConvergenceException.ofTransient("Not yet up");
doThrow(healthCheckException).when(healthChecker).verifyHealth(any());
for (int i = 0; i < 3; i++) {
try {