diff options
author | Erlend <erlendniko@hotmail.com> | 2022-08-03 11:28:48 +0200 |
---|---|---|
committer | Erlend <erlendniko@hotmail.com> | 2022-08-03 11:28:48 +0200 |
commit | 2cc3d2edc707c5c5043ffc2654659afbe25f9130 (patch) | |
tree | 50afa622d2864b0160696c4d813d0edc811637f5 | |
parent | 3cdafb828b28283e03c7fb1299165eba31e74675 (diff) | |
parent | 8af644212e91df05d2a33540ebb4e90b149b1624 (diff) |
Merge remote-tracking branch 'upstream/master'
19 files changed, 118 insertions, 48 deletions
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ControllerMaintenance.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ControllerMaintenance.java index 9793cded918..a259ed2fdef 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ControllerMaintenance.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ControllerMaintenance.java @@ -156,7 +156,7 @@ public class ControllerMaintenance extends AbstractComponent { this.containerImageExpirer = duration(12, HOURS); this.hostInfoUpdater = duration(12, HOURS); this.reindexingTriggerer = duration(1, HOURS); - this.endpointCertificateMaintainer = duration(12, HOURS); + this.endpointCertificateMaintainer = duration(1, HOURS); this.trafficFractionUpdater = duration(5, MINUTES); this.archiveUriUpdater = duration(5, MINUTES); this.archiveAccessMaintainer = duration(10, MINUTES); diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/EndpointCertificateMaintainer.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/EndpointCertificateMaintainer.java index f3256237284..2e2680cd34a 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/EndpointCertificateMaintainer.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/EndpointCertificateMaintainer.java @@ -7,7 +7,6 @@ import com.yahoo.config.provision.ApplicationId; import com.yahoo.container.jdisc.secretstore.SecretNotFoundException; import com.yahoo.container.jdisc.secretstore.SecretStore; import com.yahoo.transaction.Mutex; -import com.yahoo.vespa.curator.Lock; import com.yahoo.vespa.hosted.controller.Controller; import com.yahoo.vespa.hosted.controller.Instance; import com.yahoo.vespa.hosted.controller.api.integration.certificates.EndpointCertificateDetails; @@ -15,6 +14,7 @@ import com.yahoo.vespa.hosted.controller.api.integration.certificates.EndpointCe import com.yahoo.vespa.hosted.controller.api.integration.certificates.EndpointCertificateProvider; import com.yahoo.vespa.hosted.controller.api.integration.certificates.EndpointCertificateRequestMetadata; import com.yahoo.vespa.hosted.controller.api.integration.deployment.JobType; +import com.yahoo.vespa.hosted.controller.application.Deployment; import com.yahoo.vespa.hosted.controller.application.TenantAndApplicationId; import com.yahoo.vespa.hosted.controller.deployment.DeploymentTrigger; import com.yahoo.vespa.hosted.controller.persistence.CuratorDb; @@ -23,6 +23,8 @@ import java.time.Clock; import java.time.Duration; import java.time.Instant; import java.time.temporal.ChronoUnit; +import java.util.ArrayList; +import java.util.Comparator; import java.util.HashSet; import java.util.List; import java.util.Map; @@ -48,6 +50,7 @@ public class EndpointCertificateMaintainer extends ControllerMaintainer { private final CuratorDb curator; private final SecretStore secretStore; private final EndpointCertificateProvider endpointCertificateProvider; + final Comparator<EligibleJob> oldestFirst = Comparator.comparing(e -> e.deployment.at()); @Inject public EndpointCertificateMaintainer(Controller controller, Duration interval) { @@ -92,11 +95,14 @@ public class EndpointCertificateMaintainer extends ControllerMaintainer { })); } + record EligibleJob(Deployment deployment, ApplicationId applicationId, JobType job) {} /** - * If it's been four days since the cert has been refreshed, re-trigger all prod deployment jobs. + * If it's been four days since the cert has been refreshed, re-trigger prod deployment jobs (one at a time). */ private void deployRefreshedCertificates() { var now = clock.instant(); + var eligibleJobs = new ArrayList<EligibleJob>(); + curator.readAllEndpointCertificateMetadata().forEach((applicationId, endpointCertificateMetadata) -> endpointCertificateMetadata.lastRefreshed().ifPresent(lastRefreshTime -> { Instant refreshTime = Instant.ofEpochSecond(lastRefreshTime); @@ -105,13 +111,19 @@ public class EndpointCertificateMaintainer extends ControllerMaintainer { .ifPresent(instance -> instance.productionDeployments().forEach((zone, deployment) -> { if (deployment.at().isBefore(refreshTime)) { JobType job = JobType.deploymentTo(zone); - deploymentTrigger.reTrigger(applicationId, job, "re-triggered by EndpointCertificateMaintainer"); - log.info("Re-triggering deployment job " + job.jobName() + " for instance " + - applicationId.serializedForm() + " to roll out refreshed endpoint certificate"); + eligibleJobs.add(new EligibleJob(deployment, applicationId, job)); } })); } })); + + eligibleJobs.stream() + .min(oldestFirst) + .ifPresent(e -> { + deploymentTrigger.reTrigger(e.applicationId, e.job, "re-triggered by EndpointCertificateMaintainer"); + log.info("Re-triggering deployment job " + e.job.jobName() + " for instance " + + e.applicationId.serializedForm() + " to roll out refreshed endpoint certificate"); + }); } private OptionalInt latestVersionInSecretStore(EndpointCertificateMetadata originalCertificateMetadata) { @@ -156,8 +168,8 @@ public class EndpointCertificateMaintainer extends ControllerMaintainer { List<EndpointCertificateRequestMetadata> endpointCertificateMetadata = endpointCertificateProvider.listCertificates(); Map<ApplicationId, EndpointCertificateMetadata> storedEndpointCertificateMetadata = curator.readAllEndpointCertificateMetadata(); - List<String> leafRequestIds = storedEndpointCertificateMetadata.values().stream().flatMap(m -> m.leafRequestId().stream()).collect(Collectors.toList()); - List<String> rootRequestIds = storedEndpointCertificateMetadata.values().stream().map(EndpointCertificateMetadata::rootRequestId).collect(Collectors.toList()); + List<String> leafRequestIds = storedEndpointCertificateMetadata.values().stream().flatMap(m -> m.leafRequestId().stream()).toList(); + List<String> rootRequestIds = storedEndpointCertificateMetadata.values().stream().map(EndpointCertificateMetadata::rootRequestId).toList(); for (var providerCertificateMetadata : endpointCertificateMetadata) { if (!rootRequestIds.contains(providerCertificateMetadata.requestId()) && !leafRequestIds.contains(providerCertificateMetadata.requestId())) { diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/restapi/billing/BillingApiHandlerV2.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/restapi/billing/BillingApiHandlerV2.java index 44a8b636ae0..4532e0c2c18 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/restapi/billing/BillingApiHandlerV2.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/restapi/billing/BillingApiHandlerV2.java @@ -14,6 +14,7 @@ import com.yahoo.slime.Cursor; import com.yahoo.slime.Inspector; import com.yahoo.slime.Slime; import com.yahoo.slime.Type; +import com.yahoo.vespa.hosted.controller.Application; import com.yahoo.vespa.hosted.controller.ApplicationController; import com.yahoo.vespa.hosted.controller.Controller; import com.yahoo.vespa.hosted.controller.TenantController; @@ -23,8 +24,10 @@ import com.yahoo.vespa.hosted.controller.api.integration.billing.CollectionMetho import com.yahoo.vespa.hosted.controller.api.integration.billing.Plan; import com.yahoo.vespa.hosted.controller.api.integration.billing.PlanId; import com.yahoo.vespa.hosted.controller.api.integration.billing.PlanRegistry; +import com.yahoo.vespa.hosted.controller.api.integration.billing.Quota; import com.yahoo.vespa.hosted.controller.api.role.Role; import com.yahoo.vespa.hosted.controller.api.role.SecurityContext; +import com.yahoo.vespa.hosted.controller.application.QuotaUsage; import com.yahoo.vespa.hosted.controller.tenant.CloudTenant; import com.yahoo.vespa.hosted.controller.tenant.Tenant; @@ -200,11 +203,13 @@ public class BillingApiHandlerV2 extends RestApiRequestHandler<BillingApiHandler var response = new Slime(); var tenantsResponse = response.setObject().setArray("tenants"); + tenants.asList().stream().sorted(Comparator.comparing(Tenant::name)).forEach(tenant -> { var usage = Optional.ofNullable(usagePerTenant.get(tenant.name())); var tenantResponse = tenantsResponse.addObject(); tenantResponse.setString("tenant", tenant.name().value()); toSlime(tenantResponse.setObject("plan"), planFor(tenant.name())); + toSlime(tenantResponse.setObject("quota"), billing.getQuota(tenant.name())); tenantResponse.setString("collection", billing.getCollectionMethod(tenant.name()).name()); tenantResponse.setString("lastBill", usage.map(Bill::getStartDate).map(DateTimeFormatter.ISO_DATE::format).orElse(null)); tenantResponse.setString("unbilled", usage.map(Bill::sum).map(BigDecimal::toPlainString).orElse("0.00")); @@ -357,6 +362,10 @@ public class BillingApiHandlerV2 extends RestApiRequestHandler<BillingApiHandler cursor.setString("name", plan.displayName()); } + private void toSlime(Cursor cursor, Quota quota) { + cursor.setDouble("budget", quota.budget().map(BigDecimal::doubleValue).orElse(-1.0)); + } + private Plan planFor(TenantName tenant) { var planId = billing.getPlan(tenant); return planRegistry.plan(planId) diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/EndpointCertificateMaintainerTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/EndpointCertificateMaintainerTest.java index c103894b1a3..47a1b44d196 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/EndpointCertificateMaintainerTest.java +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/EndpointCertificateMaintainerTest.java @@ -1,19 +1,32 @@ // Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.vespa.hosted.controller.maintenance; +import com.yahoo.component.Version; import com.yahoo.config.provision.ApplicationId; +import com.yahoo.config.provision.zone.ZoneId; import com.yahoo.vespa.hosted.controller.ControllerTester; import com.yahoo.vespa.hosted.controller.api.integration.certificates.EndpointCertificateMetadata; import com.yahoo.vespa.hosted.controller.api.integration.certificates.EndpointCertificateMock; +import com.yahoo.vespa.hosted.controller.api.integration.deployment.JobType; +import com.yahoo.vespa.hosted.controller.api.integration.deployment.RevisionId; +import com.yahoo.vespa.hosted.controller.application.Deployment; +import com.yahoo.vespa.hosted.controller.application.DeploymentActivity; +import com.yahoo.vespa.hosted.controller.application.DeploymentMetrics; +import com.yahoo.vespa.hosted.controller.application.QuotaUsage; import com.yahoo.vespa.hosted.controller.deployment.ApplicationPackageBuilder; import com.yahoo.vespa.hosted.controller.deployment.DeploymentContext; import com.yahoo.vespa.hosted.controller.deployment.DeploymentTester; import com.yahoo.vespa.hosted.controller.integration.SecretStoreMock; +import org.jetbrains.annotations.NotNull; import org.junit.jupiter.api.Test; import java.time.Duration; +import java.time.Instant; +import java.time.temporal.ChronoUnit; import java.util.List; import java.util.Optional; +import java.util.OptionalDouble; +import java.util.stream.Stream; import static com.yahoo.vespa.hosted.controller.deployment.DeploymentContext.productionUsWest1; import static com.yahoo.vespa.hosted.controller.deployment.DeploymentContext.stagingTest; @@ -121,6 +134,21 @@ public class EndpointCertificateMaintainerTest { } @Test + void testEligibleSorting() { + EndpointCertificateMaintainer.EligibleJob oldestDeployment = makeDeploymentAtAge(5); + assertEquals( + oldestDeployment, + Stream.of(makeDeploymentAtAge(2), oldestDeployment, makeDeploymentAtAge(4)).min(maintainer.oldestFirst).get()); + } + + @NotNull + private EndpointCertificateMaintainer.EligibleJob makeDeploymentAtAge(int ageInDays) { + var deployment = new Deployment(ZoneId.defaultId(), RevisionId.forProduction(1), Version.emptyVersion, + Instant.now().minus(ageInDays, ChronoUnit.DAYS), DeploymentMetrics.none, DeploymentActivity.none, QuotaUsage.none, OptionalDouble.empty()); + return new EndpointCertificateMaintainer.EligibleJob(deployment, ApplicationId.defaultId(), JobType.prod("somewhere")); + } + + @Test void unmaintained_cert_is_deleted() { EndpointCertificateMock endpointCertificateProvider = (EndpointCertificateMock) tester.controller().serviceRegistry().endpointCertificateProvider(); diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/billing/BillingApiHandlerV2Test.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/billing/BillingApiHandlerV2Test.java index 7c3f0e51b92..c62a9f1399f 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/billing/BillingApiHandlerV2Test.java +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/billing/BillingApiHandlerV2Test.java @@ -124,7 +124,7 @@ public class BillingApiHandlerV2Test extends ControllerContainerCloudTest { var accountantRequest = request("/billing/v2/accountant").roles(Role.hostedAccountant()); tester.assertResponse(accountantRequest, """ - {"tenants":[{"tenant":"tenant1","plan":{"id":"trial","name":"Free Trial - for testing purposes"},"collection":"AUTO","lastBill":null,"unbilled":"0.00"}]}"""); + {"tenants":[{"tenant":"tenant1","plan":{"id":"trial","name":"Free Trial - for testing purposes"},"quota":{"budget":-1.0},"collection":"AUTO","lastBill":null,"unbilled":"0.00"}]}"""); } @Test diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/configserver/ConnectionException.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/configserver/ConnectionException.java index da93083a6de..c78d60e9950 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/configserver/ConnectionException.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/configserver/ConnectionException.java @@ -15,7 +15,7 @@ import java.net.SocketTimeoutException; public class ConnectionException extends ConvergenceException { private ConnectionException(String message, Throwable cause) { - super(message, cause); + super(message, cause, true); } /** diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/configserver/HttpException.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/configserver/HttpException.java index 4de7582fb7b..53e15b6c647 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/configserver/HttpException.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/configserver/HttpException.java @@ -14,20 +14,15 @@ public class HttpException extends ConvergenceException { private final boolean isRetryable; private HttpException(int statusCode, String message, boolean isRetryable) { - super("HTTP status code " + statusCode + ": " + message); + super("HTTP status code " + statusCode + ": " + message, null, !isRetryable); this.isRetryable = isRetryable; } private HttpException(Response.Status status, String message, boolean isRetryable) { - super(status.toString() + " (" + status.getStatusCode() + "): " + message); + super(status.toString() + " (" + status.getStatusCode() + "): " + message, null, !isRetryable); this.isRetryable = isRetryable; } - private HttpException(String message) { - super(message); - this.isRetryable = false; - } - boolean isRetryable() { return isRetryable; } diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/configserver/noderepository/NodeRepositoryException.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/configserver/noderepository/NodeRepositoryException.java index a0b0499bb1e..636813a2169 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/configserver/noderepository/NodeRepositoryException.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/configserver/noderepository/NodeRepositoryException.java @@ -5,10 +5,6 @@ import com.yahoo.vespa.hosted.node.admin.nodeadmin.ConvergenceException; public class NodeRepositoryException extends ConvergenceException { public NodeRepositoryException(String message) { - super(message); - } - - public NodeRepositoryException(String message, Exception exception) { - super(message, exception); + super(message, null, true); } } diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/configserver/orchestrator/OrchestratorException.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/configserver/orchestrator/OrchestratorException.java index 9b8a749c33d..917b65b606c 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/configserver/orchestrator/OrchestratorException.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/configserver/orchestrator/OrchestratorException.java @@ -5,7 +5,12 @@ import com.yahoo.vespa.hosted.node.admin.nodeadmin.ConvergenceException; @SuppressWarnings("serial") public class OrchestratorException extends ConvergenceException { + /** Creates a transient convergence exception. */ public OrchestratorException(String message) { - super(message); + this(message, true); + } + + protected OrchestratorException(String message, boolean isError) { + super(message, null, isError); } } diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/configserver/orchestrator/OrchestratorImpl.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/configserver/orchestrator/OrchestratorImpl.java index 281d43e0afc..858bce27ed8 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/configserver/orchestrator/OrchestratorImpl.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/configserver/orchestrator/OrchestratorImpl.java @@ -59,7 +59,7 @@ public class OrchestratorImpl implements Orchestrator { } catch (HttpException e) { throw new OrchestratorException("Failed to suspend " + hostName + ": " + e.toString()); } catch (ConnectionException e) { - throw new ConvergenceException("Failed to suspend " + hostName + ": " + e.getMessage()); + throw ConvergenceException.ofTransient("Failed to suspend " + hostName + ": " + e.getMessage()); } catch (RuntimeException e) { throw new RuntimeException("Got error on suspend", e); } @@ -105,7 +105,7 @@ public class OrchestratorImpl implements Orchestrator { } catch (HttpException e) { throw new OrchestratorException("Failed to batch suspend for " + parentHostName + ": " + e.toString()); } catch (ConnectionException e) { - throw new ConvergenceException("Failed to batch suspend for " + parentHostName + ": " + e.getMessage()); + throw ConvergenceException.ofTransient("Failed to batch suspend for " + parentHostName + ": " + e.getMessage()); } catch (RuntimeException e) { throw new RuntimeException("Got error on batch suspend for " + parentHostName + ", with nodes " + hostNames, e); } @@ -126,7 +126,7 @@ public class OrchestratorImpl implements Orchestrator { } catch (HttpException e) { throw new OrchestratorException("Failed to resume " + hostName + ": " + e.toString()); } catch (ConnectionException e) { - throw new ConvergenceException("Failed to resume " + hostName + ": " + e.getMessage()); + throw ConvergenceException.ofTransient("Failed to resume " + hostName + ": " + e.getMessage()); } catch (RuntimeException e) { throw new RuntimeException("Got error on resume", e); } diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/configserver/orchestrator/OrchestratorNotFoundException.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/configserver/orchestrator/OrchestratorNotFoundException.java index 3b016aac03f..a6a54807e56 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/configserver/orchestrator/OrchestratorNotFoundException.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/configserver/orchestrator/OrchestratorNotFoundException.java @@ -4,6 +4,6 @@ package com.yahoo.vespa.hosted.node.admin.configserver.orchestrator; @SuppressWarnings("serial") public class OrchestratorNotFoundException extends OrchestratorException { public OrchestratorNotFoundException(String message) { - super(message); + super(message, true); } } diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/StorageMaintainer.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/StorageMaintainer.java index c893f7ffee4..062a7ce018d 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/StorageMaintainer.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/StorageMaintainer.java @@ -120,7 +120,7 @@ public class StorageMaintainer { String[] results = output.split("\t"); if (results.length != 2) { - throw new ConvergenceException("Result from disk usage command not as expected: " + output); + throw ConvergenceException.ofError("Result from disk usage command not as expected: " + output); } return DiskSize.of(Long.parseLong(results[0]), DiskSize.Unit.kiB); @@ -226,11 +226,11 @@ public class StorageMaintainer { String output = uncheck(() -> Files.readAllLines(Paths.get("/proc/cpuinfo")).stream() .filter(line -> line.startsWith("microcode")) .findFirst() - .orElseThrow(() -> new ConvergenceException("No microcode information found in /proc/cpuinfo"))); + .orElseThrow(() -> ConvergenceException.ofError("No microcode information found in /proc/cpuinfo"))); String[] results = output.split(":"); if (results.length != 2) { - throw new ConvergenceException("Result from detect microcode command not as expected: " + output); + throw ConvergenceException.ofError("Result from detect microcode command not as expected: " + output); } return results[1].trim(); diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/coredump/CoreCollector.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/coredump/CoreCollector.java index 45dbfd07209..0933f22dee3 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/coredump/CoreCollector.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/coredump/CoreCollector.java @@ -51,7 +51,7 @@ public class CoreCollector { Matcher matcher = CORE_GENERATOR_PATH_PATTERN.matcher(result.getOutput()); if (! matcher.find()) { - throw new ConvergenceException(String.format("Failed to extract binary path from GDB, result: %s, command: %s", + throw ConvergenceException.ofError(String.format("Failed to extract binary path from GDB, result: %s, command: %s", asString(result), Arrays.toString(wrappedCommand))); } return matcher.group("path").split(" ")[0]; @@ -62,7 +62,7 @@ public class CoreCollector { try { CommandResult result = container.executeCommandInContainer(context, context.users().root(), command); if (result.getExitCode() != 0) { - throw new ConvergenceException("file command failed with " + asString(result)); + throw ConvergenceException.ofError("file command failed with " + asString(result)); } Matcher execfnMatcher = EXECFN_PATH_PATTERN.matcher(result.getOutput()); @@ -89,7 +89,7 @@ public class CoreCollector { CommandResult result = container.executeCommandInContainer(context, context.users().root(), command); if (result.getExitCode() != 0) - throw new ConvergenceException("Failed to read backtrace " + asString(result) + ", Command: " + Arrays.toString(command)); + throw ConvergenceException.ofError("Failed to read backtrace " + asString(result) + ", Command: " + Arrays.toString(command)); return List.of(result.getOutput().split("\n")); } @@ -99,7 +99,7 @@ public class CoreCollector { CommandResult result = container.executeCommandInContainer(context, context.users().root(), command); if (result.getExitCode() != 0) - throw new ConvergenceException("Failed to read jstack " + asString(result) + ", Command: " + Arrays.toString(command)); + throw ConvergenceException.ofError("Failed to read jstack " + asString(result) + ", Command: " + Arrays.toString(command)); return List.of(result.getOutput().split("\n")); } diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/coredump/CoredumpHandler.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/coredump/CoredumpHandler.java index 6295765a95f..ece494a34d7 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/coredump/CoredumpHandler.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/coredump/CoredumpHandler.java @@ -95,7 +95,7 @@ public class CoredumpHandler { .map(FileFinder.FileAttributes::filename) .toList(); if (!pendingCores.isEmpty()) - throw new ConvergenceException(String.format("Cannot process %s coredumps: Still being written", + throw ConvergenceException.ofError(String.format("Cannot process %s coredumps: Still being written", pendingCores.size() < 5 ? pendingCores : pendingCores.size())); } diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/ConvergenceException.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/ConvergenceException.java index 16a5eb022ad..c1b86fc7fe2 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/ConvergenceException.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/ConvergenceException.java @@ -8,11 +8,34 @@ package com.yahoo.vespa.hosted.node.admin.nodeadmin; */ @SuppressWarnings("serial") public class ConvergenceException extends RuntimeException { - public ConvergenceException(String message) { - super(message); + /** Create an exception that will NOT increment the monitored unhandled_exceptions metric. */ + public static ConvergenceException ofTransient(String message) { return ofTransient(message, null); } + + /** Create an exception that will NOT increment the monitored unhandled_exceptions metric. */ + public static ConvergenceException ofTransient(String message, Throwable t) { return new ConvergenceException(message, t, false); } + + /** Create an exception that increments the monitored unhandled_exceptions metric. */ + public static ConvergenceException ofError(String message) { return ofError(message, null); } + + /** Create an exception that increments the monitored unhandled_exceptions metric. */ + public static ConvergenceException ofError(String message, Throwable t) { return new ConvergenceException(message, t, true); } + + /** Create an exception with the same transient/error as the cause. */ + public static ConvergenceException ofNested(String message, ConvergenceException cause) { return new ConvergenceException(message, cause, cause.isError); } + + private final boolean isError; + + /** @param isError whether the exception should increment the monitored unhandled_exception metric. */ + protected ConvergenceException(String message, boolean isError) { + this(message, null, isError); } - public ConvergenceException(String message, Throwable t) { + /** @param isError whether the exception should increment the monitored unhandled_exception metric. */ + protected ConvergenceException(String message, Throwable t, boolean isError) { super(message, t); + this.isError = isError; } + + /** Whether the exception signals an error someone may want to look at, or whether it is expected to be transient (false). */ + public boolean isError() { return isError; } } diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdater.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdater.java index dda404797d9..314844dc6eb 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdater.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdater.java @@ -110,7 +110,7 @@ public class NodeAdminStateUpdater { if (hostIsActiveInNR) orchestrator.resume(hostHostname); - throw new ConvergenceException("Timed out trying to freeze all nodes: will force an unfrozen tick"); + throw ConvergenceException.ofTransient("Timed out trying to freeze all nodes: will force an unfrozen tick"); } boolean wantFrozen = wantedState != RESUMED; @@ -118,7 +118,7 @@ public class NodeAdminStateUpdater { currentState = TRANSITIONING; if (!nodeAdmin.setFrozen(wantFrozen)) - throw new ConvergenceException("NodeAdmin is not yet " + (wantFrozen ? "frozen" : "unfrozen")); + throw ConvergenceException.ofTransient("NodeAdmin is not yet " + (wantFrozen ? "frozen" : "unfrozen")); switch (wantedState) { case RESUMED: diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java index 09bc58bdaa2..6408132fe0b 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java @@ -244,7 +244,7 @@ public class NodeAgentImpl implements NodeAgent { hasResumedNode = false; context.log(logger, "Container successfully started, new containerState is " + containerState); return containerOperations.getContainer(context).orElseThrow(() -> - new ConvergenceException("Did not find container that was just started")); + ConvergenceException.ofError("Did not find container that was just started")); } private Optional<Container> removeContainerIfNeededUpdateContainerState( @@ -399,7 +399,7 @@ public class NodeAgentImpl implements NodeAgent { // Only update CPU resources containerOperations.updateContainer(context, existingContainer.id(), wantedContainerResources.withMemoryBytes(existingContainer.resources().memoryBytes())); return containerOperations.getContainer(context).orElseThrow(() -> - new ConvergenceException("Did not find container that was just updated")); + ConvergenceException.ofError("Did not find container that was just updated")); } private ContainerResources getContainerResources(NodeAgentContext context) { @@ -435,6 +435,8 @@ public class NodeAgentImpl implements NodeAgent { context.log(logger, Level.INFO, "Converged"); } catch (ConvergenceException e) { context.log(logger, e.getMessage()); + if (e.isError()) + numberOfUnhandledException++; } catch (Throwable e) { numberOfUnhandledException++; context.log(logger, Level.SEVERE, "Unhandled exception, ignoring", e); @@ -501,7 +503,7 @@ public class NodeAgentImpl implements NodeAgent { Duration timeLeft = Duration.between(clock.instant(), firstSuccessfulHealthCheckInstant.get().plus(warmUpDuration(context))); if (!container.get().resources().equalsCpu(getContainerResources(context))) - throw new ConvergenceException("Refusing to resume until warm up period ends (" + + throw ConvergenceException.ofTransient("Refusing to resume until warm up period ends (" + (timeLeft.isNegative() ? "next tick" : "in " + timeLeft) + ")"); } serviceDumper.processServiceDumpRequest(context); @@ -536,7 +538,7 @@ public class NodeAgentImpl implements NodeAgent { nodeRepository.setNodeState(context.hostname().value(), NodeState.ready); break; default: - throw new ConvergenceException("UNKNOWN STATE " + node.state().name()); + throw ConvergenceException.ofError("UNKNOWN STATE " + node.state().name()); } } diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/task/util/network/IPAddresses.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/task/util/network/IPAddresses.java index 07e172c5117..148d80c9803 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/task/util/network/IPAddresses.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/task/util/network/IPAddresses.java @@ -67,7 +67,7 @@ public interface IPAddresses { if (ipv6addresses.size() <= 1) return ipv6addresses.stream().findFirst(); String addresses = ipv6addresses.stream().map(InetAddresses::toAddrString).collect(Collectors.joining(",")); - throw new ConvergenceException( + throw ConvergenceException.ofError( String.format( "Multiple IPv6 addresses found: %s. Perhaps a missing DNS entry or multiple AAAA records in DNS?", addresses)); @@ -103,7 +103,7 @@ public interface IPAddresses { if (siteLocalIPv4Addresses.size() == 1) return Optional.of(siteLocalIPv4Addresses.get(0)); String addresses = ipv4Addresses.stream().map(InetAddresses::toAddrString).collect(Collectors.joining(",")); - throw new ConvergenceException( + throw ConvergenceException.ofError( String.format( "Multiple IPv4 addresses found: %s. Perhaps a missing DNS entry or multiple A records in DNS?", addresses)); diff --git a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java index b13a3a184e5..e4c2f595164 100644 --- a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java +++ b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java @@ -334,7 +334,7 @@ public class NodeAgentImplTest { when(nodeRepository.getOptionalNode(hostName)).thenReturn(Optional.of(node)); when(containerOperations.pullImageAsyncIfNeeded(any(), eq(dockerImage), any())).thenReturn(false); - doThrow(new ConvergenceException("Connection refused")).doNothing() + doThrow(ConvergenceException.ofTransient("Connection refused")).doNothing() .when(healthChecker).verifyHealth(eq(context)); try { @@ -640,7 +640,7 @@ public class NodeAgentImplTest { InOrder inOrder = inOrder(orchestrator, containerOperations); - ConvergenceException healthCheckException = new ConvergenceException("Not yet up"); + ConvergenceException healthCheckException = ConvergenceException.ofTransient("Not yet up"); doThrow(healthCheckException).when(healthChecker).verifyHealth(any()); for (int i = 0; i < 3; i++) { try { |