diff options
author | Harald Musum <musum@yahooinc.com> | 2023-05-02 12:57:30 +0200 |
---|---|---|
committer | Harald Musum <musum@yahooinc.com> | 2023-05-02 12:57:30 +0200 |
commit | ea7eb3243474738236bbdcadfc4ecccfa2b6dcea (patch) | |
tree | 6cb104cbe3a4660a4757b7e2bf53ecc4910145ed /configserver | |
parent | c6968d5c84407bc81b91ace17240668c4a222e2c (diff) |
Add some more details when config is not converging
Diffstat (limited to 'configserver')
3 files changed, 112 insertions, 25 deletions
diff --git a/configserver/src/main/java/com/yahoo/vespa/config/server/deploy/Deployment.java b/configserver/src/main/java/com/yahoo/vespa/config/server/deploy/Deployment.java index 66a5bc5a023..eee9483cd69 100644 --- a/configserver/src/main/java/com/yahoo/vespa/config/server/deploy/Deployment.java +++ b/configserver/src/main/java/com/yahoo/vespa/config/server/deploy/Deployment.java @@ -18,8 +18,6 @@ import com.yahoo.vespa.config.server.ApplicationRepository; import com.yahoo.vespa.config.server.ApplicationRepository.ActionTimer; import com.yahoo.vespa.config.server.ApplicationRepository.Activation; import com.yahoo.vespa.config.server.TimeoutBudget; -import com.yahoo.vespa.config.server.application.Application; -import com.yahoo.vespa.config.server.application.ConfigConvergenceChecker; import com.yahoo.vespa.config.server.application.ConfigNotConvergedException; import com.yahoo.vespa.config.server.configchange.ConfigChangeActions; import com.yahoo.vespa.config.server.configchange.ReindexActions; @@ -188,28 +186,44 @@ public class Deployment implements com.yahoo.config.provision.Deployment { private void waitForConfigToConverge(ApplicationId applicationId) { deployLogger.log(Level.INFO, "Wait for all services to use new config generation before restarting"); - while (true) { - try { - params.get().getTimeoutBudget().assertNotTimedOut( - () -> "Timeout exceeded while waiting for config convergence for " + applicationId); - } catch (UncheckedTimeoutException e) { - throw new ConfigNotConvergedException(e); - } + var convergenceChecker = applicationRepository.configConvergenceChecker(); + var app = applicationRepository.getActiveApplication(applicationId); - ConfigConvergenceChecker convergenceChecker = applicationRepository.configConvergenceChecker(); - Application app = applicationRepository.getActiveApplication(applicationId); - ServiceListResponse response = convergenceChecker.checkConvergenceUnlessDeferringChangesUntilRestart(app); + ServiceListResponse response = null; + while (timeLeft(applicationId, response)) { + response = convergenceChecker.checkConvergenceUnlessDeferringChangesUntilRestart(app); if (response.converged) { deployLogger.log(Level.INFO, "Services converged on new config generation " + response.currentGeneration); return; } else { - deployLogger.log(Level.INFO, "Services did not converge on new config generation " + - response.wantedGeneration + ", current generation: " + response.currentGeneration + ", will retry"); + deployLogger.log(Level.INFO, "Services that did not converge on new config generation " + + response.wantedGeneration + ": " + + servicesNotConvergedFormatted(response) + ". Will retry"); try { Thread.sleep(5_000); } catch (InterruptedException e) { /* ignore */ } } } } + private boolean timeLeft(ApplicationId applicationId, ServiceListResponse response) { + try { + params.get().getTimeoutBudget().assertNotTimedOut( + () -> "Timeout exceeded while waiting for config convergence for " + applicationId + + ", wanted generation " + response.wantedGeneration + ", these services had another generation: " + + servicesNotConvergedFormatted(response)); + } catch (UncheckedTimeoutException e) { + throw new ConfigNotConvergedException(e); + } + return true; + } + + private String servicesNotConvergedFormatted(ServiceListResponse response) { + return response.services().stream() + .filter(service -> service.currentGeneration != response.wantedGeneration) + .map(service -> service.serviceInfo.getHostName() + ":" + service.serviceInfo.getServiceName() + + " on generation " + service.currentGeneration) + .collect(Collectors.joining(",")); + } + private void storeReindexing(ApplicationId applicationId, long requiredSession) { applicationRepository.modifyReindexing(applicationId, reindexing -> { if (configChangeActions != null) diff --git a/configserver/src/test/java/com/yahoo/vespa/config/server/MockConfigConvergenceChecker.java b/configserver/src/test/java/com/yahoo/vespa/config/server/MockConfigConvergenceChecker.java index b4892caa05f..f335e978235 100644 --- a/configserver/src/test/java/com/yahoo/vespa/config/server/MockConfigConvergenceChecker.java +++ b/configserver/src/test/java/com/yahoo/vespa/config/server/MockConfigConvergenceChecker.java @@ -5,14 +5,26 @@ import com.yahoo.vespa.config.server.application.Application; import com.yahoo.vespa.config.server.application.ConfigConvergenceChecker; import java.time.Duration; +import java.util.HashMap; +import java.util.List; import java.util.Map; +import java.util.function.Function; +import java.util.stream.Collectors; public class MockConfigConvergenceChecker extends ConfigConvergenceChecker { private final long wantedGeneration; + private final List<ServiceInfo> servicesThatFailFirstIteration; + + private int iteration = 0; public MockConfigConvergenceChecker(long wantedGeneration) { + this(wantedGeneration, List.of()); + } + + public MockConfigConvergenceChecker(long wantedGeneration, List<ServiceInfo> servicesThatFailFirstIteration) { this.wantedGeneration = wantedGeneration; + this.servicesThatFailFirstIteration = servicesThatFailFirstIteration; } @Override @@ -32,7 +44,15 @@ public class MockConfigConvergenceChecker extends ConfigConvergenceChecker { @Override public ServiceListResponse checkConvergenceUnlessDeferringChangesUntilRestart(Application application) { - return new ServiceListResponse(Map.of(), wantedGeneration, wantedGeneration); + iteration++; + if (servicesThatFailFirstIteration.isEmpty() || iteration > 1) + return new ServiceListResponse(Map.of(), wantedGeneration, wantedGeneration); + + Map<ServiceInfo, Long> services = new HashMap<>(); + for (var service : servicesThatFailFirstIteration) { + services.put(service, wantedGeneration - 1); + } + return new ServiceListResponse(services, wantedGeneration, wantedGeneration - 1); } } diff --git a/configserver/src/test/java/com/yahoo/vespa/config/server/deploy/HostedDeployTest.java b/configserver/src/test/java/com/yahoo/vespa/config/server/deploy/HostedDeployTest.java index 842416615e2..f170e232619 100644 --- a/configserver/src/test/java/com/yahoo/vespa/config/server/deploy/HostedDeployTest.java +++ b/configserver/src/test/java/com/yahoo/vespa/config/server/deploy/HostedDeployTest.java @@ -22,9 +22,11 @@ import com.yahoo.config.provision.DockerImage; import com.yahoo.config.provision.Environment; import com.yahoo.config.provision.RegionName; import com.yahoo.config.provision.Zone; +import com.yahoo.slime.SlimeUtils; import com.yahoo.test.ManualClock; import com.yahoo.vespa.config.server.MockConfigConvergenceChecker; import com.yahoo.vespa.config.server.application.ApplicationReindexing; +import com.yahoo.vespa.config.server.application.ConfigConvergenceChecker; import com.yahoo.vespa.config.server.http.InternalServerException; import com.yahoo.vespa.config.server.http.InvalidApplicationException; import com.yahoo.vespa.config.server.http.UnknownVespaVersionException; @@ -42,7 +44,6 @@ import java.time.Clock; import java.time.Duration; import java.time.Instant; import java.util.Collections; -import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Optional; @@ -445,8 +446,7 @@ public class HostedDeployTest { @Test public void testThatConfigChangeActionsAreCollectedFromAllModels() { List<Host> hosts = createHosts(9, "6.1.0", "6.2.0"); - List<ServiceInfo> services = List.of( - new ServiceInfo("serviceName", "serviceType", null, new HashMap<>(), "configId", "hostName")); + List<ServiceInfo> services = createServices(); List<ModelFactory> modelFactories = List.of( new ConfigChangeActionsModelFactory(Version.fromString("6.1.0"), @@ -461,10 +461,41 @@ public class HostedDeployTest { } @Test + public void testConfigConvergenceBeforeRestart() { + List<Host> hosts = createHosts(9, "6.1.0", "6.2.0"); + List<ServiceInfo> services = createServices(); + + List<ModelFactory> modelFactories = List.of( + new ConfigChangeActionsModelFactory(Version.fromString("6.1.0"), + new VespaRestartAction(ClusterSpec.Id.from("test"), "change", services)), + new ConfigChangeActionsModelFactory(Version.fromString("6.2.0"), + new VespaRestartAction(ClusterSpec.Id.from("test"), "other change", services))); + + DeployTester tester = createTester(hosts, + modelFactories, + prodZone, + Clock.systemUTC(), + new MockConfigConvergenceChecker(2L, services)); + var result = tester.deployApp("src/test/apps/hosted/", "6.2.0"); + DeployHandlerLogger deployLogger = result.deployLogger(); + + System.out.println(SlimeUtils.toJson(deployLogger.slime().get())); + assertLogContainsMessage(deployLogger, "Wait for all services to use new config generation before restarting"); + assertLogContainsMessage(deployLogger, "Services that did not converge on new config generation 2: hostName:serviceName on generation 1,hostName2:serviceName2 on generation 1. Will retry"); + assertLogContainsMessage(deployLogger, "Services converged on new config generation 2"); + } + + private void assertLogContainsMessage(DeployHandlerLogger log, String message) { + assertEquals(1, SlimeUtils.entriesStream(log.slime().get().field("log")) + .map(entry -> entry.field("message").asString()) + .filter(m -> m.contains(message)) + .count()); + } + + @Test public void testThatAllowedConfigChangeActionsAreActedUpon() { List<Host> hosts = createHosts(9, "6.1.0"); - List<ServiceInfo> services = List.of( - new ServiceInfo("serviceName", "serviceType", null, Map.of("clustername", "cluster"), "configId", "hostName")); + List<ServiceInfo> services = createServices(); ManualClock clock = new ManualClock(Instant.EPOCH); List<ModelFactory> modelFactories = List.of( @@ -539,18 +570,40 @@ public class HostedDeployTest { return createTester(hosts, modelFactories, zone, Clock.systemUTC()); } - private DeployTester createTester(List<Host> hosts, List<ModelFactory> modelFactories, - Zone prodZone, Clock clock) { + private DeployTester createTester(List<Host> hosts, List<ModelFactory> modelFactories, Zone zone, Clock clock) { + return createTester(hosts, modelFactories, zone, clock, new MockConfigConvergenceChecker(2)); + } + + private DeployTester createTester(List<Host> hosts, + List<ModelFactory> modelFactories, + Zone zone, + Clock clock, + ConfigConvergenceChecker configConvergenceChecker) { return new DeployTester.Builder(temporaryFolder) .modelFactories(modelFactories) .clock(clock) - .zone(prodZone) + .zone(zone) .hostProvisioner(new InMemoryProvisioner(new Hosts(hosts), true, false)) - .configConvergenceChecker(new MockConfigConvergenceChecker(2)) - .hostedConfigserverConfig(prodZone) + .configConvergenceChecker(configConvergenceChecker) + .hostedConfigserverConfig(zone) .build(); } + private static List<ServiceInfo> createServices() { + return List.of(new ServiceInfo("serviceName", + "serviceType", + null, + Map.of("clustername", "cluster"), + "configId", + "hostName"), + new ServiceInfo("serviceName2", + "serviceType2", + null, + Map.of("clustername", "cluster"), + "configId", + "hostName2")); + } + private static class ConfigChangeActionsModelFactory extends TestModelFactory { private final List<ConfigChangeAction> actions; |