diff options
author | Harald Musum <musum@yahooinc.com> | 2023-05-03 09:37:20 +0200 |
---|---|---|
committer | Harald Musum <musum@yahooinc.com> | 2023-05-03 09:37:20 +0200 |
commit | 37f2c509e4f9e33109d93737c5662f27337b3557 (patch) | |
tree | adc81a30e7d7e3fa274ee08904a7af905e1bb651 /configserver | |
parent | d0ec31ec3d0de3e4bc9e283cee079fb30eb349aa (diff) |
Only check convergence before restart for services on hosts that should be restarted
When checking config convergence for app always check all services as before
Diffstat (limited to 'configserver')
4 files changed, 51 insertions, 40 deletions
diff --git a/configserver/src/main/java/com/yahoo/vespa/config/server/application/ConfigConvergenceChecker.java b/configserver/src/main/java/com/yahoo/vespa/config/server/application/ConfigConvergenceChecker.java index 132d3d913e7..040f230a40e 100644 --- a/configserver/src/main/java/com/yahoo/vespa/config/server/application/ConfigConvergenceChecker.java +++ b/configserver/src/main/java/com/yahoo/vespa/config/server/application/ConfigConvergenceChecker.java @@ -86,19 +86,21 @@ public class ConfigConvergenceChecker extends AbstractComponent { /** Fetches the active config generation for all services in the given application. */ public Map<ServiceInfo, Long> getServiceConfigGenerations(Application application, Duration timeoutPerService) { - return getServiceConfigGenerations(application, timeoutPerService, true); + return getServiceConfigGenerations(application, timeoutPerService, new HostsToCheck(Set.of())); } /** * Fetches the active config generation for all services in the given application. Will not check services - * which defer config changes until restart if checkAll is false. + * which defer config changes until restart if checkAll is false. hostsToCheck are names to check, or empty to check all. */ - private Map<ServiceInfo, Long> getServiceConfigGenerations(Application application, Duration timeoutPerService, boolean checkAll) { + private Map<ServiceInfo, Long> getServiceConfigGenerations(Application application, + Duration timeoutPerService, + HostsToCheck hostsToCheck) { List<ServiceInfo> servicesToCheck = new ArrayList<>(); application.getModel().getHosts() .forEach(host -> host.getServices().stream() .filter(service -> serviceTypesToCheck.contains(service.getServiceType())) - .filter(serviceInfo -> shouldCheckService(checkAll, application, serviceInfo)) + .filter(serviceInfo -> shouldCheckService(hostsToCheck, application, serviceInfo)) .forEach(service -> getStatePort(service).ifPresent(port -> servicesToCheck.add(service)))); log.log(Level.FINE, "Services to check for config convergence: " + servicesToCheck); @@ -107,20 +109,20 @@ public class ConfigConvergenceChecker extends AbstractComponent { /** Checks all services in given application. Returns the minimum current generation of all services */ public ServiceListResponse checkConvergenceForAllServices(Application application, Duration timeoutPerService) { - return checkConvergence(application, timeoutPerService, true); + return checkConvergence(application, timeoutPerService, new HostsToCheck(Set.of())); } /** * Checks services except those which defer config changes until restart in the given application. * Returns the minimum current generation of those services. */ - public ServiceListResponse checkConvergenceUnlessDeferringChangesUntilRestart(Application application) { + public ServiceListResponse checkConvergenceUnlessDeferringChangesUntilRestart(Application application, Set<String> hostnames) { Duration timeoutPerService = Duration.ofSeconds(10); - return checkConvergence(application, timeoutPerService, false); + return checkConvergence(application, timeoutPerService, new HostsToCheck(hostnames)); } - private ServiceListResponse checkConvergence(Application application, Duration timeoutPerService, boolean checkAll) { - Map<ServiceInfo, Long> currentGenerations = getServiceConfigGenerations(application, timeoutPerService, checkAll); + private ServiceListResponse checkConvergence(Application application, Duration timeoutPerService, HostsToCheck hostsToCheck) { + Map<ServiceInfo, Long> currentGenerations = getServiceConfigGenerations(application, timeoutPerService, hostsToCheck); long currentGeneration = currentGenerations.values().stream().mapToLong(Long::longValue).min().orElse(-1); return new ServiceListResponse(currentGenerations, application.getApplicationGeneration(), currentGeneration); } @@ -142,8 +144,9 @@ public class ConfigConvergenceChecker extends AbstractComponent { } } - private boolean shouldCheckService(boolean checkServicesWithDeferChangesUntilRestart, Application application, ServiceInfo serviceInfo) { - if (checkServicesWithDeferChangesUntilRestart) return true; + private boolean shouldCheckService(HostsToCheck hostsToCheck, Application application, ServiceInfo serviceInfo) { + if (hostsToCheck.checkAll()) return true; + if ( ! hostsToCheck.check(serviceInfo.getHostName())) return false; if (isNotContainer(serviceInfo)) return true; return serviceIsInClusterWhichShouldBeChecked(application, serviceInfo); } @@ -307,6 +310,14 @@ public class ConfigConvergenceChecker extends AbstractComponent { .build(); } + private record HostsToCheck(Set<String> hostnames) { + + public boolean checkAll() { return hostnames.isEmpty(); } + + public boolean check(String hostname) { return checkAll() || hostnames.contains(hostname); } + + } + public static class ServiceResponse { public enum Status { ok, notFound, hostNotFound, error } diff --git a/configserver/src/main/java/com/yahoo/vespa/config/server/deploy/Deployment.java b/configserver/src/main/java/com/yahoo/vespa/config/server/deploy/Deployment.java index e135bc7f0e2..cab3e89c606 100644 --- a/configserver/src/main/java/com/yahoo/vespa/config/server/deploy/Deployment.java +++ b/configserver/src/main/java/com/yahoo/vespa/config/server/deploy/Deployment.java @@ -173,9 +173,9 @@ public class Deployment implements com.yahoo.config.provision.Deployment { RestartActions restartActions = configChangeActions.getRestartActions().useForInternalRestart(internalRedeploy); if (restartActions.isEmpty()) return; - waitForConfigToConverge(applicationId); - Set<String> hostnames = restartActions.hostnames(); + waitForConfigToConverge(applicationId, hostnames); + provisioner.get().restart(applicationId, HostFilter.from(hostnames)); deployLogger.log(Level.INFO, String.format("Scheduled service restart of %d nodes: %s", hostnames.size(), hostnames.stream().sorted().collect(Collectors.joining(", ")))); @@ -184,14 +184,14 @@ public class Deployment implements com.yahoo.config.provision.Deployment { this.configChangeActions = configChangeActions.withRestartActions(new RestartActions()); } - private void waitForConfigToConverge(ApplicationId applicationId) { + private void waitForConfigToConverge(ApplicationId applicationId, Set<String> hostnames) { deployLogger.log(Level.INFO, "Wait for all services to use new config generation before restarting"); var convergenceChecker = applicationRepository.configConvergenceChecker(); var app = applicationRepository.getActiveApplication(applicationId); ServiceListResponse response = null; while (timeLeft(applicationId, response)) { - response = convergenceChecker.checkConvergenceUnlessDeferringChangesUntilRestart(app); + response = convergenceChecker.checkConvergenceUnlessDeferringChangesUntilRestart(app, hostnames); if (response.converged) { deployLogger.log(Level.INFO, "Services converged on new config generation " + response.currentGeneration); return; diff --git a/configserver/src/test/java/com/yahoo/vespa/config/server/MockConfigConvergenceChecker.java b/configserver/src/test/java/com/yahoo/vespa/config/server/MockConfigConvergenceChecker.java index f335e978235..ccbdcde2c2e 100644 --- a/configserver/src/test/java/com/yahoo/vespa/config/server/MockConfigConvergenceChecker.java +++ b/configserver/src/test/java/com/yahoo/vespa/config/server/MockConfigConvergenceChecker.java @@ -8,8 +8,7 @@ import java.time.Duration; import java.util.HashMap; import java.util.List; import java.util.Map; -import java.util.function.Function; -import java.util.stream.Collectors; +import java.util.Set; public class MockConfigConvergenceChecker extends ConfigConvergenceChecker { @@ -43,7 +42,7 @@ public class MockConfigConvergenceChecker extends ConfigConvergenceChecker { } @Override - public ServiceListResponse checkConvergenceUnlessDeferringChangesUntilRestart(Application application) { + public ServiceListResponse checkConvergenceUnlessDeferringChangesUntilRestart(Application application, Set<String> hostnames) { iteration++; if (servicesThatFailFirstIteration.isEmpty() || iteration > 1) return new ServiceListResponse(Map.of(), wantedGeneration, wantedGeneration); diff --git a/configserver/src/test/java/com/yahoo/vespa/config/server/deploy/HostedDeployTest.java b/configserver/src/test/java/com/yahoo/vespa/config/server/deploy/HostedDeployTest.java index 58e76d5dc0a..c1da4e0502f 100644 --- a/configserver/src/test/java/com/yahoo/vespa/config/server/deploy/HostedDeployTest.java +++ b/configserver/src/test/java/com/yahoo/vespa/config/server/deploy/HostedDeployTest.java @@ -43,6 +43,7 @@ import java.nio.file.Files; import java.time.Clock; import java.time.Duration; import java.time.Instant; +import java.util.ArrayList; import java.util.Collections; import java.util.List; import java.util.Map; @@ -52,6 +53,7 @@ import java.util.stream.IntStream; import static com.yahoo.vespa.config.server.deploy.DeployTester.CountingModelFactory; import static com.yahoo.vespa.config.server.deploy.DeployTester.createFailingModelFactory; import static com.yahoo.vespa.config.server.deploy.DeployTester.createHostedModelFactory; +import static com.yahoo.yolean.Exceptions.findCause; import static com.yahoo.yolean.Exceptions.uncheck; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; @@ -446,7 +448,7 @@ public class HostedDeployTest { @Test public void testThatConfigChangeActionsAreCollectedFromAllModels() { List<Host> hosts = createHosts(9, "6.1.0", "6.2.0"); - List<ServiceInfo> services = createServices(); + List<ServiceInfo> services = createServices(1); List<ModelFactory> modelFactories = List.of( new ConfigChangeActionsModelFactory(Version.fromString("6.1.0"), @@ -463,13 +465,12 @@ public class HostedDeployTest { @Test public void testConfigConvergenceBeforeRestart() { List<Host> hosts = createHosts(9, "6.1.0", "6.2.0"); - List<ServiceInfo> services = createServices(); + List<ServiceInfo> services = createServices(1); + List<ServiceInfo> twoServices = createServices(2); List<ModelFactory> modelFactories = List.of( - new ConfigChangeActionsModelFactory(Version.fromString("6.1.0"), - new VespaRestartAction(ClusterSpec.Id.from("test"), "change", services)), new ConfigChangeActionsModelFactory(Version.fromString("6.2.0"), - new VespaRestartAction(ClusterSpec.Id.from("test"), "other change", services))); + new VespaRestartAction(ClusterSpec.Id.from("test"), "change", services))); DeployTester tester = createTester(hosts, modelFactories, @@ -479,9 +480,10 @@ public class HostedDeployTest { var result = tester.deployApp("src/test/apps/hosted/", "6.2.0"); DeployHandlerLogger deployLogger = result.deployLogger(); - System.out.println(SlimeUtils.toJson(deployLogger.slime().get())); + assertLogContainsMessage(deployLogger, "Scheduled service restart of 1 nodes: hostName0"); assertLogContainsMessage(deployLogger, "Wait for all services to use new config generation before restarting"); - assertLogContainsMessage(deployLogger, "Services that did not converge on new config generation 2: hostName:serviceName on generation 1, hostName2:serviceName2 on generation 1. Will retry"); + // Should only check convergence on 1 of the nodes + assertLogContainsMessage(deployLogger, "Services that did not converge on new config generation 2: hostName0:serviceName0 on generation 1. Will retry"); assertLogContainsMessage(deployLogger, "Services converged on new config generation 2"); } @@ -495,7 +497,7 @@ public class HostedDeployTest { @Test public void testThatAllowedConfigChangeActionsAreActedUpon() { List<Host> hosts = createHosts(9, "6.1.0"); - List<ServiceInfo> services = createServices(); + List<ServiceInfo> services = createServices(1); ManualClock clock = new ManualClock(Instant.EPOCH); List<ModelFactory> modelFactories = List.of( @@ -517,7 +519,7 @@ public class HostedDeployTest { assertEquals(9, tester.getAllocatedHostsOf(tester.applicationId()).getHosts().size()); assertTrue(prepareResult.configChangeActions().getRestartActions().isEmpty()); // Handled by deployment. assertEquals(Optional.of(ApplicationReindexing.empty() - .withPending("cluster", "music", prepareResult.sessionId())), + .withPending("cluster0", "music", prepareResult.sessionId())), tester.tenant().getApplicationRepo().database().readReindexingStatus(tester.applicationId())); } @@ -589,19 +591,18 @@ public class HostedDeployTest { .build(); } - private static List<ServiceInfo> createServices() { - return List.of(new ServiceInfo("serviceName", - "serviceType", - null, - Map.of("clustername", "cluster"), - "configId", - "hostName"), - new ServiceInfo("serviceName2", - "serviceType2", - null, - Map.of("clustername", "cluster"), - "configId", - "hostName2")); + private static List<ServiceInfo> createServices(int serviceCount) { + List<ServiceInfo> services = new ArrayList<>(); + + IntStream.range(0, serviceCount) + .forEach(i -> services.add(new ServiceInfo("serviceName" + i, + i == 0 ? "searchnode" : "container", + null, + Map.of("clustername", "cluster" + i), + "configId" + i, + "hostName" + i))); + + return services; } private static class ConfigChangeActionsModelFactory extends TestModelFactory { |