diff options
5 files changed, 89 insertions, 60 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRetirer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRetirer.java index d89a4761b5d..f1981ced48e 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRetirer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRetirer.java @@ -1,6 +1,7 @@ package com.yahoo.vespa.hosted.provision.maintenance; import com.yahoo.config.provision.ApplicationId; +import com.yahoo.config.provision.ClusterSpec; import com.yahoo.config.provision.Deployer; import com.yahoo.config.provision.Deployment; import com.yahoo.config.provision.Flavor; @@ -15,6 +16,7 @@ import com.yahoo.vespa.hosted.provision.provisioning.FlavorSpareChecker; import java.time.Duration; import java.util.Arrays; +import java.util.Collection; import java.util.HashMap; import java.util.Iterator; import java.util.List; @@ -35,7 +37,7 @@ public class NodeRetirer extends Maintainer { public static final FlavorSpareChecker.SpareNodesPolicy SPARE_NODES_POLICY = flavorSpareCount -> flavorSpareCount.getNumReadyAmongReplacees() > 2; - private static final long MAX_SIMULTANEOUS_RETIRES_PER_APPLICATION = 1; + private static final long MAX_SIMULTANEOUS_RETIRES_PER_CLUSTER = 1; private static final Logger log = Logger.getLogger(NodeRetirer.class.getName()); private final Deployer deployer; @@ -115,16 +117,19 @@ public class NodeRetirer extends Maintainer { Map<Deployment, Set<Node>> nodesToRetireByDeployment = new HashMap<>(); for (ApplicationId applicationId : activeApplications) { List<Node> applicationNodes = getNodesBelongingToApplication(allNodes, applicationId); - Set<Node> retireableNodes = getRetireableNodesForApplication(applicationNodes); - long numNodesAllowedToRetire = getNumberNodesAllowToRetireForApplication(applicationNodes, MAX_SIMULTANEOUS_RETIRES_PER_APPLICATION); - if (retireableNodes.isEmpty() || 
numNodesAllowedToRetire == 0) continue; + Map<ClusterSpec, Set<Node>> retireableNodesByCluster = getRetireableNodesForApplication(applicationNodes).stream() + .collect(Collectors.groupingBy( + node -> node.allocation().get().membership().cluster(), + Collectors.toSet())); + if (retireableNodesByCluster.isEmpty()) continue; Optional<Deployment> deployment = deployer.deployFromLocalActive(applicationId, Duration.ofMinutes(30)); if ( ! deployment.isPresent()) continue; // this will be done at another config server - Set<Node> replaceableNodes = retireableNodes.stream() - .filter(node -> flavorSpareChecker.canRetireAllocatedNodeWithFlavor(node.flavor())) - .limit(numNodesAllowedToRetire) + Set<Node> replaceableNodes = retireableNodesByCluster.values().stream() + .flatMap(nodes -> nodes.stream() + .filter(node -> flavorSpareChecker.canRetireAllocatedNodeWithFlavor(node.flavor())) + .limit(getNumberNodesAllowToRetireForCluster(nodes, MAX_SIMULTANEOUS_RETIRES_PER_CLUSTER))) .collect(Collectors.toSet()); if (! 
replaceableNodes.isEmpty()) nodesToRetireByDeployment.put(deployment.get(), replaceableNodes); } @@ -162,7 +167,7 @@ public class NodeRetirer extends Maintainer { })); } - private List<Node> getNodesBelongingToApplication(List<Node> allNodes, ApplicationId applicationId) { + private List<Node> getNodesBelongingToApplication(Collection<Node> allNodes, ApplicationId applicationId) { return allNodes.stream() .filter(node -> node.allocation().isPresent()) .filter(node -> node.allocation().get().owner().equals(applicationId)) @@ -172,7 +177,7 @@ public class NodeRetirer extends Maintainer { /** * Returns a list of ApplicationIds sorted by number of active nodes the application has allocated to it */ - List<ApplicationId> getActiveApplicationIds(List<Node> nodes) { + List<ApplicationId> getActiveApplicationIds(Collection<Node> nodes) { return nodes.stream() .filter(node -> node.state() == Node.State.active) .collect(Collectors.groupingBy( @@ -188,7 +193,7 @@ public class NodeRetirer extends Maintainer { * @param applicationNodes All the nodes allocated to an application * @return Set of nodes that all should eventually be retired */ - Set<Node> getRetireableNodesForApplication(List<Node> applicationNodes) { + Set<Node> getRetireableNodesForApplication(Collection<Node> applicationNodes) { return applicationNodes.stream() .filter(node -> node.state() == Node.State.active) .filter(node -> !node.status().wantToRetire()) @@ -197,18 +202,18 @@ public class NodeRetirer extends Maintainer { } /** - * @param applicationNodes All the nodes allocated to an application + * @param clusterNodes All the nodes allocated to an application belonging to a single cluster * @return number of nodes we can safely start retiring */ - long getNumberNodesAllowToRetireForApplication(List<Node> applicationNodes, long maxSimultaneousRetires) { - long numNodesInWantToRetire = applicationNodes.stream() + long getNumberNodesAllowToRetireForCluster(Collection<Node> clusterNodes, long 
maxSimultaneousRetires) { + long numNodesInWantToRetire = clusterNodes.stream() .filter(node -> node.status().wantToRetire()) .filter(node -> node.state() != Node.State.parked) .count(); return Math.max(0, maxSimultaneousRetires - numNodesInWantToRetire); } - private Map<Flavor, Map<Node.State, Long>> getNumberOfNodesByFlavorByNodeState(List<Node> allNodes) { + private Map<Flavor, Map<Node.State, Long>> getNumberOfNodesByFlavorByNodeState(Collection<Node> allNodes) { return allNodes.stream() .collect(Collectors.groupingBy( Node::flavor, diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockDeployer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockDeployer.java index e46797240f2..d39b61f367d 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockDeployer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockDeployer.java @@ -17,6 +17,7 @@ import java.util.Collections; import java.util.List; import java.util.Map; import java.util.Optional; +import java.util.stream.Collectors; /** * @author bratseth @@ -87,11 +88,39 @@ public class MockDeployer implements Deployer { public static class ApplicationContext { private final ApplicationId id; + private final List<ClusterContext> clusterContexts; + + public ApplicationContext(ApplicationId id, List<ClusterContext> clusterContexts) { + this.id = id; + this.clusterContexts = clusterContexts; + } + + public ApplicationContext(ApplicationId id, ClusterSpec cluster, Capacity capacity, int groups) { + this(id, Collections.singletonList(new ClusterContext(id, cluster, capacity, groups))); + } + + public ApplicationId id() { return id; } + + /** Returns list of cluster specs of this application. 
*/ + public List<ClusterContext> clusterContexts() { return clusterContexts; } + + private List<HostSpec> prepare(NodeRepositoryProvisioner provisioner) { + return clusterContexts.stream() + .map(clusterContext -> clusterContext.prepare(provisioner)) + .flatMap(List::stream) + .collect(Collectors.toList()); + } + + } + + public static class ClusterContext { + + private final ApplicationId id; private final ClusterSpec cluster; private final Capacity capacity; private final int groups; - public ApplicationContext(ApplicationId id, ClusterSpec cluster, Capacity capacity, int groups) { + public ClusterContext(ApplicationId id, ClusterSpec cluster, Capacity capacity, int groups) { this.id = id; this.cluster = cluster; this.capacity = capacity; @@ -100,7 +129,6 @@ public class MockDeployer implements Deployer { public ApplicationId id() { return id; } - /** Returns the spec of the cluster of this application. Only a single cluster per application is supported */ public ClusterSpec cluster() { return cluster; } private List<HostSpec> prepare(NodeRepositoryProvisioner provisioner) { diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/ServiceMonitorStub.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/ServiceMonitorStub.java index 983f81be126..e6a50cbbc64 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/ServiceMonitorStub.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/ServiceMonitorStub.java @@ -77,7 +77,7 @@ public class ServiceMonitorStub implements ServiceMonitor { getHostStatus(node.hostname()))); } Set<ServiceCluster<ServiceMonitorStatus>> serviceClusters = new HashSet<>(); - serviceClusters.add(new ServiceCluster<>(new ClusterId(app.getValue().cluster().id().value()), + serviceClusters.add(new ServiceCluster<>(new ClusterId(app.getValue().clusterContexts().get(0).cluster().id().value()), new ServiceType("serviceType"), 
serviceInstances)); TenantId tenantId = new TenantId(app.getKey().tenant().value()); diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRetirerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRetirerTest.java index f71d9b75f78..b19d9360e86 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRetirerTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRetirerTest.java @@ -34,24 +34,25 @@ public class NodeRetirerTest { retirer = tester.makeNodeRetirer(policy); tester.createReadyNodesByFlavor(21, 42, 27, 15, 8); - tester.deployApp("vespa", "calendar", 3, 7); - tester.deployApp("vespa", "notes", 0, 3); - tester.deployApp("sports", "results", 0, 6); - tester.deployApp("search", "images", 3, 4); - tester.deployApp("search", "videos", 2, 2); + tester.deployApp("vespa", "calendar", new int[]{3}, new int[]{7}); + tester.deployApp("vespa", "notes", new int[]{0}, new int[]{3}); + tester.deployApp("sports", "results", new int[]{0}, new int[]{6}); + tester.deployApp("search", "images", new int[]{3}, new int[]{4}); + tester.deployApp("search", "videos", new int[]{2}, new int[]{2}); + tester.deployApp("tester", "my-app", new int[]{1, 2}, new int[]{4, 6}); } @Test public void testRetireUnallocated() { - tester.assertCountsForStateByFlavor(Node.State.ready, 12, 42, 25, 4, 8); - tester.setNumberAllowedUnallocatedRetirementsPerFlavor(6, 30, 20, 2, 4); + tester.assertCountsForStateByFlavor(Node.State.ready, 12, 38, 19, 4, 8); + tester.setNumberAllowedUnallocatedRetirementsPerFlavor(6, 30, 15, 2, 4); assertFalse(retirer.retireUnallocated()); - tester.assertCountsForStateByFlavor(Node.State.parked, 6, 30, 20, 2, 4); + tester.assertCountsForStateByFlavor(Node.State.parked, 6, 30, 15, 2, 4); - tester.assertCountsForStateByFlavor(Node.State.ready, 6, 12, 5, 2, 4); + tester.assertCountsForStateByFlavor(Node.State.ready, 6, 8, 4, 
2, 4); tester.setNumberAllowedUnallocatedRetirementsPerFlavor(10, 20, 5, 5, 4); assertTrue(retirer.retireUnallocated()); - tester.assertCountsForStateByFlavor(Node.State.parked, 12, 42, 25, 4, 8); + tester.assertCountsForStateByFlavor(Node.State.parked, 12, 38, 19, 4, 8); tester.nodeRepository.getNodes().forEach(node -> assertEquals(node.status().wantToDeprovision(), node.state() == Node.State.parked)); @@ -63,25 +64,27 @@ public class NodeRetirerTest { tester.nodeRepository.getNodes(Node.State.ready) .forEach(node -> tester.nodeRepository.write(node.withIpAddresses(Collections.singleton("::2")))); - tester.assertCountsForStateByFlavor(Node.State.active, 9, -1, 2, 11, -1); + tester.assertCountsForStateByFlavor(Node.State.active, 9, 4, 8, 11, -1); - tester.setNumberAllowedAllocatedRetirementsPerFlavor(3, 2, 3, 2); + tester.setNumberAllowedAllocatedRetirementsPerFlavor(3, 2, 4, 2); retirer.retireAllocated(); - tester.assertParkedCountsByApplication(-1, -1, -1, -1, -1); // Nodes should be in retired, but not yet parked + tester.assertParkedCountsByApplication(-1, -1, -1, -1, -1, -1); // Nodes should be in retired, but not yet parked tester.iterateMaintainers(); - tester.assertParkedCountsByApplication(1, 1, 1, 1, 1); + tester.assertParkedCountsByApplication(1, 1, 1, 1, 1, 2); - // We can only retire 1 more of flavor 0 and 1 more of flavor 2, app 3 is the largest that is on flavor 0 - // and app 5 is the only one on flavor 2 + // We can retire 1 more of flavor-0, 1 more of flavor-1, 2 more of flavor-2: + // app 6 has the most nodes, so it gets to retire flavor-1 and flavor-2 + // app 3 is the largest that is on flavor-0, so it gets the last node + // app 5 gets the last node with flavor-2 retirer.retireAllocated(); tester.iterateMaintainers(); - tester.assertParkedCountsByApplication(1, 1, 2, 1, 2); + tester.assertParkedCountsByApplication(1, 1, 2, 1, 2, 4); // No more retirements are possible retirer.retireAllocated(); tester.iterateMaintainers(); - 
tester.assertParkedCountsByApplication(1, 1, 2, 1, 2); + tester.assertParkedCountsByApplication(1, 1, 2, 1, 2, 4); tester.nodeRepository.getNodes().forEach(node -> assertEquals(node.status().wantToDeprovision(), node.state() == Node.State.parked)); @@ -90,7 +93,7 @@ public class NodeRetirerTest { @Test public void testGetActiveApplicationIds() { List<String> expectedOrder = Arrays.asList( - "vespa.calendar", "sports.results", "search.images", "vespa.notes", "search.videos"); + "tester.my-app", "vespa.calendar", "sports.results", "search.images", "vespa.notes", "search.videos"); List<String> actualOrder = retirer.getActiveApplicationIds(tester.nodeRepository.getNodes()).stream() .map(applicationId -> applicationId.toShortString().replace(":default", "")) .collect(Collectors.toList()); @@ -121,21 +124,21 @@ public class NodeRetirerTest { } @Test - public void testGetNumberNodesAllowToRetireForApplication() { + public void testGetNumberNodesAllowToRetireForCluster() { ApplicationId app = new ApplicationId.Builder().tenant("vespa").applicationName("calendar").build(); - long actualAllActive = retirer.getNumberNodesAllowToRetireForApplication(tester.nodeRepository.getNodes(app), 2); + long actualAllActive = retirer.getNumberNodesAllowToRetireForCluster(tester.nodeRepository.getNodes(app), 2); assertEquals(2, actualAllActive); // Lets put 3 random nodes in wantToRetire List<Node> nodesToRetire = tester.nodeRepository.getNodes(app).stream().limit(3).collect(Collectors.toList()); nodesToRetire.forEach(node -> tester.nodeRepository.write(node.with(node.status().withWantToRetire(true)))); - long actualOneWantToRetire = retirer.getNumberNodesAllowToRetireForApplication(tester.nodeRepository.getNodes(app), 2); + long actualOneWantToRetire = retirer.getNumberNodesAllowToRetireForCluster(tester.nodeRepository.getNodes(app), 2); assertEquals(0, actualOneWantToRetire); // Now 2 of those finish retiring and go to parked nodesToRetire.stream().limit(2).forEach(node -> 
tester.nodeRepository.park(node.hostname(), Agent.system, "Parked for unit testing")); - long actualOneRetired = retirer.getNumberNodesAllowToRetireForApplication(tester.nodeRepository.getNodes(app), 2); + long actualOneRetired = retirer.getNumberNodesAllowToRetireForCluster(tester.nodeRepository.getNodes(app), 2); assertEquals(1, actualOneRetired); } } diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRetirerTester.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRetirerTester.java index c18ebd67d62..54690cda6a5 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRetirerTester.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRetirerTester.java @@ -8,16 +8,13 @@ import com.yahoo.config.provision.ClusterSpec; import com.yahoo.config.provision.DockerImage; import com.yahoo.config.provision.Environment; import com.yahoo.config.provision.Flavor; -import com.yahoo.config.provision.HostSpec; import com.yahoo.config.provision.NodeFlavors; import com.yahoo.config.provision.NodeType; import com.yahoo.config.provision.RegionName; import com.yahoo.config.provision.Zone; import com.yahoo.test.ManualClock; -import com.yahoo.transaction.NestedTransaction; import com.yahoo.vespa.curator.Curator; import com.yahoo.vespa.curator.mock.MockCurator; -import com.yahoo.vespa.curator.transaction.CuratorTransaction; import com.yahoo.vespa.hosted.provision.Node; import com.yahoo.vespa.hosted.provision.NodeRepository; import com.yahoo.vespa.hosted.provision.maintenance.retire.RetirementPolicy; @@ -55,11 +52,9 @@ public class NodeRetirerTester { public final ManualClock clock = new ManualClock(); public final NodeRepository nodeRepository; private final FlavorSpareChecker flavorSpareChecker = mock(FlavorSpareChecker.class); - private final Curator curator = new MockCurator(); private final MockDeployer deployer; private final 
JobControl jobControl; private final List<Flavor> flavors; - private final NodeRepositoryProvisioner provisioner; // Use LinkedHashMap to keep order in which applications were deployed private final Map<ApplicationId, MockDeployer.ApplicationContext> apps = new LinkedHashMap<>(); @@ -69,10 +64,11 @@ public class NodeRetirerTester { private int nextNodeId = 0; NodeRetirerTester(NodeFlavors nodeFlavors) { + Curator curator = new MockCurator(); nodeRepository = new NodeRepository(nodeFlavors, curator, clock, zone, new MockNameResolver().mockAnyLookup(), new DockerImage("docker-registry.domain.tld:8080/dist/vespa")); jobControl = new JobControl(nodeRepository.database()); - provisioner = new NodeRepositoryProvisioner(nodeRepository, nodeFlavors, zone); + NodeRepositoryProvisioner provisioner = new NodeRepositoryProvisioner(nodeRepository, nodeFlavors, zone); deployer = new MockDeployer(provisioner, apps); flavors = nodeFlavors.getFlavors().stream().sorted(Comparator.comparing(Flavor::name)).collect(Collectors.toList()); } @@ -97,15 +93,19 @@ public class NodeRetirerTester { nodeRepository.setReady(nodes); } - void deployApp(String tenantName, String applicationName, int flavorId, int numNodes) { - Flavor flavor = flavors.get(flavorId); + void deployApp(String tenantName, String applicationName, int[] flavorIds, int[] numNodes) { + final ApplicationId applicationId = ApplicationId.from(tenantName, applicationName, "default"); + final List<MockDeployer.ClusterContext> clusterContexts = new ArrayList<>(); - ApplicationId applicationId = ApplicationId.from(tenantName, applicationName, "default"); - ClusterSpec cluster = ClusterSpec.request(ClusterSpec.Type.container, ClusterSpec.Id.from("test"), Version.fromString("6.99")); - Capacity capacity = Capacity.fromNodeCount(numNodes, flavor.name()); - apps.put(applicationId, new MockDeployer.ApplicationContext(applicationId, cluster, capacity, 1)); + for (int i = 0; i < flavorIds.length; i++) { + Flavor flavor = 
flavors.get(flavorIds[i]); + ClusterSpec cluster = ClusterSpec.request(ClusterSpec.Type.container, ClusterSpec.Id.from("cluster-" + i), Version.fromString("6.99")); + Capacity capacity = Capacity.fromNodeCount(numNodes[i], flavor.name()); + clusterContexts.add(new MockDeployer.ClusterContext(applicationId, cluster, capacity, 1)); + } - activate(applicationId, cluster, capacity); + apps.put(applicationId, new MockDeployer.ApplicationContext(applicationId, clusterContexts)); + deployer.deployFromLocalActive(applicationId, Duration.ZERO).get().activate(); } void iterateMaintainers() { @@ -121,13 +121,6 @@ public class NodeRetirerTester { inactiveExpirer.maintain(); } - private void activate(ApplicationId applicationId, ClusterSpec cluster, Capacity capacity) { - List<HostSpec> hosts = provisioner.prepare(applicationId, cluster, capacity, 1, null); - NestedTransaction transaction = new NestedTransaction().add(new CuratorTransaction(curator)); - provisioner.activate(transaction, applicationId, hosts); - transaction.commit(); - } - void setNumberAllowedUnallocatedRetirementsPerFlavor(int... numAllowed) { for (int i = 0; i < numAllowed.length; i++) { Boolean[] responses = new Boolean[numAllowed[i]]; |