diff options
author | Ola Aunrønning <olaa@verizonmedia.com> | 2021-03-23 10:46:48 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2021-03-23 10:46:48 +0100 |
commit | 89a4e253e43c8a4ef01f34a2773edaf054655e73 (patch) | |
tree | 708aea7a01371f6d7f4878f8802ac045af9b2aa4 /controller-server | |
parent | e34cfb868220a2c0fe75f78d763f067649945b70 (diff) | |
parent | a085be5db58dfae97846ab2d9787fe5d4aec4f71 (diff) |
Merge pull request #17105 from vespa-engine/olaa/cmr-impact
Assess if impact is greater than upgrade policy
Diffstat (limited to 'controller-server')
4 files changed, 221 insertions, 57 deletions
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ChangeManagementAssessor.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ChangeManagementAssessor.java index 659edc96b77..11f36201f32 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ChangeManagementAssessor.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ChangeManagementAssessor.java @@ -4,14 +4,19 @@ package com.yahoo.vespa.hosted.controller.maintenance; import com.yahoo.config.provision.HostName; import com.yahoo.config.provision.zone.ZoneId; import com.yahoo.vespa.hosted.controller.Controller; +import com.yahoo.vespa.hosted.controller.api.integration.configserver.Cluster; +import com.yahoo.vespa.hosted.controller.api.integration.configserver.Node; import com.yahoo.vespa.hosted.controller.api.integration.configserver.NodeRepository; import com.yahoo.vespa.hosted.controller.api.integration.noderepository.NodeRepositoryNode; import com.yahoo.vespa.hosted.controller.api.integration.noderepository.NodeState; +import com.yahoo.vespa.hosted.controller.api.integration.noderepository.NodeType; import java.util.Arrays; import java.util.Collection; import java.util.List; import java.util.Map; +import java.util.Objects; +import java.util.Optional; import java.util.stream.Collectors; public class ChangeManagementAssessor { @@ -28,10 +33,11 @@ public class ChangeManagementAssessor { Assessment assessmentInner(List<String> impactedHostnames, List<NodeRepositoryNode> allNodes, ZoneId zone) { + List<String> impactedParentHosts = toParentHosts(impactedHostnames, allNodes); // Group impacted application nodes by parent host Map<NodeRepositoryNode, List<NodeRepositoryNode>> prParentHost = allNodes.stream() .filter(nodeRepositoryNode -> nodeRepositoryNode.getState() == NodeState.active) //TODO look at more states? - .filter(node -> impactedHostnames.contains(node.getParentHostname() == null ? "" : node.getParentHostname())) + .filter(node -> impactedParentHosts.contains(node.getParentHostname() == null ? "" : node.getParentHostname())) .collect(Collectors.groupingBy(node -> allNodes.stream() .filter(parent -> parent.getHostname().equals(node.getParentHostname())) @@ -39,32 +45,31 @@ public class ChangeManagementAssessor { )); // Group nodes pr cluster - Map<String, List<NodeRepositoryNode>> prCluster = prParentHost.values() + Map<Cluster, List<NodeRepositoryNode>> prCluster = prParentHost.values() .stream() .flatMap(Collection::stream) .collect(Collectors.groupingBy(ChangeManagementAssessor::clusterKey)); boolean allHostsReplacable = nodeRepository.isReplaceable( zone, - impactedHostnames.stream() - .map(HostName::from) + prParentHost.keySet().stream() + .filter(node -> node.getType() == NodeType.host) + .map(node -> HostName.from(node.getHostname())) .collect(Collectors.toList()) ); // Report assessment pr cluster var clusterAssessments = prCluster.entrySet().stream().map((entry) -> { - String key = entry.getKey(); + Cluster cluster = entry.getKey(); List<NodeRepositoryNode> nodes = entry.getValue(); - String app = Arrays.stream(key.split(":")).limit(3).collect(Collectors.joining(":")); - String cluster = Arrays.stream(key.split(":")).skip(3).collect(Collectors.joining(":")); - long[] totalStats = clusterStats(key, allNodes); - long[] impactedStats = clusterStats(key, nodes); + long[] totalStats = clusterStats(cluster, allNodes); + long[] impactedStats = clusterStats(cluster, nodes); ClusterAssessment assessment = new ClusterAssessment(); - assessment.app = app; + assessment.app = cluster.getApp(); assessment.zone = zone.value(); - assessment.cluster = cluster; + assessment.cluster = cluster.getClusterType() + ":" + cluster.getClusterId(); assessment.clusterSize = totalStats[0]; assessment.clusterImpact = impactedStats[0]; assessment.groupsTotal = totalStats[1]; @@ -76,7 +81,7 @@ public class ChangeManagementAssessor { // TODO do some heuristic on suggestion action assessment.suggestedAction = allHostsReplacable ? "Retire all hosts" : "nothing"; // TODO do some heuristic on impact - assessment.impact = "na"; + assessment.impact = getImpact(cluster, impactedStats, totalStats); return assessment; }).collect(Collectors.toList()); @@ -99,21 +104,83 @@ public class ChangeManagementAssessor { return new Assessment(clusterAssessments, hostAssessments); } - private static String clusterKey(NodeRepositoryNode node) { - if (node.getOwner() != null && node.getMembership() != null) { - String appId = String.format("%s:%s:%s", node.getOwner().tenant, node.getOwner().application, node.getOwner().instance); - String cluster = String.format("%s:%s", node.getMembership().clustertype, node.getMembership().clusterid); - return appId + ":" + cluster; - } - return ""; + private List<String> toParentHosts(List<String> impactedHostnames, List<NodeRepositoryNode> allNodes) { + return impactedHostnames.stream() + .map(hostname -> + allNodes.stream() + .filter(node -> List.of(NodeType.config, NodeType.proxy, NodeType.host).contains(node.getType())) + .filter(node -> hostname.equals(node.getHostname()) || hostname.equals(node.getParentHostname())) + .map(node -> { + if (node.getType() == NodeType.host) + return node.getHostname(); + return node.getParentHostname(); + }).findFirst().orElseThrow() + ) + .collect(Collectors.toList()); } - private static long[] clusterStats(String key, List<NodeRepositoryNode> containerNodes) { - List<NodeRepositoryNode> clusterNodes = containerNodes.stream().filter(nodeRepositoryNode -> clusterKey(nodeRepositoryNode).equals(key)).collect(Collectors.toList()); + private static Cluster clusterKey(NodeRepositoryNode node) { + String appId = String.format("%s:%s:%s", node.getOwner().tenant, node.getOwner().application, node.getOwner().instance); + return new Cluster(Node.ClusterType.valueOf(node.getMembership().clustertype), node.getMembership().clusterid, appId, node.getType()); + } + + private static long[] clusterStats(Cluster cluster, List<NodeRepositoryNode> containerNodes) { + List<NodeRepositoryNode> clusterNodes = containerNodes.stream().filter(nodeRepositoryNode -> cluster.equals(clusterKey(nodeRepositoryNode))).collect(Collectors.toList()); long groups = clusterNodes.stream().map(nodeRepositoryNode -> nodeRepositoryNode.getMembership() != null ? nodeRepositoryNode.getMembership().group : "").distinct().count(); return new long[] { clusterNodes.size(), groups}; } + private String getImpact(Cluster cluster, long[] impactedStats, long[] totalStats) { + switch (cluster.getNodeType()) { + case tenant: + return getTenantImpact(cluster, impactedStats, totalStats); + case proxy: + return getProxyImpact(impactedStats[0], totalStats[0]); + case config: + return getConfigServerImpact(impactedStats[0]); + default: + return "Unkown impact"; + } + } + + private String getTenantImpact(Cluster cluster, long[] impactedStats, long[] totalStats) { + switch (cluster.getClusterType()) { + case container: + return getContainerImpact(impactedStats[0], totalStats[0]); + case content: + case combined: + return getContentImpact(totalStats[1] > 1, impactedStats[0], impactedStats[1]); + default: + return "Unknown impact"; + } + } + + private String getProxyImpact(long impactedNodes, long totalNodes) { + int impact = (int) (100.0 * impactedNodes / totalNodes); + return impact + "% of routing nodes impacted. Consider reprovisioning if too many"; + } + + private String getConfigServerImpact(long impactedNodes) { + if (impactedNodes == 1) { + return "Acceptable impact"; + } + return "Large impact. Consider reprovisioning one or more config servers"; + } + + private String getContainerImpact(long impactedNodes, long totalNodes) { + if ((double) impactedNodes / totalNodes <= 0.1) { + return "Impact not larger than upgrade policy"; + } + return "Impact larger than upgrade policy"; + } + + private String getContentImpact(boolean isGrouped, long impactedNodes, long impactedGroups) { + if ((isGrouped && impactedGroups == 1) || impactedNodes == 1) + return "Impact not larger than upgrade policy"; + return "Impact larger than upgrade policy"; + } + + public static class Assessment { List<ClusterAssessment> clusterAssessments; List<HostAssessment> hostAssessments; @@ -152,4 +219,49 @@ public class ChangeManagementAssessor { public int numberOfProblematicChildren; } + private static class Cluster { + private Node.ClusterType clusterType; + private String clusterId; + private String app; + private NodeType nodeType; + + public Cluster(Node.ClusterType clusterType, String clusterId, String app, NodeType nodeType) { + this.clusterType = clusterType; + this.clusterId = clusterId; + this.app = app; + this.nodeType = nodeType; + } + + public Node.ClusterType getClusterType() { + return clusterType; + } + + public String getClusterId() { + return clusterId; + } + + public String getApp() { + return app; + } + + public NodeType getNodeType() { + return nodeType; + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + Cluster cluster = (Cluster) o; + return Objects.equals(clusterType, cluster.clusterType) && + Objects.equals(clusterId, cluster.clusterId) && + Objects.equals(app, cluster.app); + } + + @Override + public int hashCode() { + return Objects.hash(clusterType, clusterId, app); + } + } + } diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/ChangeManagementAssessorTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/ChangeManagementAssessorTest.java index 8a91be2b2a3..575a38cd637 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/ChangeManagementAssessorTest.java +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/ChangeManagementAssessorTest.java @@ -7,6 +7,7 @@ import com.yahoo.vespa.hosted.controller.api.integration.noderepository.NodeMemb import com.yahoo.vespa.hosted.controller.api.integration.noderepository.NodeOwner; import com.yahoo.vespa.hosted.controller.api.integration.noderepository.NodeRepositoryNode; import com.yahoo.vespa.hosted.controller.api.integration.noderepository.NodeState; +import com.yahoo.vespa.hosted.controller.api.integration.noderepository.NodeType; import com.yahoo.vespa.hosted.controller.integration.NodeRepositoryMock; import org.junit.Test; @@ -39,16 +40,16 @@ public class ChangeManagementAssessorTest { ZoneId zone = ZoneId.from("prod", "eu-trd"); List<String> hostNames = Collections.singletonList("host1"); List<NodeRepositoryNode> allNodesInZone = new ArrayList<>(); - allNodesInZone.add(createNode("node1", "host1", "myapp", "default", 0 )); - allNodesInZone.add(createNode("node2", "host1", "myapp", "default", 0 )); - allNodesInZone.add(createNode("node3", "host1", "myapp", "default", 0 )); + allNodesInZone.add(createNode("node1", "host1", "default", 0 )); + allNodesInZone.add(createNode("node2", "host1", "default", 0 )); + allNodesInZone.add(createNode("node3", "host1", "default", 0 )); // Add an not impacted hosts - allNodesInZone.add(createNode("node4", "host2", "myapp", "default", 0 )); + allNodesInZone.add(createNode("node4", "host2", "default", 0 )); // Add tenant hosts - allNodesInZone.add(createHost("host1", "switch1")); - allNodesInZone.add(createHost("host2", "switch1")); + allNodesInZone.add(createHost("host1", NodeType.host)); + allNodesInZone.add(createHost("host2", NodeType.host)); // Make Assessment List<ChangeManagementAssessor.ClusterAssessment> assessments @@ -72,25 +73,26 @@ public class ChangeManagementAssessorTest { List<NodeRepositoryNode> allNodesInZone = new ArrayList<>(); // Two impacted nodes on host1 - allNodesInZone.add(createNode("node1", "host1", "myapp", "default", 0 )); - allNodesInZone.add(createNode("node2", "host1", "myapp", "default", 0 )); + allNodesInZone.add(createNode("node1", "host1", "default", 0 )); + allNodesInZone.add(createNode("node2", "host1", "default", 0 )); // One impacted nodes on host2 - allNodesInZone.add(createNode("node3", "host2", "myapp", "default", 0 )); + allNodesInZone.add(createNode("node3", "host2", "default", 0 )); // Another group on hosts not impacted - allNodesInZone.add(createNode("node4", "host3", "myapp", "default", 1 )); - allNodesInZone.add(createNode("node5", "host3", "myapp", "default", 1 )); - allNodesInZone.add(createNode("node6", "host3", "myapp", "default", 1 )); + allNodesInZone.add(createNode("node4", "host3", "default", 1 )); + allNodesInZone.add(createNode("node5", "host3", "default", 1 )); + allNodesInZone.add(createNode("node6", "host3", "default", 1 )); // Another cluster on hosts not impacted - this one also with three different groups (should all be ignored here) - allNodesInZone.add(createNode("node4", "host4", "myapp", "myman", 4 )); - allNodesInZone.add(createNode("node5", "host4", "myapp", "myman", 5 )); - allNodesInZone.add(createNode("node6", "host4", "myapp", "myman", 6 )); + allNodesInZone.add(createNode("node4", "host4", "myman", 4 )); + allNodesInZone.add(createNode("node5", "host4", "myman", 5 )); + allNodesInZone.add(createNode("node6", "host4", "myman", 6 )); // Add tenant hosts - allNodesInZone.add(createHost("host1", "switch1")); - allNodesInZone.add(createHost("host2", "switch1")); + allNodesInZone.add(createHost("host1", NodeType.host)); + allNodesInZone.add(createHost("host2", NodeType.host)); + // Make Assessment ChangeManagementAssessor.Assessment assessment @@ -106,6 +108,7 @@ public class ChangeManagementAssessorTest { assertEquals("content:default", clusterAssessments.get(0).cluster); assertEquals("mytenant:myapp:default", clusterAssessments.get(0).app); assertEquals("prod.eu-trd", clusterAssessments.get(0).zone); + assertEquals("Impact not larger than upgrade policy", clusterAssessments.get(0).impact); List<ChangeManagementAssessor.HostAssessment> hostAssessments = assessment.getHostAssessments(); assertEquals(2, hostAssessments.size()); @@ -117,11 +120,47 @@ public class ChangeManagementAssessorTest { )); } - private NodeOwner createOwner(String tenant, String application, String instance) { + @Test + public void two_config_nodes() { + var zone = ZoneId.from("prod", "eu-trd"); + var hostNames = Arrays.asList("config1", "config2"); + var allNodesInZone = new ArrayList<NodeRepositoryNode>(); + + // Add config nodes and parents + allNodesInZone.add(createNode("config1", "confighost1", "config", 0, NodeType.config)); + allNodesInZone.add(createHost("confighost1", NodeType.confighost)); + allNodesInZone.add(createNode("config2", "confighost2", "config", 0, NodeType.config)); + allNodesInZone.add(createHost("confighost2", NodeType.confighost)); + + var assessment = changeManagementAssessor.assessmentInner(hostNames, allNodesInZone, zone).getClusterAssessments(); + var configAssessment = assessment.get(0); + assertEquals("Large impact. Consider reprovisioning one or more config servers", configAssessment.impact); + assertEquals(2, configAssessment.clusterImpact); + } + + @Test + public void one_of_three_proxy_nodes() { + var zone = ZoneId.from("prod", "eu-trd"); + var hostNames = Arrays.asList("routing1"); + var allNodesInZone = new ArrayList<NodeRepositoryNode>(); + + // Add routing nodes and parents + allNodesInZone.add(createNode("routing1", "parentrouting1", "routing", 0, NodeType.proxy)); + allNodesInZone.add(createHost("parentrouting1", NodeType.proxyhost)); + allNodesInZone.add(createNode("routing2", "parentrouting2", "routing", 0, NodeType.proxy)); + allNodesInZone.add(createHost("parentrouting2", NodeType.proxyhost)); + allNodesInZone.add(createNode("routing3", "parentrouting3", "routing", 0, NodeType.proxy)); + allNodesInZone.add(createHost("parentrouting3", NodeType.proxyhost)); + + var assessment = changeManagementAssessor.assessmentInner(hostNames, allNodesInZone, zone).getClusterAssessments(); + assertEquals("33% of routing nodes impacted. Consider reprovisioning if too many", assessment.get(0).impact); + } + + private NodeOwner createOwner() { NodeOwner owner = new NodeOwner(); - owner.tenant = tenant; - owner.application = application; - owner.instance = instance; + owner.tenant = "mytenant"; + owner.application = "myapp"; + owner.instance = "default"; return owner; } @@ -135,21 +174,29 @@ public class ChangeManagementAssessorTest { return membership; } - private NodeRepositoryNode createNode(String nodename, String hostname, String appName, String clusterId, int group) { + private NodeRepositoryNode createNode(String nodename, String hostname, String clusterId, int group) { + return createNode(nodename, hostname, clusterId, group, NodeType.tenant); + } + + private NodeRepositoryNode createNode(String nodename, String hostname, String clusterId, int group, NodeType nodeType) { NodeRepositoryNode node = new NodeRepositoryNode(); node.setHostname(nodename); node.setParentHostname(hostname); node.setState(NodeState.active); - node.setOwner(createOwner("mytenant", appName, "default")); + node.setOwner(createOwner()); node.setMembership(createMembership(clusterId, group)); + node.setType(nodeType); return node; } - private NodeRepositoryNode createHost(String hostname, String switchName) { + private NodeRepositoryNode createHost(String hostname, NodeType nodeType) { NodeRepositoryNode node = new NodeRepositoryNode(); node.setHostname(hostname); - node.setSwitchHostname(switchName); + node.setSwitchHostname("switch1"); + node.setType(nodeType); + node.setOwner(createOwner()); + node.setMembership(createMembership(nodeType.name(), 0)); return node; } } diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/changemanagement/ChangeManagementApiHandlerTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/changemanagement/ChangeManagementApiHandlerTest.java index 359f34f5740..2f2e70e2cf6 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/changemanagement/ChangeManagementApiHandlerTest.java +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/changemanagement/ChangeManagementApiHandlerTest.java @@ -9,6 +9,7 @@ import com.yahoo.vespa.hosted.controller.api.integration.noderepository.NodeMemb import com.yahoo.vespa.hosted.controller.api.integration.noderepository.NodeOwner; import com.yahoo.vespa.hosted.controller.api.integration.noderepository.NodeRepositoryNode; import com.yahoo.vespa.hosted.controller.api.integration.noderepository.NodeState; +import com.yahoo.vespa.hosted.controller.api.integration.noderepository.NodeType; import com.yahoo.vespa.hosted.controller.restapi.ContainerTester; import com.yahoo.vespa.hosted.controller.restapi.ControllerContainerTest; import org.intellij.lang.annotations.Language; @@ -50,20 +51,20 @@ public class ChangeManagementApiHandlerTest extends ControllerContainerTest { private List<NodeRepositoryNode> createNodes() { List<NodeRepositoryNode> nodes = new ArrayList<>(); - nodes.add(createNode("node1", "host1", "myapp", "default", 0 )); - nodes.add(createNode("node2", "host1", "myapp", "default", 0 )); - nodes.add(createNode("node3", "host1", "myapp", "default", 0 )); - nodes.add(createNode("node4", "host2", "myapp", "default", 0 )); + nodes.add(createNode("node1", "host1", "default", 0 )); + nodes.add(createNode("node2", "host1", "default", 0 )); + nodes.add(createNode("node3", "host1", "default", 0 )); + nodes.add(createNode("node4", "host2", "default", 0 )); nodes.add(createHost("host1", "switch1")); nodes.add(createHost("host2", "switch2")); return nodes; } - private NodeOwner createOwner(String tenant, String application, String instance) { + private NodeOwner createOwner() { NodeOwner owner = new NodeOwner(); - owner.tenant = tenant; - owner.application = application; - owner.instance = instance; + owner.tenant = "mytenant"; + owner.application = "myapp"; + owner.instance = "default"; return owner; } @@ -77,13 +78,14 @@ public class ChangeManagementApiHandlerTest extends ControllerContainerTest { return membership; } - private NodeRepositoryNode createNode(String nodename, String hostname, String appName, String clusterId, int group) { + private NodeRepositoryNode createNode(String nodename, String hostname, String clusterId, int group) { NodeRepositoryNode node = new NodeRepositoryNode(); node.setHostname(nodename); node.setParentHostname(hostname); node.setState(NodeState.active); - node.setOwner(createOwner("mytenant", appName, "default")); + node.setOwner(createOwner()); node.setMembership(createMembership(clusterId, group)); + node.setType(NodeType.tenant); return node; } @@ -92,6 +94,9 @@ public class ChangeManagementApiHandlerTest extends ControllerContainerTest { NodeRepositoryNode node = new NodeRepositoryNode(); node.setHostname(hostname); node.setSwitchHostname(switchName); + node.setOwner(createOwner()); + node.setType(NodeType.host); + node.setMembership(createMembership("host", 0)); return node; } diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/changemanagement/responses/initial.json b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/changemanagement/responses/initial.json index cfe98ab906b..cf349e06cff 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/changemanagement/responses/initial.json +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/changemanagement/responses/initial.json @@ -12,7 +12,7 @@ "groupsImpact": 1, "upgradePolicy": "na", "suggestedAction": "nothing", - "impact": "na" + "impact": "Impact larger than upgrade policy" } ], "hosts": [ |