diff options
author | mgimle <michael@gimle.io> | 2019-07-15 11:49:42 +0200 |
---|---|---|
committer | mgimle <michael@gimle.io> | 2019-07-15 11:58:25 +0200 |
commit | 9a98cc68d4a1f66e65c936cae9bd89b15972b661 (patch) | |
tree | 0869478ba7e2ea661c8bd26552d00eaf71a788c1 | |
parent | 2a51bf3566624e069ac14e3f072e23b31907d78f (diff) |
Added endpoint for explaining the conclusions of the host capacity checker.
6 files changed, 273 insertions, 0 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/v2/HostCapacityResponse.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/v2/HostCapacityResponse.java new file mode 100644 index 00000000000..be8de0d4475 --- /dev/null +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/v2/HostCapacityResponse.java @@ -0,0 +1,161 @@ +package com.yahoo.vespa.hosted.provision.restapi.v2; + +import com.fasterxml.jackson.databind.ObjectMapper; +import com.yahoo.container.jdisc.HttpRequest; +import com.yahoo.container.jdisc.HttpResponse; +import com.yahoo.slime.Cursor; +import com.yahoo.slime.JsonFormat; +import com.yahoo.slime.Slime; +import com.yahoo.vespa.hosted.provision.Node; +import com.yahoo.vespa.hosted.provision.NodeRepository; +import com.yahoo.vespa.hosted.provision.maintenance.CapacityChecker; + +import java.io.IOException; +import java.io.OutputStream; +import java.util.Arrays; +import java.util.List; +import java.util.Optional; + +public class HostCapacityResponse extends HttpResponse { + private final StringBuilder text; + private final Slime slime; + private final CapacityChecker capacityChecker; + private final boolean json; + + public HostCapacityResponse(NodeRepository nodeRepository, HttpRequest request) { + super(200); + capacityChecker = new CapacityChecker(nodeRepository); + + json = request.getBooleanProperty("json"); + String hostsJson = request.getProperty("hosts"); + + text = new StringBuilder(); + slime = new Slime(); + Cursor root = slime.setObject(); + + if (hostsJson != null) { + ObjectMapper om = new ObjectMapper(); + String[] hostsArray; + try { + hostsArray = om.readValue(hostsJson, String[].class); + } catch (Exception e) { + throw new IllegalArgumentException(e.getMessage()); + } + List<String> hostNames = Arrays.asList(hostsArray); + List<Node> hosts; + try { + hosts = capacityChecker.nodesFromHostnames(hostNames); + } catch (IllegalArgumentException e) { + throw new NotFoundException(e.getMessage()); + } + var failure = capacityChecker.findHostRemovalFailure(hosts); + if (failure.isPresent() && failure.get().failureReason.failureReasons.size() == 0) { + root.setBool("removalPossible", false); + error(root, "Removing all hosts is trivially impossible."); + } else { + if (json) hostLossPossibleToSlime(root, failure, hosts); + else hostLossPossibleToText(failure, hosts); + } + } else { + var failurePath = capacityChecker.worstCaseHostLossLeadingToFailure(); + if (failurePath.isPresent()) { + if (json) zoneFailurePathToSlime(root, failurePath.get()); + else zoneFailurePathToText(failurePath.get()); + } else { + error(root, "Node repository contained no hosts."); + } + } + } + + private void error(Cursor root, String errorMessage) { + if (json) root.setString("error", errorMessage); + else text.append(errorMessage); + } + + private void hostLossPossibleToText(Optional<CapacityChecker.HostFailurePath> failure, List<Node> hostsToRemove) { + text.append(String.format("Attempting to remove %d hosts: ", hostsToRemove.size())); + CapacityChecker.AllocationHistory history = capacityChecker.allocationHistory; + if (failure.isEmpty()) { + text.append("OK\n\n"); + text.append(history); + if (history.oldParents().size() != hostsToRemove.size()) { + long emptyHostCount = hostsToRemove.size() - history.oldParents().size(); + text.append(String.format("\nTrivially removed %d empty host%s.", emptyHostCount, emptyHostCount > 1 ? "s" : "")); + } + } else { + text.append("FAILURE\n\n"); + text.append(history).append("\n"); + text.append(failure.get().failureReason).append("\n\n"); + } + } + + private void zoneFailurePathToText(CapacityChecker.HostFailurePath failurePath) { + text.append(String.format("Found %d hosts. Failure upon trying to remove %d hosts:\n\n", + capacityChecker.getHosts().size(), + failurePath.hostsCausingFailure.size())); + text.append(capacityChecker.allocationHistory).append("\n"); + text.append(failurePath.failureReason); + } + + private void hostLossPossibleToSlime(Cursor root, Optional<CapacityChecker.HostFailurePath> failure, List<Node> hostsToRemove) { + var hosts = root.setArray("hostsToRemove"); + hostsToRemove.forEach(h -> hosts.addString(h.hostname())); + CapacityChecker.AllocationHistory history = capacityChecker.allocationHistory; + if (failure.isEmpty()) { + root.setBool("removalPossible", true); + } else { + root.setBool("removalPossible", false); + } + var arr = root.setArray("history"); + for (var entry : history.historyEntries) { + var object = arr.addObject(); + object.setString("tenant", entry.tenant.hostname()); + if (entry.newParent != null) { + object.setString("newParent", entry.newParent.hostname()); + } + object.setLong("eligibleParents", entry.eligibleParents); + } + } + + public void zoneFailurePathToSlime(Cursor object, CapacityChecker.HostFailurePath failurePath) { + object.setLong("totalHosts", capacityChecker.getHosts().size()); + object.setLong("couldLoseHosts", failurePath.hostsCausingFailure.size()); + failurePath.failureReason.host.ifPresent(host -> + object.setString("failedTenantParent", host.hostname()) + ); + failurePath.failureReason.tenant.ifPresent(tenant -> { + object.setString("failedTenant", tenant.hostname()); + object.setString("failedTenantResources", tenant.flavor().resources().toString()); + tenant.allocation().ifPresent(allocation -> + object.setString("failedTenantAllocation", allocation.toString()) + ); + var explanation = object.setObject("hostCandidateRejectionReasons"); + allocationFailureReasonListToSlime(explanation.setObject("singularReasonFailures"), + failurePath.failureReason.failureReasons.singularReasonFailures()); + allocationFailureReasonListToSlime(explanation.setObject("totalFailures"), + failurePath.failureReason.failureReasons); + }); + var details = object.setObject("details"); + hostLossPossibleToSlime(details, Optional.of(failurePath), failurePath.hostsCausingFailure); + } + + private void allocationFailureReasonListToSlime(Cursor root, CapacityChecker.AllocationFailureReasonList allocationFailureReasonList) { + root.setLong("insufficientVcpu", allocationFailureReasonList.insufficientVcpu()); + root.setLong("insufficientMemoryGb", allocationFailureReasonList.insufficientMemoryGb()); + root.setLong("insufficientDiskGb", allocationFailureReasonList.insufficientDiskGb()); + root.setLong("incompatibleDiskSpeed", allocationFailureReasonList.incompatibleDiskSpeed()); + root.setLong("insufficientAvailableIps", allocationFailureReasonList.insufficientAvailableIps()); + root.setLong("violatesParentHostPolicy", allocationFailureReasonList.violatesParentHostPolicy()); + } + + @Override + public void render(OutputStream stream) throws IOException { + if (json) new JsonFormat(true).encode(stream, slime); + else stream.write(text.toString().getBytes()); + } + + @Override + public String getContentType() { + return json ? "application/json" : "text/plain"; + } +} diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/v2/NodesApiHandler.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/v2/NodesApiHandler.java index 22318f1ddb4..e036124e489 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/v2/NodesApiHandler.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/v2/NodesApiHandler.java @@ -102,6 +102,7 @@ public class NodesApiHandler extends LoggingRequestHandler { if (path.equals( "/nodes/v2/command/")) return ResourcesResponse.fromStrings(request.getUri(), "restart", "reboot"); if (path.equals( "/nodes/v2/maintenance/")) return new JobsResponse(nodeRepository.jobControl()); if (path.equals( "/nodes/v2/upgrade/")) return new UpgradeResponse(nodeRepository.infrastructureVersions(), nodeRepository.osVersions(), nodeRepository.dockerImages()); + if (path.startsWith("/nodes/v2/capacity/")) return new HostCapacityResponse(nodeRepository, request); throw new NotFoundException("Nothing at path '" + path + "'"); } diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/v2/RestApiTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/v2/RestApiTest.java index bfb24d30284..35fa5adaeff 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/v2/RestApiTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/v2/RestApiTest.java @@ -21,6 +21,7 @@ import java.io.IOException; import java.nio.charset.CharacterCodingException; import java.nio.charset.StandardCharsets; import java.util.Arrays; +import java.util.List; import java.util.regex.Pattern; import java.util.stream.Collectors; @@ -819,6 +820,30 @@ public class RestApiTest { "{\"message\":\"Cancelled outstanding requests for firmware checks\"}"); } + @Test + public void test_capacity() throws Exception { + assertFile(new Request("http://localhost:8080/nodes/v2/capacity/?json=true"), "capacity-zone.json"); + + List<String> hostsToRemove = List.of( + "%22dockerhost1.yahoo.com%22", + "%22dockerhost2.yahoo.com%22", + "%22dockerhost3.yahoo.com%22", + "%22dockerhost4.yahoo.com%22" + ); + String requestUriTemplate = + "http://localhost:8080/nodes/v2/capacity/?json=true&hosts=[%s]" + .replaceAll("\\[", "%%5B") + .replaceAll("]", "%%5D"); + + assertFile(new Request(String.format(requestUriTemplate, + String.join(",", hostsToRemove.subList(0, 3)))), + "capacity-hostremoval-possible.json"); + assertFile(new Request(String.format(requestUriTemplate, + String.join(",", hostsToRemove))), + "capacity-hostremoval-impossible.json"); + } + + /** Tests the rendering of each node separately to make it easier to find errors */ @Test public void test_single_node_rendering() throws Exception { diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/v2/responses/capacity-hostremoval-impossible.json b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/v2/responses/capacity-hostremoval-impossible.json new file mode 100644 index 00000000000..f3c73e61c91 --- /dev/null +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/v2/responses/capacity-hostremoval-impossible.json @@ -0,0 +1,20 @@ +{ + "hostsToRemove": [ + "dockerhost1.yahoo.com", + "dockerhost2.yahoo.com", + "dockerhost3.yahoo.com", + "dockerhost4.yahoo.com" + ], + "removalPossible": false, + "history": [ + { + "tenant": "host4.yahoo.com", + "newParent": "dockerhost5.yahoo.com", + "eligibleParents": 1 + }, + { + "tenant": "test-node-pool-101-2", + "eligibleParents": 0 + } + ] +}
\ No newline at end of file diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/v2/responses/capacity-hostremoval-possible.json b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/v2/responses/capacity-hostremoval-possible.json new file mode 100644 index 00000000000..b896fd9d63a --- /dev/null +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/v2/responses/capacity-hostremoval-possible.json @@ -0,0 +1,20 @@ +{ + "hostsToRemove": [ + "dockerhost1.yahoo.com", + "dockerhost2.yahoo.com", + "dockerhost3.yahoo.com" + ], + "removalPossible": true, + "history": [ + { + "tenant": "host4.yahoo.com", + "newParent": "dockerhost4.yahoo.com", + "eligibleParents": 2 + }, + { + "tenant": "test-node-pool-101-2", + "newParent": "dockerhost5.yahoo.com", + "eligibleParents": 1 + } + ] +}
\ No newline at end of file diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/v2/responses/capacity-zone.json b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/v2/responses/capacity-zone.json new file mode 100644 index 00000000000..9895948e69d --- /dev/null +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/v2/responses/capacity-zone.json @@ -0,0 +1,46 @@ +{ + "totalHosts": 5, + "couldLoseHosts": 4, + "failedTenantParent": "dockerhost1.yahoo.com", + "failedTenant": "host4.yahoo.com", + "failedTenantResources": "[vcpu: 1.0, memory: 1.0 Gb, disk 100.0 Gb]", + "failedTenantAllocation": "allocated to tenant3.application3.instance3 as 'content/id3/0/0'", + "hostCandidateRejectionReasons": { + "singularReasonFailures": { + "insufficientVcpu": 0, + "insufficientMemoryGb": 0, + "insufficientDiskGb": 0, + "incompatibleDiskSpeed": 0, + "insufficientAvailableIps": 0, + "violatesParentHostPolicy": 1 + }, + "totalFailures": { + "insufficientVcpu": 0, + "insufficientMemoryGb": 0, + "insufficientDiskGb": 0, + "incompatibleDiskSpeed": 0, + "insufficientAvailableIps": 0, + "violatesParentHostPolicy": 1 + } + }, + "details": { + "hostsToRemove": [ + "dockerhost2.yahoo.com", + "dockerhost1.yahoo.com", + "dockerhost4.yahoo.com", + "dockerhost3.yahoo.com" + ], + "removalPossible": false, + "history": [ + { + "tenant": "test-node-pool-101-2", + "newParent": "dockerhost5.yahoo.com", + "eligibleParents": 1 + }, + { + "tenant": "host4.yahoo.com", + "eligibleParents": 0 + } + ] + } +}
\ No newline at end of file |