diff options
author | Jon Bratseth <bratseth@gmail.com> | 2021-07-02 10:54:31 +0200 |
---|---|---|
committer | Jon Bratseth <bratseth@gmail.com> | 2021-07-02 10:54:31 +0200 |
commit | 128cd8ceb5b395c12a81b153014e1da757f08c7a (patch) | |
tree | 8dc0df8fabfe9d42a06afa64e2412a71b782662f | |
parent | d6b435b69f8b93b744b71ed38dd5f14734e98ce0 (diff) |
Separate balanced and sparse
3 files changed, 47 insertions, 28 deletions
diff --git a/container-search/src/main/java/com/yahoo/search/dispatch/searchcluster/Group.java b/container-search/src/main/java/com/yahoo/search/dispatch/searchcluster/Group.java index b6b0c15887f..7be5cde0b14 100644 --- a/container-search/src/main/java/com/yahoo/search/dispatch/searchcluster/Group.java +++ b/container-search/src/main/java/com/yahoo/search/dispatch/searchcluster/Group.java @@ -16,16 +16,17 @@ import java.util.logging.Logger; */ public class Group { + private static final Logger log = Logger.getLogger(Group.class.getName()); + private final static double maxContentSkew = 0.10; // If documents on a node is more than 10% off from the average the group is unbalanced + private final static int minDocsPerNodeToRequireLowSkew = 100; + private final int id; private final ImmutableList<Node> nodes; - private final AtomicBoolean hasSufficientCoverage = new AtomicBoolean(true); private final AtomicBoolean hasFullCoverage = new AtomicBoolean(true); private final AtomicLong activeDocuments = new AtomicLong(0); private final AtomicBoolean isBlockingWrites = new AtomicBoolean(false); private final AtomicBoolean isBalanced = new AtomicBoolean(true); - private final static double MAX_UNBALANCE = 0.10; // If documents on a node is more than 10% off from the average the group is unbalanced - private static final Logger log = Logger.getLogger(Group.class.getName()); public Group(int id, List<Node> nodes) { this.id = id; @@ -67,24 +68,22 @@ public class Group { int numWorkingNodes = workingNodes(); if (numWorkingNodes > 0) { long average = activeDocs / numWorkingNodes; - long deviation = nodes.stream().filter(node -> node.isWorking() == Boolean.TRUE).mapToLong(node -> Math.abs(node.getActiveDocuments() - average)).sum(); - boolean isDeviationSmall = deviation <= maxUnbalance(activeDocs); - if ((!isBalanced.get() || isDeviationSmall != isBalanced.get()) && (activeDocs > 0)) { - log.info("Content is " + (isDeviationSmall ? "" : "not ") + "well balanced. Current deviation = " + deviation*100/activeDocs + " %" + - ". activeDocs = " + activeDocs + ", deviation = " + deviation + ", average = " + average); - isBalanced.set(isDeviationSmall); + long skew = nodes.stream().filter(node -> node.isWorking() == Boolean.TRUE).mapToLong(node -> Math.abs(node.getActiveDocuments() - average)).sum(); + boolean skewIsLow = skew <= activeDocs * maxContentSkew; + if (!isBalanced.get() || skewIsLow != isBalanced.get()) { + if (!isSparse()) + log.info("Content is " + (skewIsLow ? "" : "not ") + "well balanced. Current deviation = " + + skew * 100 / activeDocs + " %. activeDocs = " + activeDocs + ", skew = " + skew + + ", average = " + average); + isBalanced.set(skewIsLow); } } else { isBalanced.set(true); } } - double maxUnbalance(long activeDocs) { - return Math.max(1, activeDocs * MAX_UNBALANCE); - } - /** Returns the active documents on this group. If unknown, 0 is returned. */ - long getActiveDocuments() { return activeDocuments.get(); } + long activeDocuments() { return activeDocuments.get(); } /** Returns whether any node in this group is currently blocking write operations */ public boolean isBlockingWrites() { return isBlockingWrites.get(); } @@ -92,7 +91,10 @@ public class Group { /** Returns whether the nodes in the group have about the same number of documents */ public boolean isBalanced() { return isBalanced.get(); } - public boolean isFullCoverageStatusChanged(boolean hasFullCoverageNow) { + /** Returns whether this group has too few documents per node to expect it to be balanced */ + public boolean isSparse() { return activeDocuments.get() / nodes.size() < minDocsPerNodeToRequireLowSkew; } + + public boolean fullCoverageStatusChanged(boolean hasFullCoverageNow) { boolean previousState = hasFullCoverage.getAndSet(hasFullCoverageNow); return previousState != hasFullCoverageNow; } diff --git a/container-search/src/main/java/com/yahoo/search/dispatch/searchcluster/SearchCluster.java b/container-search/src/main/java/com/yahoo/search/dispatch/searchcluster/SearchCluster.java index f80fdb6c4a5..54d5dfc91af 100644 --- a/container-search/src/main/java/com/yahoo/search/dispatch/searchcluster/SearchCluster.java +++ b/container-search/src/main/java/com/yahoo/search/dispatch/searchcluster/SearchCluster.java @@ -14,13 +14,10 @@ import com.yahoo.search.cluster.NodeManager; import com.yahoo.search.dispatch.TopKEstimator; import com.yahoo.vespa.config.search.DispatchConfig; -import java.util.ArrayList; -import java.util.Collections; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; import java.util.Optional; -import java.util.OptionalInt; import java.util.concurrent.Executor; import java.util.logging.Logger; import java.util.stream.Collectors; @@ -311,9 +308,9 @@ public class SearchCluster implements NodeManager<Node> { // With just one group sufficient coverage may not be the same as full coverage, as the // group will always be marked sufficient for use. updateSufficientCoverage(group, true); - boolean sufficientCoverage = isGroupCoverageSufficient(group.getActiveDocuments(), - group.getActiveDocuments()); - trackGroupCoverageChanges(group, sufficientCoverage, group.getActiveDocuments()); + boolean sufficientCoverage = isGroupCoverageSufficient(group.activeDocuments(), + group.activeDocuments()); + trackGroupCoverageChanges(group, sufficientCoverage, group.activeDocuments()); } private void pingIterationCompletedMultipleGroups() { @@ -321,7 +318,7 @@ public class SearchCluster implements NodeManager<Node> { long medianDocuments = medianDocumentsPerGroup(); boolean anyGroupsSufficientCoverage = false; for (Group group : orderedGroups()) { - boolean sufficientCoverage = isGroupCoverageSufficient(group.getActiveDocuments(), + boolean sufficientCoverage = isGroupCoverageSufficient(group.activeDocuments(), medianDocuments); anyGroupsSufficientCoverage = anyGroupsSufficientCoverage || sufficientCoverage; updateSufficientCoverage(group, sufficientCoverage); @@ -331,7 +328,7 @@ public class SearchCluster implements NodeManager<Node> { private long medianDocumentsPerGroup() { if (orderedGroups().isEmpty()) return 0; - var activeDocuments = orderedGroups().stream().map(Group::getActiveDocuments).collect(Collectors.toList()); + var activeDocuments = orderedGroups().stream().map(Group::activeDocuments).collect(Collectors.toList()); return (long)Quantiles.median().compute(activeDocuments); } @@ -369,12 +366,12 @@ public class SearchCluster implements NodeManager<Node> { private void trackGroupCoverageChanges(Group group, boolean fullCoverage, long medianDocuments) { if ( ! hasInformationAboutAllNodes()) return; // Be silent until we know what we are talking about. - boolean changed = group.isFullCoverageStatusChanged(fullCoverage); + boolean changed = group.fullCoverageStatusChanged(fullCoverage); if (changed || (!fullCoverage && System.currentTimeMillis() > nextLogTime)) { nextLogTime = System.currentTimeMillis() + 30 * 1000; if (fullCoverage) { log.info("Cluster " + clusterId + ": " + group + " has full coverage. " + - "Active documents: " + group.getActiveDocuments() + "/" + medianDocuments + ", " + + "Active documents: " + group.activeDocuments() + "/" + medianDocuments + ", " + "working nodes: " + group.workingNodes() + "/" + group.nodes().size()); } else { StringBuilder unresponsive = new StringBuilder(); @@ -383,7 +380,7 @@ public class SearchCluster implements NodeManager<Node> { unresponsive.append('\n').append(node); } log.warning("Cluster " + clusterId + ": " + group + " has reduced coverage: " + - "Active documents: " + group.getActiveDocuments() + "/" + medianDocuments + ", " + + "Active documents: " + group.activeDocuments() + "/" + medianDocuments + ", " + "working nodes: " + group.workingNodes() + "/" + group.nodes().size() + ", unresponsive nodes: " + (unresponsive.toString().isEmpty() ? " none" : unresponsive)); } diff --git a/container-search/src/test/java/com/yahoo/search/dispatch/searchcluster/SearchClusterCoverageTest.java b/container-search/src/test/java/com/yahoo/search/dispatch/searchcluster/SearchClusterCoverageTest.java index ee51b983d64..8101aee74fd 100644 --- a/container-search/src/test/java/com/yahoo/search/dispatch/searchcluster/SearchClusterCoverageTest.java +++ b/container-search/src/test/java/com/yahoo/search/dispatch/searchcluster/SearchClusterCoverageTest.java @@ -101,7 +101,7 @@ public class SearchClusterCoverageTest { } @Test - public void one_group_few_docs_has_well_balanced_content() { + public void one_group_few_docs_unbalanced() { var tester = new SearchClusterTester(1, 2); Node node0 = tester.group(0).nodes().get(0); @@ -115,7 +115,27 @@ public class SearchClusterCoverageTest { node1.setActiveDocuments(0); tester.pingIterationCompleted(); - assertTrue(tester.group(0).isBalanced()); + assertFalse(tester.group(0).isBalanced()); + assertTrue(tester.group(0).isSparse()); + } + + @Test + public void one_group_many_docs_unbalanced() { + var tester = new SearchClusterTester(1, 2); + + Node node0 = tester.group(0).nodes().get(0); + Node node1 = tester.group(0).nodes().get(1); + + // 1 document + node0.setWorking(true); + node1.setWorking(true); + + node0.setActiveDocuments(1000000); + node1.setActiveDocuments(100000); + + tester.pingIterationCompleted(); + assertFalse(tester.group(0).isBalanced()); + assertFalse(tester.group(0).isSparse()); } } |