diff options
author | Harald Musum <musum@yahooinc.com> | 2022-04-19 14:09:21 +0200 |
---|---|---|
committer | Harald Musum <musum@yahooinc.com> | 2022-04-19 14:09:21 +0200 |
commit | 9ebad8ca92af962da93731b7054f5e5302b0aaf2 (patch) | |
tree | d11befad5ed555e2ce8e81784edb066e08ae5dec /node-repository | |
parent | 1e4a7bd6c22f52d7c6cd57b6f48dbfa5a89388b5 (diff) |
Use timeout when getting locks with NodePatcher
Client has a timeout of 10 seconds, using a much larger timeout
(as is the default) might lead to lots of requests waiting for the
lock
Diffstat (limited to 'node-repository')
3 files changed, 28 insertions, 4 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java index d544ea76983..3aa09b1b667 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java @@ -773,12 +773,15 @@ public class Nodes { public Mutex lockUnallocated() { return db.lockInactive(); } /** Returns the unallocated/application lock, and the node acquired under that lock. */ - public Optional<NodeMutex> lockAndGet(Node node) { + public Optional<NodeMutex> lockAndGet(Node node) { return lockAndGet(node, Optional.empty()); } + + /** Returns the unallocated/application lock, and the node acquired under that lock. */ + public Optional<NodeMutex> lockAndGet(Node node, Optional<Duration> timeout) { Node staleNode = node; final int maxRetries = 4; for (int i = 0; i < maxRetries; ++i) { - Mutex lockToClose = lock(staleNode); + Mutex lockToClose = timeout.isPresent() ? lock(staleNode, timeout.get()) : lock(staleNode); try { // As an optimization we first try finding the node in the same state Optional<Node> freshNode = node(staleNode.hostname(), staleNode.state()); @@ -813,6 +816,11 @@ public class Nodes { } /** Returns the unallocated/application lock, and the node acquired under that lock. */ + public Optional<NodeMutex> lockAndGet(String hostname, Duration timeout) { + return node(hostname).flatMap(node -> lockAndGet(node, Optional.of(timeout))); + } + + /** Returns the unallocated/application lock, and the node acquired under that lock. */ public NodeMutex lockAndGetRequired(Node node) { return lockAndGet(node).orElseThrow(() -> new NoSuchNodeException("No node with hostname '" + node.hostname() + "'")); } @@ -822,10 +830,19 @@ public class Nodes { return lockAndGet(hostname).orElseThrow(() -> new NoSuchNodeException("No node with hostname '" + hostname + "'")); } + /** Returns the unallocated/application lock, and the node acquired under that lock. */ + public NodeMutex lockAndGetRequired(String hostname, Duration timeout) { + return lockAndGet(hostname, timeout).orElseThrow(() -> new NoSuchNodeException("No node with hostname '" + hostname + "'")); + } + private Mutex lock(Node node) { return node.allocation().isPresent() ? lock(node.allocation().get().owner()) : lockUnallocated(); } + private Mutex lock(Node node, Duration timeout) { + return node.allocation().isPresent() ? lock(node.allocation().get().owner(), timeout) : lockUnallocated(); + } + private Node requireNode(String hostname) { return node(hostname).orElseThrow(() -> new NoSuchNodeException("No node with hostname '" + hostname + "'")); } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/CuratorDatabaseClient.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/CuratorDatabaseClient.java index a9abc352d8c..13dd458c041 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/CuratorDatabaseClient.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/CuratorDatabaseClient.java @@ -69,6 +69,7 @@ public class CuratorDatabaseClient { private static final Path firmwareCheckPath = root.append("firmwareCheck"); private static final Path archiveUrisPath = root.append("archiveUris"); + // TODO: Explain reasoning behind timeout value (why its it as high as 10 minutes?) private static final Duration defaultLockTimeout = Duration.ofMinutes(10); private final NodeSerializer nodeSerializer; @@ -319,7 +320,12 @@ public class CuratorDatabaseClient { /** Acquires the single cluster-global, reentrant lock for all non-active nodes */ public Lock lockInactive() { - return db.lock(lockPath.append("unallocatedLock"), defaultLockTimeout); + return lockInactive(defaultLockTimeout); + } + + /** Acquires the single cluster-global, reentrant lock for all non-active nodes */ + public Lock lockInactive(Duration timeout) { + return db.lock(lockPath.append("unallocatedLock"), timeout); } /** Acquires the single cluster-global, reentrant lock for active nodes of this application */ diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodePatcher.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodePatcher.java index fa6c44e6851..d8e1828b10c 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodePatcher.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodePatcher.java @@ -29,6 +29,7 @@ import com.yahoo.yolean.Exceptions; import java.io.InputStream; import java.time.Clock; +import java.time.Duration; import java.time.Instant; import java.util.ArrayList; import java.util.HashMap; @@ -86,7 +87,7 @@ public class NodePatcher { Map<String, Inspector> recursiveFields = Maps.filterKeys(fields, RECURSIVE_FIELDS::contains); // Patch - NodeMutex nodeMutex = nodeRepository.nodes().lockAndGetRequired(hostname); + NodeMutex nodeMutex = nodeRepository.nodes().lockAndGetRequired(hostname, Duration.ofSeconds(10)); // timeout should match the one used by clients patch(nodeMutex, regularFields, root, false); patchIpConfig(hostname, ipConfigFields); if (nodeMutex.node().type().isHost()) { |