diff options
5 files changed, 30 insertions, 16 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeRepository.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeRepository.java index c707cfc08ae..26fadbf7e91 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeRepository.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeRepository.java @@ -22,6 +22,7 @@ import com.yahoo.vespa.hosted.provision.persistence.DnsNameResolver; import com.yahoo.vespa.hosted.provision.persistence.NameResolver; import java.time.Clock; +import java.time.Duration; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; @@ -466,6 +467,9 @@ public class NodeRepository extends AbstractComponent { /** Create a lock which provides exclusive rights to making changes to the given application */ public Mutex lock(ApplicationId application) { return zkClient.lock(application); } + /** Create a lock with a timeout which provides exclusive rights to making changes to the given application */ + public Mutex lock(ApplicationId application, Duration timeout) { return zkClient.lock(application, timeout); } + /** Create a lock which provides exclusive rights to changing the set of ready nodes */ public Mutex lockUnallocated() { return zkClient.lockInactive(); } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ApplicationMaintainer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ApplicationMaintainer.java index 2d5b5be4f37..00f212ea1d4 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ApplicationMaintainer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ApplicationMaintainer.java @@ -47,8 +47,10 @@ public class ApplicationMaintainer extends Maintainer { for (ApplicationId application : applications) { try { // An application might change it's state between the time the set of applications is retrieved and the - // time deployment happens. Lock on application and check if it's still active - try (Mutex lock = nodeRepository().lock(application)) { + // time deployment happens. Lock on application and check if it's still active. + // + // Lock is acquired with a low timeout to reduce the chance of colliding with an external deployment. + try (Mutex lock = nodeRepository().lock(application, Duration.ofSeconds(1))) { if (isApplicationActive(application)) { Optional<Deployment> deployment = deployer.deployFromLocalActive(application, Duration.ofMinutes(30)); if ( ! deployment.isPresent()) continue; // this will be done at another config server diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/CuratorDatabase.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/CuratorDatabase.java index b1fb7f53616..b2ffe05b2d4 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/CuratorDatabase.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/CuratorDatabase.java @@ -8,6 +8,7 @@ import com.yahoo.vespa.curator.Curator; import com.yahoo.vespa.curator.recipes.CuratorCounter; import com.yahoo.vespa.curator.transaction.CuratorTransaction; +import java.time.Duration; import java.util.Arrays; import java.util.List; import java.util.Map; @@ -16,7 +17,7 @@ import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.atomic.AtomicReference; /** - * Thius encapsulated the curator database of the node repo. + * This encapsulated the curator database of the node repo. * It serves reads from an in-memory cache of the content which is invalidated when changed on another node * using a global, shared counter. The counter is updated on all write operations, ensured by wrapping write * operations in a 2pc transaction containing the counter update. @@ -57,9 +58,9 @@ public class CuratorDatabase { /** Create a reentrant lock */ // Locks are not cached in the in-memory state - public CuratorMutex lock(Path path) { + public CuratorMutex lock(Path path, Duration timeout) { CuratorMutex lock = locks.computeIfAbsent(path, (pathArg) -> new CuratorMutex(pathArg.getAbsolute(), curator.framework())); - lock.acquire(); + lock.acquire(timeout); return lock; } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/CuratorDatabaseClient.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/CuratorDatabaseClient.java index 5218f92c5ae..dc6bda01619 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/CuratorDatabaseClient.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/CuratorDatabaseClient.java @@ -5,6 +5,7 @@ import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.datatype.joda.JodaModule; import com.yahoo.config.provision.ApplicationId; import com.yahoo.config.provision.Environment; +import com.yahoo.config.provision.NodeFlavors; import com.yahoo.config.provision.Zone; import com.yahoo.log.LogLevel; import com.yahoo.path.Path; @@ -13,11 +14,10 @@ import com.yahoo.vespa.curator.Curator; import com.yahoo.vespa.curator.transaction.CuratorOperations; import com.yahoo.vespa.curator.transaction.CuratorTransaction; import com.yahoo.vespa.hosted.provision.Node; -import com.yahoo.vespa.hosted.provision.node.History; -import com.yahoo.config.provision.NodeFlavors; import com.yahoo.vespa.hosted.provision.node.Status; import java.time.Clock; +import java.time.Duration; import java.util.ArrayList; import java.util.Collections; import java.util.List; @@ -39,6 +39,8 @@ public class CuratorDatabaseClient { private static final Path root = Path.fromString("/provision/v1"); + private static final Duration defaultLockTimeout = Duration.ofMinutes(1); + private final NodeSerializer nodeSerializer; private final CuratorDatabase curatorDatabase; @@ -262,17 +264,21 @@ public class CuratorDatabaseClient { /** Acquires the single cluster-global, reentrant lock for all non-active nodes */ public CuratorMutex lockInactive() { - return lock(root.append("locks").append("unallocatedLock")); + return lock(root.append("locks").append("unallocatedLock"), defaultLockTimeout); } /** Acquires the single cluster-global, reentrant lock for active nodes of this application */ public CuratorMutex lock(ApplicationId application) { - return lock(lockPath(application)); + return lock(lockPath(application), defaultLockTimeout); } - /** Acquires the single cluster-global, reentrant lock for all non-active nodes */ - public CuratorMutex lock(Path path) { - return curatorDatabase.lock(path); + /** Acquires the single cluster-global, reentrant lock with the specified timeout for active nodes of this application */ + public CuratorMutex lock(ApplicationId application, Duration timeout) { + return lock(lockPath(application), timeout); + } + + private CuratorMutex lock(Path path, Duration timeout) { + return curatorDatabase.lock(path, timeout); } } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/CuratorMutex.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/CuratorMutex.java index 6f8a7aae3d5..62be5c649db 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/CuratorMutex.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/CuratorMutex.java @@ -5,6 +5,7 @@ import com.yahoo.transaction.Mutex; import org.apache.curator.framework.CuratorFramework; import org.apache.curator.framework.recipes.locks.InterProcessMutex; +import java.time.Duration; import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeoutException; @@ -23,12 +24,12 @@ public class CuratorMutex implements Mutex { mutex = new InterProcessMutex(curator, lockPath); } - /** Take the lock. This may be called multiple times from the same thread - each matched by a close */ - public void acquire() { + /** Take the lock with the given timeout. This may be called multiple times from the same thread - each matched by a close */ + public void acquire(Duration timeout) { try { - boolean acquired = mutex.acquire(60, TimeUnit.SECONDS); + boolean acquired = mutex.acquire(timeout.toMillis(), TimeUnit.MILLISECONDS); if ( ! acquired) { - throw new TimeoutException("Timed out after waiting 60 seconds"); + throw new TimeoutException("Timed out after waiting " + timeout.toString()); } } catch (Exception e) { |