aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorHÃ¥kon Hallingstad <hakon@verizonmedia.com>2020-03-07 17:00:19 +0100
committerGitHub <noreply@github.com>2020-03-07 17:00:19 +0100
commit3eb6aa9161ca016d8db3914970054d5519052c45 (patch)
tree49ffc47b8092dc81508da8b40ec7ff88ac64b275
parent71cf26a0abe0b62b9d3038c86351e69a3dee640e (diff)
parent92a2c10c684ee128b25e2128433451db438fc14a (diff)
Merge pull request #12496 from vespa-engine/hakonhall/throw-if-accessing-duper-model-while-holding-status-service-application-lock
Throw if accessing duper model while holding status service application lock
-rw-r--r--orchestrator/src/main/java/com/yahoo/vespa/orchestrator/status/ZkStatusService.java18
-rw-r--r--orchestrator/src/test/java/com/yahoo/vespa/orchestrator/DummyAntiServiceMonitor.java15
-rw-r--r--orchestrator/src/test/java/com/yahoo/vespa/orchestrator/DummyServiceMonitor.java9
-rw-r--r--orchestrator/src/test/java/com/yahoo/vespa/orchestrator/OrchestratorImplTest.java14
-rw-r--r--orchestrator/src/test/java/com/yahoo/vespa/orchestrator/model/ModelTestUtils.java9
-rw-r--r--orchestrator/src/test/java/com/yahoo/vespa/orchestrator/resources/HostResourceTest.java6
-rw-r--r--orchestrator/src/test/java/com/yahoo/vespa/orchestrator/status/ZkStatusService2Test.java35
-rw-r--r--orchestrator/src/test/java/com/yahoo/vespa/orchestrator/status/ZkStatusServiceTest.java28
-rw-r--r--service-monitor/src/main/java/com/yahoo/vespa/service/duper/CriticalRegionChecker.java63
-rw-r--r--service-monitor/src/main/java/com/yahoo/vespa/service/duper/DuperModelManager.java92
-rw-r--r--service-monitor/src/main/java/com/yahoo/vespa/service/model/ServiceMonitorImpl.java9
-rw-r--r--service-monitor/src/main/java/com/yahoo/vespa/service/monitor/AntiServiceMonitor.java25
-rw-r--r--service-monitor/src/main/java/com/yahoo/vespa/service/monitor/CriticalRegion.java18
13 files changed, 289 insertions, 52 deletions
diff --git a/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/status/ZkStatusService.java b/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/status/ZkStatusService.java
index 9874903e976..3bf0d886a41 100644
--- a/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/status/ZkStatusService.java
+++ b/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/status/ZkStatusService.java
@@ -16,6 +16,8 @@ import com.yahoo.vespa.flags.FlagSource;
import com.yahoo.vespa.flags.Flags;
import com.yahoo.vespa.orchestrator.OrchestratorContext;
import com.yahoo.vespa.orchestrator.OrchestratorUtil;
+import com.yahoo.vespa.service.monitor.AntiServiceMonitor;
+import com.yahoo.vespa.service.monitor.CriticalRegion;
import org.apache.zookeeper.data.Stat;
import javax.inject.Inject;
@@ -46,6 +48,7 @@ public class ZkStatusService implements StatusService {
private final Metric metric;
private final Timer timer;
private final boolean doCleanup;
+ private final AntiServiceMonitor antiServiceMonitor;
/**
* A cache of metric contexts for each possible dimension map. In practice, there is one dimension map
@@ -58,21 +61,25 @@ public class ZkStatusService implements StatusService {
@Component Curator curator,
@Component Metric metric,
@Component Timer timer,
- @Component FlagSource flagSource) {
+ @Component FlagSource flagSource,
+ @Component AntiServiceMonitor antiServiceMonitor) {
this(curator,
metric,
timer,
new HostInfosCache(curator, new HostInfosServiceImpl(curator, timer)),
- Flags.CLEANUP_STATUS_SERVICE.bindTo(flagSource).value());
+ Flags.CLEANUP_STATUS_SERVICE.bindTo(flagSource).value(),
+ antiServiceMonitor);
}
/** Non-private for testing only. */
- ZkStatusService(Curator curator, Metric metric, Timer timer, HostInfosCache hostInfosCache, boolean doCleanup) {
+ ZkStatusService(Curator curator, Metric metric, Timer timer, HostInfosCache hostInfosCache,
+ boolean doCleanup, AntiServiceMonitor antiServiceMonitor) {
this.curator = curator;
this.metric = metric;
this.timer = timer;
this.hostInfosCache = hostInfosCache;
this.doCleanup = doCleanup;
+ this.antiServiceMonitor = antiServiceMonitor;
}
@Override
@@ -186,6 +193,9 @@ public class ZkStatusService implements StatusService {
metric.add(acquireResultMetricName, 1, metricContext);
}
+ CriticalRegion inaccessibleDuperModelRegion = antiServiceMonitor
+ .disallowDuperModelLockAcquisition(ZkStatusService.class.getSimpleName() + " application lock");
+
return () -> {
try {
lock.close();
@@ -197,6 +207,8 @@ public class ZkStatusService implements StatusService {
e);
}
+ inaccessibleDuperModelRegion.close();
+
Instant lockReleasedTime = timer.currentTime();
double seconds = durationInSeconds(acquireEndTime, lockReleasedTime);
metric.set("orchestrator.lock.hold-latency", seconds, metricContext);
diff --git a/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/DummyAntiServiceMonitor.java b/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/DummyAntiServiceMonitor.java
new file mode 100644
index 00000000000..d22d99b5c9c
--- /dev/null
+++ b/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/DummyAntiServiceMonitor.java
@@ -0,0 +1,15 @@
+// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.vespa.orchestrator;
+
+import com.yahoo.vespa.service.monitor.AntiServiceMonitor;
+import com.yahoo.vespa.service.monitor.CriticalRegion;
+
+/**
+ * @author hakonhall
+ */
+public class DummyAntiServiceMonitor implements AntiServiceMonitor {
+ @Override
+ public CriticalRegion disallowDuperModelLockAcquisition(String regionDescription) {
+ return () -> {};
+ }
+}
diff --git a/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/DummyServiceMonitor.java b/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/DummyServiceMonitor.java
index 0da4c48d408..a784187fb62 100644
--- a/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/DummyServiceMonitor.java
+++ b/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/DummyServiceMonitor.java
@@ -15,6 +15,8 @@ import com.yahoo.vespa.applicationmodel.ServiceType;
import com.yahoo.vespa.applicationmodel.TenantId;
import com.yahoo.vespa.orchestrator.model.NodeGroup;
import com.yahoo.vespa.orchestrator.model.VespaModelUtil;
+import com.yahoo.vespa.service.monitor.AntiServiceMonitor;
+import com.yahoo.vespa.service.monitor.CriticalRegion;
import com.yahoo.vespa.service.monitor.ServiceModel;
import com.yahoo.vespa.service.monitor.ServiceMonitor;
@@ -31,7 +33,7 @@ import java.util.stream.Collectors;
* @author oyving
* @author smorgrav
*/
-public class DummyServiceMonitor implements ServiceMonitor {
+public class DummyServiceMonitor implements ServiceMonitor, AntiServiceMonitor {
public static final HostName TEST1_HOST_NAME = new HostName("test1.hostname.tld");
public static final HostName TEST3_HOST_NAME = new HostName("test3.hostname.tld");
@@ -165,6 +167,11 @@ public class DummyServiceMonitor implements ServiceMonitor {
throw new UnsupportedOperationException();
}
+ @Override
+ public CriticalRegion disallowDuperModelLockAcquisition(String regionDescription) {
+ return () -> {};
+ }
+
public static Set<HostName> getContentHosts(ApplicationInstanceReference appRef) {
Set<HostName> hosts = apps.stream()
.filter(application -> application.reference().equals(appRef))
diff --git a/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/OrchestratorImplTest.java b/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/OrchestratorImplTest.java
index 44e11fb794b..f49dc490fe6 100644
--- a/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/OrchestratorImplTest.java
+++ b/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/OrchestratorImplTest.java
@@ -76,7 +76,12 @@ public class OrchestratorImplTest {
private final ApplicationApiFactory applicationApiFactory = new ApplicationApiFactory(3);
private final InMemoryFlagSource flagSource = new InMemoryFlagSource();
private final MockCurator curator = new MockCurator();
- private ZkStatusService statusService = new ZkStatusService(curator, mock(Metric.class), new TestTimer(), flagSource);
+ private ZkStatusService statusService = new ZkStatusService(
+ curator,
+ mock(Metric.class),
+ new TestTimer(),
+ flagSource,
+ new DummyAntiServiceMonitor());
private ApplicationId app1;
private ApplicationId app2;
@@ -394,7 +399,12 @@ public class OrchestratorImplTest {
@Test
public void testGetHost() throws Exception {
ClusterControllerClientFactory clusterControllerClientFactory = new ClusterControllerClientFactoryMock();
- StatusService statusService = new ZkStatusService(new MockCurator(), mock(Metric.class), new TestTimer(), flagSource);
+ StatusService statusService = new ZkStatusService(
+ new MockCurator(),
+ mock(Metric.class),
+ new TestTimer(),
+ flagSource,
+ new DummyAntiServiceMonitor());
HostName hostName = new HostName("host.yahoo.com");
TenantId tenantId = new TenantId("tenant");
diff --git a/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/model/ModelTestUtils.java b/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/model/ModelTestUtils.java
index e6a14f75b24..88e9d1c90fe 100644
--- a/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/model/ModelTestUtils.java
+++ b/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/model/ModelTestUtils.java
@@ -17,6 +17,7 @@ import com.yahoo.vespa.applicationmodel.ServiceType;
import com.yahoo.vespa.applicationmodel.TenantId;
import com.yahoo.vespa.curator.mock.MockCurator;
import com.yahoo.vespa.flags.InMemoryFlagSource;
+import com.yahoo.vespa.orchestrator.DummyAntiServiceMonitor;
import com.yahoo.vespa.orchestrator.OrchestrationException;
import com.yahoo.vespa.orchestrator.Orchestrator;
import com.yahoo.vespa.orchestrator.OrchestratorContext;
@@ -57,9 +58,13 @@ class ModelTestUtils {
private final Map<ApplicationInstanceReference, ApplicationInstance> applications = new HashMap<>();
private final ClusterControllerClientFactory clusterControllerClientFactory = new ClusterControllerClientFactoryMock();
private final Map<HostName, HostStatus> hostStatusMap = new HashMap<>();
- private final StatusService statusService = new ZkStatusService(new MockCurator(), mock(Metric.class), new TestTimer(), flagSource);
- private final TestTimer timer = new TestTimer();
private final ServiceMonitor serviceMonitor = () -> new ServiceModel(applications);
+ private final StatusService statusService = new ZkStatusService(
+ new MockCurator(),
+ mock(Metric.class),
+ new TestTimer(),
+ flagSource,
+ new DummyAntiServiceMonitor());
private final Orchestrator orchestrator = new OrchestratorImpl(new HostedVespaPolicy(new HostedVespaClusterPolicy(), clusterControllerClientFactory, applicationApiFactory()),
clusterControllerClientFactory,
statusService,
diff --git a/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/resources/HostResourceTest.java b/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/resources/HostResourceTest.java
index 1bf3b267a05..ffb79b4edf6 100644
--- a/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/resources/HostResourceTest.java
+++ b/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/resources/HostResourceTest.java
@@ -19,6 +19,7 @@ import com.yahoo.vespa.curator.mock.MockCurator;
import com.yahoo.vespa.flags.InMemoryFlagSource;
import com.yahoo.vespa.orchestrator.BatchHostNameNotFoundException;
import com.yahoo.vespa.orchestrator.BatchInternalErrorException;
+import com.yahoo.vespa.orchestrator.DummyAntiServiceMonitor;
import com.yahoo.vespa.orchestrator.Host;
import com.yahoo.vespa.orchestrator.HostNameNotFoundException;
import com.yahoo.vespa.orchestrator.OrchestrationException;
@@ -79,11 +80,12 @@ public class HostResourceTest {
private static final int SERVICE_MONITOR_CONVERGENCE_LATENCY_SECONDS = 0;
private static final TenantId TENANT_ID = new TenantId("tenantId");
private static final ApplicationInstanceId APPLICATION_INSTANCE_ID = new ApplicationInstanceId("applicationId");
+ private static final ServiceMonitor serviceMonitor = mock(ServiceMonitor.class);
private static final StatusService EVERY_HOST_IS_UP_HOST_STATUS_SERVICE = new ZkStatusService(
- new MockCurator(), mock(Metric.class), new TestTimer(), new InMemoryFlagSource());
+ new MockCurator(), mock(Metric.class), new TestTimer(), new InMemoryFlagSource(),
+ new DummyAntiServiceMonitor());
private static final ApplicationApiFactory applicationApiFactory = new ApplicationApiFactory(3);
- private static final ServiceMonitor serviceMonitor = mock(ServiceMonitor.class);
static {
when(serviceMonitor.getApplication(any(HostName.class)))
.thenReturn(Optional.of(
diff --git a/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/status/ZkStatusService2Test.java b/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/status/ZkStatusService2Test.java
index 4c01572f787..e8cfe15611e 100644
--- a/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/status/ZkStatusService2Test.java
+++ b/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/status/ZkStatusService2Test.java
@@ -9,6 +9,8 @@ import com.yahoo.vespa.applicationmodel.ApplicationInstanceReference;
import com.yahoo.vespa.applicationmodel.TenantId;
import com.yahoo.vespa.curator.Curator;
import com.yahoo.vespa.orchestrator.OrchestratorContext;
+import com.yahoo.vespa.service.monitor.AntiServiceMonitor;
+import com.yahoo.vespa.service.monitor.CriticalRegion;
import org.apache.curator.framework.recipes.locks.InterProcessMutex;
import org.junit.Test;
import org.mockito.ArgumentCaptor;
@@ -32,7 +34,10 @@ public class ZkStatusService2Test {
private final Timer timer = new TestTimer();
private final Metric metric = mock(Metric.class);
private final HostInfosCache cache = mock(HostInfosCache.class);
- private final ZkStatusService zkStatusService = new ZkStatusService(curator, metric, timer, cache, false);
+ private final CriticalRegion criticalRegion = mock(CriticalRegion.class);
+ private final AntiServiceMonitor antiServiceMonitor = mock(AntiServiceMonitor.class);
+ private final ZkStatusService zkStatusService =
+ new ZkStatusService(curator, metric, timer, cache, false, antiServiceMonitor);
private final OrchestratorContext context = mock(OrchestratorContext.class);
private final InterProcessMutex mutex = mock(InterProcessMutex.class);
@@ -47,34 +52,43 @@ public class ZkStatusService2Test {
when(curator.createMutex(any())).thenReturn(mutex);
when(mutex.acquire(anyLong(), any())).thenReturn(true);
+ when(antiServiceMonitor.disallowDuperModelLockAcquisition(any())).thenReturn(criticalRegion);
when(context.getTimeLeft()).thenReturn(Duration.ofSeconds(12));
+ verify(antiServiceMonitor, times(0)).disallowDuperModelLockAcquisition(any());
try (ApplicationLock lock = zkStatusService.lockApplication(context, reference)) {
- // nothing
+ verify(antiServiceMonitor, times(1)).disallowDuperModelLockAcquisition(any());
+
+ verify(criticalRegion, times(0)).close();
}
+ verify(criticalRegion, times(1)).close();
verify(curator, times(1)).createMutex(any());
verify(mutex, times(1)).acquire(anyLong(), any());
verify(mutex, times(1)).release();
verify(context, times(1)).hasLock(any());
verify(context, times(1)).registerLockAcquisition(any(), any());
- verifyNoMoreInteractions(mutex);
+ verifyNoMoreInteractions(mutex, antiServiceMonitor, criticalRegion);
// Now the non-probe suspension
when(context.isProbe()).thenReturn(false);
+ verify(antiServiceMonitor, times(1)).disallowDuperModelLockAcquisition(any());
try (ApplicationLock lock = zkStatusService.lockApplication(context, reference)) {
- // nothing
+ verify(antiServiceMonitor, times(2)).disallowDuperModelLockAcquisition(any());
+
+ verify(criticalRegion, times(1)).close();
}
+ verify(criticalRegion, times(2)).close();
verify(mutex, times(2)).acquire(anyLong(), any());
verify(mutex, times(2)).release();
ArgumentCaptor<Runnable> runnableCaptor = ArgumentCaptor.forClass(Runnable.class);
verify(context, times(2)).hasLock(any());
verify(context, times(2)).registerLockAcquisition(any(), any());
- verifyNoMoreInteractions(mutex);
+ verifyNoMoreInteractions(mutex, antiServiceMonitor, criticalRegion);
}
@Test
@@ -85,6 +99,7 @@ public class ZkStatusService2Test {
when(curator.createMutex(any())).thenReturn(mutex);
when(mutex.acquire(anyLong(), any())).thenReturn(true);
+ when(antiServiceMonitor.disallowDuperModelLockAcquisition(any())).thenReturn(criticalRegion);
when(context.getTimeLeft()).thenReturn(Duration.ofSeconds(12));
@@ -97,7 +112,9 @@ public class ZkStatusService2Test {
verify(mutex, times(0)).release();
verify(context, times(1)).hasLock(any());
verify(context, times(1)).registerLockAcquisition(any(), any());
- verifyNoMoreInteractions(mutex);
+ verify(antiServiceMonitor, times(1)).disallowDuperModelLockAcquisition(any());
+ verify(criticalRegion, times(0)).close();
+ verifyNoMoreInteractions(mutex, antiServiceMonitor, criticalRegion);
// Now the non-probe suspension
@@ -114,7 +131,9 @@ public class ZkStatusService2Test {
verify(mutex, times(0)).release();
verify(context, times(2)).hasLock(any());
verify(context, times(1)).registerLockAcquisition(any(), any());
- verifyNoMoreInteractions(mutex);
+ verify(antiServiceMonitor, times(1)).disallowDuperModelLockAcquisition(any());
+ verify(criticalRegion, times(0)).close();
+ verifyNoMoreInteractions(mutex, antiServiceMonitor, criticalRegion);
// Verify the context runnable releases the mutex
ArgumentCaptor<Runnable> runnableCaptor = ArgumentCaptor.forClass(Runnable.class);
@@ -123,5 +142,7 @@ public class ZkStatusService2Test {
runnableCaptor.getAllValues().forEach(Runnable::run);
verify(mutex, times(1)).acquire(anyLong(), any());
verify(mutex, times(1)).release();
+ verify(criticalRegion, times(1)).close();
+ verifyNoMoreInteractions(mutex, antiServiceMonitor, criticalRegion);
}
} \ No newline at end of file
diff --git a/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/status/ZkStatusServiceTest.java b/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/status/ZkStatusServiceTest.java
index 3c3c337f4e1..9fe910079cf 100644
--- a/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/status/ZkStatusServiceTest.java
+++ b/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/status/ZkStatusServiceTest.java
@@ -16,6 +16,9 @@ import com.yahoo.vespa.flags.InMemoryFlagSource;
import com.yahoo.vespa.orchestrator.OrchestratorContext;
import com.yahoo.vespa.orchestrator.OrchestratorUtil;
import com.yahoo.vespa.orchestrator.TestIds;
+import com.yahoo.vespa.service.monitor.AntiServiceMonitor;
+import com.yahoo.vespa.service.monitor.CriticalRegion;
+import com.yahoo.vespa.service.monitor.ServiceMonitor;
import com.yahoo.yolean.Exceptions;
import org.apache.curator.framework.CuratorFramework;
import org.apache.curator.test.KillSession;
@@ -57,6 +60,7 @@ import static org.junit.Assert.fail;
import static org.mockito.ArgumentMatchers.any;
import static org.mockito.ArgumentMatchers.eq;
import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.times;
import static org.mockito.Mockito.verify;
import static org.mockito.Mockito.when;
@@ -68,7 +72,10 @@ public class ZkStatusServiceTest {
private final Timer timer = mock(Timer.class);
private final Metric metric = mock(Metric.class);
private final OrchestratorContext context = mock(OrchestratorContext.class);
- private InMemoryFlagSource flagSource = new InMemoryFlagSource();
+ private final InMemoryFlagSource flagSource = new InMemoryFlagSource();
+ private final ServiceMonitor serviceMonitor = mock(ServiceMonitor.class);
+ private final CriticalRegion criticalRegion = mock(CriticalRegion.class);
+ private final AntiServiceMonitor antiServiceMonitor = mock(AntiServiceMonitor.class);
@Captor
private ArgumentCaptor<Map<String, String>> captor;
@@ -78,11 +85,12 @@ public class ZkStatusServiceTest {
testingServer = new TestingServer();
curator = createConnectedCurator(testingServer);
- statusService = new ZkStatusService(curator, metric, timer, flagSource);
+ statusService = new ZkStatusService(curator, metric, timer, flagSource, antiServiceMonitor);
when(context.getTimeLeft()).thenReturn(Duration.ofSeconds(10));
when(context.isProbe()).thenReturn(false);
when(timer.currentTime()).thenReturn(Instant.ofEpochMilli(1));
when(timer.toUtcClock()).thenReturn(new ManualClock(Instant.ofEpochMilli(1)));
+ when(antiServiceMonitor.disallowDuperModelLockAcquisition(any())).thenReturn(criticalRegion);
}
private static Curator createConnectedCurator(TestingServer server) throws InterruptedException {
@@ -151,7 +159,7 @@ public class ZkStatusServiceTest {
public void locks_are_exclusive() throws Exception {
setUp();
ZkStatusService zkStatusService2 =
- new ZkStatusService(curator, mock(Metric.class), new TestTimer(), flagSource);
+ new ZkStatusService(curator, mock(Metric.class), new TestTimer(), flagSource, antiServiceMonitor);
final CompletableFuture<Void> lockedSuccessfullyFuture;
try (ApplicationLock lock = statusService
@@ -178,7 +186,7 @@ public class ZkStatusServiceTest {
public void failing_to_get_lock_closes_SessionFailRetryLoop() throws Exception {
setUp();
ZkStatusService zkStatusService2 =
- new ZkStatusService(curator, mock(Metric.class), new TestTimer(), flagSource);
+ new ZkStatusService(curator, mock(Metric.class), new TestTimer(), flagSource, antiServiceMonitor);
try (ApplicationLock lock = statusService
.lockApplication(context, TestIds.APPLICATION_INSTANCE_REFERENCE)) {
@@ -309,16 +317,28 @@ public class ZkStatusServiceTest {
HostName strayHostname = new HostName("stray1.com");
+ verify(antiServiceMonitor, times(0)).disallowDuperModelLockAcquisition(any());
try (ApplicationLock lock = statusService.lockApplication(context, TestIds.APPLICATION_INSTANCE_REFERENCE)) {
+ verify(antiServiceMonitor, times(1)).disallowDuperModelLockAcquisition(any());
+
lock.setApplicationInstanceStatus(ApplicationInstanceStatus.ALLOWED_TO_BE_DOWN);
lock.setHostState(TestIds.HOST_NAME1, HostStatus.ALLOWED_TO_BE_DOWN);
lock.setHostState(strayHostname, HostStatus.PERMANENTLY_DOWN);
+
+ verify(criticalRegion, times(0)).close();
}
+ verify(criticalRegion, times(1)).close();
+ verify(antiServiceMonitor, times(1)).disallowDuperModelLockAcquisition(any());
try (ApplicationLock lock = statusService.lockApplication(context, TestIds.APPLICATION_INSTANCE_REFERENCE2)) {
+ verify(antiServiceMonitor, times(2)).disallowDuperModelLockAcquisition(any());
+
lock.setApplicationInstanceStatus(ApplicationInstanceStatus.ALLOWED_TO_BE_DOWN);
+
+ verify(criticalRegion, times(1)).close();
}
+ verify(criticalRegion, times(2)).close();
ApplicationId applicationId = OrchestratorUtil.toApplicationId(TestIds.APPLICATION_INSTANCE_REFERENCE);
assertEquals("test-tenant:test-application:test-environment:test-region:test-instance-key",
diff --git a/service-monitor/src/main/java/com/yahoo/vespa/service/duper/CriticalRegionChecker.java b/service-monitor/src/main/java/com/yahoo/vespa/service/duper/CriticalRegionChecker.java
new file mode 100644
index 00000000000..238b70a7310
--- /dev/null
+++ b/service-monitor/src/main/java/com/yahoo/vespa/service/duper/CriticalRegionChecker.java
@@ -0,0 +1,63 @@
+// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.vespa.service.duper;
+
+import com.yahoo.vespa.service.monitor.CriticalRegion;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * To detect and throw an {@link IllegalStateException} if the execution of the current thread has
+ * reached code point P, some time after the start of a critical region R and before the end of it:
+ *
+ * <ol>
+ * <li>Declare a static final instance of {@link CriticalRegionChecker}.</li>
+ * <li>Invoke {@link #startCriticalRegion(String)} when entering region R, and close
+ * the returned {@link CriticalRegion} when leaving it.</li>
+ * <li>Invoke {@link #assertOutsideCriticalRegions(String)} at code point P.</li>
+ * </ol>
+ *
+ * @author hakonhall
+ */
+public class CriticalRegionChecker {
+
+ private final ThreadLocalDescriptions threadLocalDescriptions = new ThreadLocalDescriptions();
+ private final String name;
+
+ public CriticalRegionChecker(String name) {
+ this.name = name;
+ }
+
+ /** Start of the critical region, within which {@link #assertOutsideCriticalRegions(String)} will throw. */
+ public CriticalRegion startCriticalRegion(String regionDescription) {
+ List<String> regionDescriptions = threadLocalDescriptions.get();
+ regionDescriptions.add(regionDescription);
+ Thread threadAtStart = Thread.currentThread();
+
+ return () -> {
+ regionDescriptions.remove(regionDescription);
+
+ Thread treadAtClose = Thread.currentThread();
+ if (threadAtStart != treadAtClose) {
+ throw new IllegalStateException(name + ": A critical region cannot cross threads: " +
+ regionDescription);
+ }
+ };
+ }
+
+ /** @throws IllegalStateException if within one or more critical regions. */
+ public void assertOutsideCriticalRegions(String codePointDescription) throws IllegalStateException {
+ List<String> regionDescriptions = threadLocalDescriptions.get();
+ if (regionDescriptions.size() > 0) {
+ throw new IllegalStateException(name + ": Code point " + codePointDescription +
+ " is within these critical regions: " + regionDescriptions);
+ }
+ }
+
+ private static class ThreadLocalDescriptions extends ThreadLocal<List<String>> {
+ @Override
+ protected List<String> initialValue() {
+ return new ArrayList<>();
+ }
+ }
+}
diff --git a/service-monitor/src/main/java/com/yahoo/vespa/service/duper/DuperModelManager.java b/service-monitor/src/main/java/com/yahoo/vespa/service/duper/DuperModelManager.java
index baada59dc65..cb1c6d2d95f 100644
--- a/service-monitor/src/main/java/com/yahoo/vespa/service/duper/DuperModelManager.java
+++ b/service-monitor/src/main/java/com/yahoo/vespa/service/duper/DuperModelManager.java
@@ -12,6 +12,7 @@ import com.yahoo.config.provision.HostName;
import com.yahoo.config.provision.SystemName;
import com.yahoo.log.LogLevel;
import com.yahoo.vespa.flags.FlagSource;
+import com.yahoo.vespa.service.monitor.CriticalRegion;
import com.yahoo.vespa.service.monitor.DuperModelInfraApi;
import com.yahoo.vespa.service.monitor.DuperModelListener;
import com.yahoo.vespa.service.monitor.DuperModelProvider;
@@ -23,8 +24,10 @@ import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
+import java.util.concurrent.locks.ReentrantLock;
import java.util.function.Function;
import java.util.logging.Logger;
+import java.util.function.Supplier;
import java.util.stream.Collectors;
import java.util.stream.Stream;
@@ -46,7 +49,10 @@ public class DuperModelManager implements DuperModelProvider, DuperModelInfraApi
private final Map<ApplicationId, InfraApplication> supportedInfraApplications;
- private final Object monitor = new Object();
+ private static CriticalRegionChecker disallowedDuperModeLockAcquisitionRegions =
+ new CriticalRegionChecker("duper model deadlock detection");
+
+ private final ReentrantLock lock = new ReentrantLock(true);
private final DuperModel duperModel;
// The set of active infrastructure ApplicationInfo. Not all are necessarily in the DuperModel for historical reasons.
private final Set<ApplicationId> activeInfraInfos = new HashSet<>(10);
@@ -83,27 +89,23 @@ public class DuperModelManager implements DuperModelProvider, DuperModelInfraApi
superModelProvider.registerListener(new SuperModelListener() {
@Override
public void applicationActivated(SuperModel superModel, ApplicationInfo application) {
- synchronized (monitor) {
- duperModel.add(application);
- }
+ lockedRunnable(() -> duperModel.add(application));
}
@Override
public void applicationRemoved(SuperModel superModel, ApplicationId applicationId) {
- synchronized (monitor) {
- duperModel.remove(applicationId);
- }
+ lockedRunnable(() -> duperModel.remove(applicationId));
}
@Override
public void notifyOfCompleteness(SuperModel superModel) {
- synchronized (monitor) {
+ lockedRunnable(() -> {
if (!superModelIsComplete) {
superModelIsComplete = true;
logger.log(LogLevel.INFO, "All bootstrap tenant applications have been activated");
maybeSetDuperModelAsComplete();
}
- }
+ });
}
});
}
@@ -114,9 +116,7 @@ public class DuperModelManager implements DuperModelProvider, DuperModelInfraApi
*/
@Override
public void registerListener(DuperModelListener listener) {
- synchronized (monitor) {
- duperModel.registerListener(listener);
- }
+ lockedRunnable(() -> duperModel.registerListener(listener));
}
@Override
@@ -138,9 +138,7 @@ public class DuperModelManager implements DuperModelProvider, DuperModelInfraApi
@Override
public boolean infraApplicationIsActive(ApplicationId applicationId) {
- synchronized (monitor) {
- return activeInfraInfos.contains(applicationId);
- }
+ return lockedSupplier(() -> activeInfraInfos.contains(applicationId));
}
@Override
@@ -150,10 +148,10 @@ public class DuperModelManager implements DuperModelProvider, DuperModelInfraApi
throw new IllegalArgumentException("There is no infrastructure application with ID '" + applicationId + "'");
}
- synchronized (monitor) {
+ lockedRunnable(() -> {
activeInfraInfos.add(applicationId);
duperModel.add(application.makeApplicationInfo(hostnames));
- }
+ });
}
@Override
@@ -162,39 +160,41 @@ public class DuperModelManager implements DuperModelProvider, DuperModelInfraApi
throw new IllegalArgumentException("There is no infrastructure application with ID '" + applicationId + "'");
}
- synchronized (monitor) {
+ lockedRunnable(() -> {
activeInfraInfos.remove(applicationId);
duperModel.remove(applicationId);
- }
+ });
}
@Override
public void infraApplicationsIsNowComplete() {
- synchronized (monitor) {
+ lockedRunnable(() -> {
if (!infraApplicationsIsComplete) {
infraApplicationsIsComplete = true;
logger.log(LogLevel.INFO, "All infrastructure applications have been activated");
maybeSetDuperModelAsComplete();
}
- }
+ });
}
public Optional<ApplicationInfo> getApplicationInfo(ApplicationId applicationId) {
- synchronized (monitor) {
- return duperModel.getApplicationInfo(applicationId);
- }
+ return lockedSupplier(() -> duperModel.getApplicationInfo(applicationId));
}
public Optional<ApplicationInfo> getApplicationInfo(HostName hostname) {
- synchronized (monitor) {
- return duperModel.getApplicationInfo(hostname);
- }
+ return lockedSupplier(() -> duperModel.getApplicationInfo(hostname));
}
public List<ApplicationInfo> getApplicationInfos() {
- synchronized (monitor) {
- return duperModel.getApplicationInfos();
- }
+ return lockedSupplier(() -> duperModel.getApplicationInfos());
+ }
+
+ /**
+ * Within the region, trying to accesss the duper model (without already having the lock)
+ * will cause an IllegalStateException to be thrown.
+ */
+ public CriticalRegion disallowDuperModelLockAcquisition(String regionDescription) {
+ return disallowedDuperModeLockAcquisitionRegions.startCriticalRegion(regionDescription);
}
private void maybeSetDuperModelAsComplete() {
@@ -202,4 +202,36 @@ public class DuperModelManager implements DuperModelProvider, DuperModelInfraApi
duperModel.setComplete();
}
}
+
+ private <T> T lockedSupplier(Supplier<T> supplier) {
+ if (lock.isHeldByCurrentThread()) {
+ return supplier.get();
+ }
+
+ disallowedDuperModeLockAcquisitionRegions
+ .assertOutsideCriticalRegions("acquiring duper model lock");
+
+ lock.lock();
+ try {
+ return supplier.get();
+ } finally {
+ lock.unlock();
+ }
+ }
+
+ private void lockedRunnable(Runnable runnable) {
+ if (lock.isHeldByCurrentThread()) {
+ runnable.run();
+ }
+
+ disallowedDuperModeLockAcquisitionRegions
+ .assertOutsideCriticalRegions("acquiring duper model lock");
+
+ lock.lock();
+ try {
+ runnable.run();
+ } finally {
+ lock.unlock();
+ }
+ }
}
diff --git a/service-monitor/src/main/java/com/yahoo/vespa/service/model/ServiceMonitorImpl.java b/service-monitor/src/main/java/com/yahoo/vespa/service/model/ServiceMonitorImpl.java
index 7c0f291e121..678b0d3d5ba 100644
--- a/service-monitor/src/main/java/com/yahoo/vespa/service/model/ServiceMonitorImpl.java
+++ b/service-monitor/src/main/java/com/yahoo/vespa/service/model/ServiceMonitorImpl.java
@@ -14,6 +14,8 @@ import com.yahoo.vespa.applicationmodel.ServiceInstance;
import com.yahoo.vespa.service.duper.DuperModelManager;
import com.yahoo.vespa.service.manager.MonitorManager;
import com.yahoo.vespa.service.manager.UnionMonitorManager;
+import com.yahoo.vespa.service.monitor.AntiServiceMonitor;
+import com.yahoo.vespa.service.monitor.CriticalRegion;
import com.yahoo.vespa.service.monitor.ServiceHostListener;
import com.yahoo.vespa.service.monitor.ServiceModel;
import com.yahoo.vespa.service.monitor.ServiceMonitor;
@@ -24,7 +26,7 @@ import java.util.Map;
import java.util.Optional;
import java.util.Set;
-public class ServiceMonitorImpl implements ServiceMonitor {
+public class ServiceMonitorImpl implements ServiceMonitor, AntiServiceMonitor {
private final ServiceMonitorMetrics metrics;
private final DuperModelManager duperModelManager;
@@ -105,6 +107,11 @@ public class ServiceMonitorImpl implements ServiceMonitor {
duperModelManager.registerListener(duperModelListener);
}
+ @Override
+ public CriticalRegion disallowDuperModelLockAcquisition(String regionDescription) {
+ return duperModelManager.disallowDuperModelLockAcquisition(regionDescription);
+ }
+
private Optional<ApplicationInfo> getApplicationInfo(ApplicationInstanceReference reference) {
ApplicationId applicationId = ApplicationInstanceGenerator.toApplicationId(reference);
return duperModelManager.getApplicationInfo(applicationId);
diff --git a/service-monitor/src/main/java/com/yahoo/vespa/service/monitor/AntiServiceMonitor.java b/service-monitor/src/main/java/com/yahoo/vespa/service/monitor/AntiServiceMonitor.java
new file mode 100644
index 00000000000..1c0460c3797
--- /dev/null
+++ b/service-monitor/src/main/java/com/yahoo/vespa/service/monitor/AntiServiceMonitor.java
@@ -0,0 +1,25 @@
+// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.vespa.service.monitor;
+
+import com.yahoo.vespa.service.duper.DuperModelManager;
+
+/**
+ * Interface that allows a thread to declare it must NOT access certain parts of the ServiceMonitor.
+ *
+ * @author hakonhall
+ */
+public interface AntiServiceMonitor {
+ /**
+ * Disallow the current thread to acquire the "duper model lock" (see {@link DuperModelManager}),
+ * necessarily acquired by most of the {@link ServiceMonitor} methods, starting from now and
+ * up until the returned region is closed.
+ *
+ * <p>For instance, if an application is activated the duper model is notified: The duper model
+ * will acquire a lock to update the model atomically, and while having that lock notify
+ * the status service. The status service will typically acquire an application lock and prune
+ * hosts no longer part of the application. If a thread were to try to acquire these locks
+ * in the reverse order, it might become deadlocked. This method allows one to detect this
+ * and throw an exception, causing it to be caught earlier or at least easier to debug.</p>
+ */
+ CriticalRegion disallowDuperModelLockAcquisition(String regionDescription);
+}
diff --git a/service-monitor/src/main/java/com/yahoo/vespa/service/monitor/CriticalRegion.java b/service-monitor/src/main/java/com/yahoo/vespa/service/monitor/CriticalRegion.java
new file mode 100644
index 00000000000..ec674f133a1
--- /dev/null
+++ b/service-monitor/src/main/java/com/yahoo/vespa/service/monitor/CriticalRegion.java
@@ -0,0 +1,18 @@
+// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.vespa.service.monitor;
+
+/**
+ * Represents a region of execution, e.g. the block of a try-with-resources.
+ *
+ * @author hakonhall
+ */
+@FunctionalInterface
+public interface CriticalRegion extends AutoCloseable {
+ /**
+ * Ends the critical region.
+ *
+ * @throws IllegalStateException if the current thread is different from the one that started
+ * the critical region.
+ */
+ @Override void close() throws IllegalStateException;
+}