diff options
author | HÃ¥kon Hallingstad <hakon@verizonmedia.com> | 2020-03-07 17:00:19 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2020-03-07 17:00:19 +0100 |
commit | 3eb6aa9161ca016d8db3914970054d5519052c45 (patch) | |
tree | 49ffc47b8092dc81508da8b40ec7ff88ac64b275 | |
parent | 71cf26a0abe0b62b9d3038c86351e69a3dee640e (diff) | |
parent | 92a2c10c684ee128b25e2128433451db438fc14a (diff) |
Merge pull request #12496 from vespa-engine/hakonhall/throw-if-accessing-duper-model-while-holding-status-service-application-lock
Throw if accessing duper model while holding status service application lock
13 files changed, 289 insertions, 52 deletions
diff --git a/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/status/ZkStatusService.java b/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/status/ZkStatusService.java index 9874903e976..3bf0d886a41 100644 --- a/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/status/ZkStatusService.java +++ b/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/status/ZkStatusService.java @@ -16,6 +16,8 @@ import com.yahoo.vespa.flags.FlagSource; import com.yahoo.vespa.flags.Flags; import com.yahoo.vespa.orchestrator.OrchestratorContext; import com.yahoo.vespa.orchestrator.OrchestratorUtil; +import com.yahoo.vespa.service.monitor.AntiServiceMonitor; +import com.yahoo.vespa.service.monitor.CriticalRegion; import org.apache.zookeeper.data.Stat; import javax.inject.Inject; @@ -46,6 +48,7 @@ public class ZkStatusService implements StatusService { private final Metric metric; private final Timer timer; private final boolean doCleanup; + private final AntiServiceMonitor antiServiceMonitor; /** * A cache of metric contexts for each possible dimension map. In practice, there is one dimension map @@ -58,21 +61,25 @@ public class ZkStatusService implements StatusService { @Component Curator curator, @Component Metric metric, @Component Timer timer, - @Component FlagSource flagSource) { + @Component FlagSource flagSource, + @Component AntiServiceMonitor antiServiceMonitor) { this(curator, metric, timer, new HostInfosCache(curator, new HostInfosServiceImpl(curator, timer)), - Flags.CLEANUP_STATUS_SERVICE.bindTo(flagSource).value()); + Flags.CLEANUP_STATUS_SERVICE.bindTo(flagSource).value(), + antiServiceMonitor); } /** Non-private for testing only. */ - ZkStatusService(Curator curator, Metric metric, Timer timer, HostInfosCache hostInfosCache, boolean doCleanup) { + ZkStatusService(Curator curator, Metric metric, Timer timer, HostInfosCache hostInfosCache, + boolean doCleanup, AntiServiceMonitor antiServiceMonitor) { this.curator = curator; this.metric = metric; this.timer = timer; this.hostInfosCache = hostInfosCache; this.doCleanup = doCleanup; + this.antiServiceMonitor = antiServiceMonitor; } @Override @@ -186,6 +193,9 @@ public class ZkStatusService implements StatusService { metric.add(acquireResultMetricName, 1, metricContext); } + CriticalRegion inaccessibleDuperModelRegion = antiServiceMonitor + .disallowDuperModelLockAcquisition(ZkStatusService.class.getSimpleName() + " application lock"); + return () -> { try { lock.close(); @@ -197,6 +207,8 @@ public class ZkStatusService implements StatusService { e); } + inaccessibleDuperModelRegion.close(); + Instant lockReleasedTime = timer.currentTime(); double seconds = durationInSeconds(acquireEndTime, lockReleasedTime); metric.set("orchestrator.lock.hold-latency", seconds, metricContext); diff --git a/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/DummyAntiServiceMonitor.java b/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/DummyAntiServiceMonitor.java new file mode 100644 index 00000000000..d22d99b5c9c --- /dev/null +++ b/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/DummyAntiServiceMonitor.java @@ -0,0 +1,15 @@ +// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.vespa.orchestrator; + +import com.yahoo.vespa.service.monitor.AntiServiceMonitor; +import com.yahoo.vespa.service.monitor.CriticalRegion; + +/** + * @author hakonhall + */ +public class DummyAntiServiceMonitor implements AntiServiceMonitor { + @Override + public CriticalRegion disallowDuperModelLockAcquisition(String regionDescription) { + return () -> {}; + } +} diff --git a/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/DummyServiceMonitor.java b/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/DummyServiceMonitor.java index 0da4c48d408..a784187fb62 100644 --- a/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/DummyServiceMonitor.java +++ b/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/DummyServiceMonitor.java @@ -15,6 +15,8 @@ import com.yahoo.vespa.applicationmodel.ServiceType; import com.yahoo.vespa.applicationmodel.TenantId; import com.yahoo.vespa.orchestrator.model.NodeGroup; import com.yahoo.vespa.orchestrator.model.VespaModelUtil; +import com.yahoo.vespa.service.monitor.AntiServiceMonitor; +import com.yahoo.vespa.service.monitor.CriticalRegion; import com.yahoo.vespa.service.monitor.ServiceModel; import com.yahoo.vespa.service.monitor.ServiceMonitor; @@ -31,7 +33,7 @@ import java.util.stream.Collectors; * @author oyving * @author smorgrav */ -public class DummyServiceMonitor implements ServiceMonitor { +public class DummyServiceMonitor implements ServiceMonitor, AntiServiceMonitor { public static final HostName TEST1_HOST_NAME = new HostName("test1.hostname.tld"); public static final HostName TEST3_HOST_NAME = new HostName("test3.hostname.tld"); @@ -165,6 +167,11 @@ public class DummyServiceMonitor implements ServiceMonitor { throw new UnsupportedOperationException(); } + @Override + public CriticalRegion disallowDuperModelLockAcquisition(String regionDescription) { + return () -> {}; + } + public static Set<HostName> getContentHosts(ApplicationInstanceReference appRef) { Set<HostName> hosts = apps.stream() .filter(application -> application.reference().equals(appRef)) diff --git a/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/OrchestratorImplTest.java b/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/OrchestratorImplTest.java index 44e11fb794b..f49dc490fe6 100644 --- a/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/OrchestratorImplTest.java +++ b/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/OrchestratorImplTest.java @@ -76,7 +76,12 @@ public class OrchestratorImplTest { private final ApplicationApiFactory applicationApiFactory = new ApplicationApiFactory(3); private final InMemoryFlagSource flagSource = new InMemoryFlagSource(); private final MockCurator curator = new MockCurator(); - private ZkStatusService statusService = new ZkStatusService(curator, mock(Metric.class), new TestTimer(), flagSource); + private ZkStatusService statusService = new ZkStatusService( + curator, + mock(Metric.class), + new TestTimer(), + flagSource, + new DummyAntiServiceMonitor()); private ApplicationId app1; private ApplicationId app2; @@ -394,7 +399,12 @@ public class OrchestratorImplTest { @Test public void testGetHost() throws Exception { ClusterControllerClientFactory clusterControllerClientFactory = new ClusterControllerClientFactoryMock(); - StatusService statusService = new ZkStatusService(new MockCurator(), mock(Metric.class), new TestTimer(), flagSource); + StatusService statusService = new ZkStatusService( + new MockCurator(), + mock(Metric.class), + new TestTimer(), + flagSource, + new DummyAntiServiceMonitor()); HostName hostName = new HostName("host.yahoo.com"); TenantId tenantId = new TenantId("tenant"); diff --git a/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/model/ModelTestUtils.java b/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/model/ModelTestUtils.java index e6a14f75b24..88e9d1c90fe 100644 --- a/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/model/ModelTestUtils.java +++ b/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/model/ModelTestUtils.java @@ -17,6 +17,7 @@ import com.yahoo.vespa.applicationmodel.ServiceType; import com.yahoo.vespa.applicationmodel.TenantId; import com.yahoo.vespa.curator.mock.MockCurator; import com.yahoo.vespa.flags.InMemoryFlagSource; +import com.yahoo.vespa.orchestrator.DummyAntiServiceMonitor; import com.yahoo.vespa.orchestrator.OrchestrationException; import com.yahoo.vespa.orchestrator.Orchestrator; import com.yahoo.vespa.orchestrator.OrchestratorContext; @@ -57,9 +58,13 @@ class ModelTestUtils { private final Map<ApplicationInstanceReference, ApplicationInstance> applications = new HashMap<>(); private final ClusterControllerClientFactory clusterControllerClientFactory = new ClusterControllerClientFactoryMock(); private final Map<HostName, HostStatus> hostStatusMap = new HashMap<>(); - private final StatusService statusService = new ZkStatusService(new MockCurator(), mock(Metric.class), new TestTimer(), flagSource); - private final TestTimer timer = new TestTimer(); private final ServiceMonitor serviceMonitor = () -> new ServiceModel(applications); + private final StatusService statusService = new ZkStatusService( + new MockCurator(), + mock(Metric.class), + new TestTimer(), + flagSource, + new DummyAntiServiceMonitor()); private final Orchestrator orchestrator = new OrchestratorImpl(new HostedVespaPolicy(new HostedVespaClusterPolicy(), clusterControllerClientFactory, applicationApiFactory()), clusterControllerClientFactory, statusService, diff --git a/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/resources/HostResourceTest.java b/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/resources/HostResourceTest.java index 1bf3b267a05..ffb79b4edf6 100644 --- a/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/resources/HostResourceTest.java +++ b/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/resources/HostResourceTest.java @@ -19,6 +19,7 @@ import com.yahoo.vespa.curator.mock.MockCurator; import com.yahoo.vespa.flags.InMemoryFlagSource; import com.yahoo.vespa.orchestrator.BatchHostNameNotFoundException; import com.yahoo.vespa.orchestrator.BatchInternalErrorException; +import com.yahoo.vespa.orchestrator.DummyAntiServiceMonitor; import com.yahoo.vespa.orchestrator.Host; import com.yahoo.vespa.orchestrator.HostNameNotFoundException; import com.yahoo.vespa.orchestrator.OrchestrationException; @@ -79,11 +80,12 @@ public class HostResourceTest { private static final int SERVICE_MONITOR_CONVERGENCE_LATENCY_SECONDS = 0; private static final TenantId TENANT_ID = new TenantId("tenantId"); private static final ApplicationInstanceId APPLICATION_INSTANCE_ID = new ApplicationInstanceId("applicationId"); + private static final ServiceMonitor serviceMonitor = mock(ServiceMonitor.class); private static final StatusService EVERY_HOST_IS_UP_HOST_STATUS_SERVICE = new ZkStatusService( - new MockCurator(), mock(Metric.class), new TestTimer(), new InMemoryFlagSource()); + new MockCurator(), mock(Metric.class), new TestTimer(), new InMemoryFlagSource(), + new DummyAntiServiceMonitor()); private static final ApplicationApiFactory applicationApiFactory = new ApplicationApiFactory(3); - private static final ServiceMonitor serviceMonitor = mock(ServiceMonitor.class); static { when(serviceMonitor.getApplication(any(HostName.class))) .thenReturn(Optional.of( diff --git a/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/status/ZkStatusService2Test.java b/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/status/ZkStatusService2Test.java index 4c01572f787..e8cfe15611e 100644 --- a/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/status/ZkStatusService2Test.java +++ b/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/status/ZkStatusService2Test.java @@ -9,6 +9,8 @@ import com.yahoo.vespa.applicationmodel.ApplicationInstanceReference; import com.yahoo.vespa.applicationmodel.TenantId; import com.yahoo.vespa.curator.Curator; import com.yahoo.vespa.orchestrator.OrchestratorContext; +import com.yahoo.vespa.service.monitor.AntiServiceMonitor; +import com.yahoo.vespa.service.monitor.CriticalRegion; import org.apache.curator.framework.recipes.locks.InterProcessMutex; import org.junit.Test; import org.mockito.ArgumentCaptor; @@ -32,7 +34,10 @@ public class ZkStatusService2Test { private final Timer timer = new TestTimer(); private final Metric metric = mock(Metric.class); private final HostInfosCache cache = mock(HostInfosCache.class); - private final ZkStatusService zkStatusService = new ZkStatusService(curator, metric, timer, cache, false); + private final CriticalRegion criticalRegion = mock(CriticalRegion.class); + private final AntiServiceMonitor antiServiceMonitor = mock(AntiServiceMonitor.class); + private final ZkStatusService zkStatusService = + new ZkStatusService(curator, metric, timer, cache, false, antiServiceMonitor); private final OrchestratorContext context = mock(OrchestratorContext.class); private final InterProcessMutex mutex = mock(InterProcessMutex.class); @@ -47,34 +52,43 @@ public class ZkStatusService2Test { when(curator.createMutex(any())).thenReturn(mutex); when(mutex.acquire(anyLong(), any())).thenReturn(true); + when(antiServiceMonitor.disallowDuperModelLockAcquisition(any())).thenReturn(criticalRegion); when(context.getTimeLeft()).thenReturn(Duration.ofSeconds(12)); + verify(antiServiceMonitor, times(0)).disallowDuperModelLockAcquisition(any()); try (ApplicationLock lock = zkStatusService.lockApplication(context, reference)) { - // nothing + verify(antiServiceMonitor, times(1)).disallowDuperModelLockAcquisition(any()); + + verify(criticalRegion, times(0)).close(); } + verify(criticalRegion, times(1)).close(); verify(curator, times(1)).createMutex(any()); verify(mutex, times(1)).acquire(anyLong(), any()); verify(mutex, times(1)).release(); verify(context, times(1)).hasLock(any()); verify(context, times(1)).registerLockAcquisition(any(), any()); - verifyNoMoreInteractions(mutex); + verifyNoMoreInteractions(mutex, antiServiceMonitor, criticalRegion); // Now the non-probe suspension when(context.isProbe()).thenReturn(false); + verify(antiServiceMonitor, times(1)).disallowDuperModelLockAcquisition(any()); try (ApplicationLock lock = zkStatusService.lockApplication(context, reference)) { - // nothing + verify(antiServiceMonitor, times(2)).disallowDuperModelLockAcquisition(any()); + + verify(criticalRegion, times(1)).close(); } + verify(criticalRegion, times(2)).close(); verify(mutex, times(2)).acquire(anyLong(), any()); verify(mutex, times(2)).release(); ArgumentCaptor<Runnable> runnableCaptor = ArgumentCaptor.forClass(Runnable.class); verify(context, times(2)).hasLock(any()); verify(context, times(2)).registerLockAcquisition(any(), any()); - verifyNoMoreInteractions(mutex); + verifyNoMoreInteractions(mutex, antiServiceMonitor, criticalRegion); } @Test @@ -85,6 +99,7 @@ public class ZkStatusService2Test { when(curator.createMutex(any())).thenReturn(mutex); when(mutex.acquire(anyLong(), any())).thenReturn(true); + when(antiServiceMonitor.disallowDuperModelLockAcquisition(any())).thenReturn(criticalRegion); when(context.getTimeLeft()).thenReturn(Duration.ofSeconds(12)); @@ -97,7 +112,9 @@ public class ZkStatusService2Test { verify(mutex, times(0)).release(); verify(context, times(1)).hasLock(any()); verify(context, times(1)).registerLockAcquisition(any(), any()); - verifyNoMoreInteractions(mutex); + verify(antiServiceMonitor, times(1)).disallowDuperModelLockAcquisition(any()); + verify(criticalRegion, times(0)).close(); + verifyNoMoreInteractions(mutex, antiServiceMonitor, criticalRegion); // Now the non-probe suspension @@ -114,7 +131,9 @@ public class ZkStatusService2Test { verify(mutex, times(0)).release(); verify(context, times(2)).hasLock(any()); verify(context, times(1)).registerLockAcquisition(any(), any()); - verifyNoMoreInteractions(mutex); + verify(antiServiceMonitor, times(1)).disallowDuperModelLockAcquisition(any()); + verify(criticalRegion, times(0)).close(); + verifyNoMoreInteractions(mutex, antiServiceMonitor, criticalRegion); // Verify the context runnable releases the mutex ArgumentCaptor<Runnable> runnableCaptor = ArgumentCaptor.forClass(Runnable.class); @@ -123,5 +142,7 @@ public class ZkStatusService2Test { runnableCaptor.getAllValues().forEach(Runnable::run); verify(mutex, times(1)).acquire(anyLong(), any()); verify(mutex, times(1)).release(); + verify(criticalRegion, times(1)).close(); + verifyNoMoreInteractions(mutex, antiServiceMonitor, criticalRegion); } }
\ No newline at end of file diff --git a/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/status/ZkStatusServiceTest.java b/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/status/ZkStatusServiceTest.java index 3c3c337f4e1..9fe910079cf 100644 --- a/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/status/ZkStatusServiceTest.java +++ b/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/status/ZkStatusServiceTest.java @@ -16,6 +16,9 @@ import com.yahoo.vespa.flags.InMemoryFlagSource; import com.yahoo.vespa.orchestrator.OrchestratorContext; import com.yahoo.vespa.orchestrator.OrchestratorUtil; import com.yahoo.vespa.orchestrator.TestIds; +import com.yahoo.vespa.service.monitor.AntiServiceMonitor; +import com.yahoo.vespa.service.monitor.CriticalRegion; +import com.yahoo.vespa.service.monitor.ServiceMonitor; import com.yahoo.yolean.Exceptions; import org.apache.curator.framework.CuratorFramework; import org.apache.curator.test.KillSession; @@ -57,6 +60,7 @@ import static org.junit.Assert.fail; import static org.mockito.ArgumentMatchers.any; import static org.mockito.ArgumentMatchers.eq; import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.times; import static org.mockito.Mockito.verify; import static org.mockito.Mockito.when; @@ -68,7 +72,10 @@ public class ZkStatusServiceTest { private final Timer timer = mock(Timer.class); private final Metric metric = mock(Metric.class); private final OrchestratorContext context = mock(OrchestratorContext.class); - private InMemoryFlagSource flagSource = new InMemoryFlagSource(); + private final InMemoryFlagSource flagSource = new InMemoryFlagSource(); + private final ServiceMonitor serviceMonitor = mock(ServiceMonitor.class); + private final CriticalRegion criticalRegion = mock(CriticalRegion.class); + private final AntiServiceMonitor antiServiceMonitor = mock(AntiServiceMonitor.class); @Captor private ArgumentCaptor<Map<String, String>> captor; @@ -78,11 +85,12 @@ public class ZkStatusServiceTest { testingServer = new TestingServer(); curator = createConnectedCurator(testingServer); - statusService = new ZkStatusService(curator, metric, timer, flagSource); + statusService = new ZkStatusService(curator, metric, timer, flagSource, antiServiceMonitor); when(context.getTimeLeft()).thenReturn(Duration.ofSeconds(10)); when(context.isProbe()).thenReturn(false); when(timer.currentTime()).thenReturn(Instant.ofEpochMilli(1)); when(timer.toUtcClock()).thenReturn(new ManualClock(Instant.ofEpochMilli(1))); + when(antiServiceMonitor.disallowDuperModelLockAcquisition(any())).thenReturn(criticalRegion); } private static Curator createConnectedCurator(TestingServer server) throws InterruptedException { @@ -151,7 +159,7 @@ public class ZkStatusServiceTest { public void locks_are_exclusive() throws Exception { setUp(); ZkStatusService zkStatusService2 = - new ZkStatusService(curator, mock(Metric.class), new TestTimer(), flagSource); + new ZkStatusService(curator, mock(Metric.class), new TestTimer(), flagSource, antiServiceMonitor); final CompletableFuture<Void> lockedSuccessfullyFuture; try (ApplicationLock lock = statusService @@ -178,7 +186,7 @@ public class ZkStatusServiceTest { public void failing_to_get_lock_closes_SessionFailRetryLoop() throws Exception { setUp(); ZkStatusService zkStatusService2 = - new ZkStatusService(curator, mock(Metric.class), new TestTimer(), flagSource); + new ZkStatusService(curator, mock(Metric.class), new TestTimer(), flagSource, antiServiceMonitor); try (ApplicationLock lock = statusService .lockApplication(context, TestIds.APPLICATION_INSTANCE_REFERENCE)) { @@ -309,16 +317,28 @@ public class ZkStatusServiceTest { HostName strayHostname = new HostName("stray1.com"); + verify(antiServiceMonitor, times(0)).disallowDuperModelLockAcquisition(any()); try (ApplicationLock lock = statusService.lockApplication(context, TestIds.APPLICATION_INSTANCE_REFERENCE)) { + verify(antiServiceMonitor, times(1)).disallowDuperModelLockAcquisition(any()); + lock.setApplicationInstanceStatus(ApplicationInstanceStatus.ALLOWED_TO_BE_DOWN); lock.setHostState(TestIds.HOST_NAME1, HostStatus.ALLOWED_TO_BE_DOWN); lock.setHostState(strayHostname, HostStatus.PERMANENTLY_DOWN); + + verify(criticalRegion, times(0)).close(); } + verify(criticalRegion, times(1)).close(); + verify(antiServiceMonitor, times(1)).disallowDuperModelLockAcquisition(any()); try (ApplicationLock lock = statusService.lockApplication(context, TestIds.APPLICATION_INSTANCE_REFERENCE2)) { + verify(antiServiceMonitor, times(2)).disallowDuperModelLockAcquisition(any()); + lock.setApplicationInstanceStatus(ApplicationInstanceStatus.ALLOWED_TO_BE_DOWN); + + verify(criticalRegion, times(1)).close(); } + verify(criticalRegion, times(2)).close(); ApplicationId applicationId = OrchestratorUtil.toApplicationId(TestIds.APPLICATION_INSTANCE_REFERENCE); assertEquals("test-tenant:test-application:test-environment:test-region:test-instance-key", diff --git a/service-monitor/src/main/java/com/yahoo/vespa/service/duper/CriticalRegionChecker.java b/service-monitor/src/main/java/com/yahoo/vespa/service/duper/CriticalRegionChecker.java new file mode 100644 index 00000000000..238b70a7310 --- /dev/null +++ b/service-monitor/src/main/java/com/yahoo/vespa/service/duper/CriticalRegionChecker.java @@ -0,0 +1,63 @@ +// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.vespa.service.duper; + +import com.yahoo.vespa.service.monitor.CriticalRegion; + +import java.util.ArrayList; +import java.util.List; + +/** + * To detect and throw an {@link IllegalStateException} if the execution of the current thread has + * reached code point P, some time after the start of a critical region R and before the end of it: + * + * <ol> + * <li>Declare a static final instance of {@link CriticalRegionChecker}.</li> + * <li>Invoke {@link #startCriticalRegion(String)} when entering region R, and close + * the returned {@link CriticalRegion} when leaving it.</li> + * <li>Invoke {@link #assertOutsideCriticalRegions(String)} at code point P.</li> + * </ol> + * + * @author hakonhall + */ +public class CriticalRegionChecker { + + private final ThreadLocalDescriptions threadLocalDescriptions = new ThreadLocalDescriptions(); + private final String name; + + public CriticalRegionChecker(String name) { + this.name = name; + } + + /** Start of the critical region, within which {@link #assertOutsideCriticalRegions(String)} will throw. */ + public CriticalRegion startCriticalRegion(String regionDescription) { + List<String> regionDescriptions = threadLocalDescriptions.get(); + regionDescriptions.add(regionDescription); + Thread threadAtStart = Thread.currentThread(); + + return () -> { + regionDescriptions.remove(regionDescription); + + Thread treadAtClose = Thread.currentThread(); + if (threadAtStart != treadAtClose) { + throw new IllegalStateException(name + ": A critical region cannot cross threads: " + + regionDescription); + } + }; + } + + /** @throws IllegalStateException if within one or more critical regions. */ + public void assertOutsideCriticalRegions(String codePointDescription) throws IllegalStateException { + List<String> regionDescriptions = threadLocalDescriptions.get(); + if (regionDescriptions.size() > 0) { + throw new IllegalStateException(name + ": Code point " + codePointDescription + + " is within these critical regions: " + regionDescriptions); + } + } + + private static class ThreadLocalDescriptions extends ThreadLocal<List<String>> { + @Override + protected List<String> initialValue() { + return new ArrayList<>(); + } + } +} diff --git a/service-monitor/src/main/java/com/yahoo/vespa/service/duper/DuperModelManager.java b/service-monitor/src/main/java/com/yahoo/vespa/service/duper/DuperModelManager.java index baada59dc65..cb1c6d2d95f 100644 --- a/service-monitor/src/main/java/com/yahoo/vespa/service/duper/DuperModelManager.java +++ b/service-monitor/src/main/java/com/yahoo/vespa/service/duper/DuperModelManager.java @@ -12,6 +12,7 @@ import com.yahoo.config.provision.HostName; import com.yahoo.config.provision.SystemName; import com.yahoo.log.LogLevel; import com.yahoo.vespa.flags.FlagSource; +import com.yahoo.vespa.service.monitor.CriticalRegion; import com.yahoo.vespa.service.monitor.DuperModelInfraApi; import com.yahoo.vespa.service.monitor.DuperModelListener; import com.yahoo.vespa.service.monitor.DuperModelProvider; @@ -23,8 +24,10 @@ import java.util.List; import java.util.Map; import java.util.Optional; import java.util.Set; +import java.util.concurrent.locks.ReentrantLock; import java.util.function.Function; import java.util.logging.Logger; +import java.util.function.Supplier; import java.util.stream.Collectors; import java.util.stream.Stream; @@ -46,7 +49,10 @@ public class DuperModelManager implements DuperModelProvider, DuperModelInfraApi private final Map<ApplicationId, InfraApplication> supportedInfraApplications; - private final Object monitor = new Object(); + private static CriticalRegionChecker disallowedDuperModeLockAcquisitionRegions = + new CriticalRegionChecker("duper model deadlock detection"); + + private final ReentrantLock lock = new ReentrantLock(true); private final DuperModel duperModel; // The set of active infrastructure ApplicationInfo. Not all are necessarily in the DuperModel for historical reasons. private final Set<ApplicationId> activeInfraInfos = new HashSet<>(10); @@ -83,27 +89,23 @@ public class DuperModelManager implements DuperModelProvider, DuperModelInfraApi superModelProvider.registerListener(new SuperModelListener() { @Override public void applicationActivated(SuperModel superModel, ApplicationInfo application) { - synchronized (monitor) { - duperModel.add(application); - } + lockedRunnable(() -> duperModel.add(application)); } @Override public void applicationRemoved(SuperModel superModel, ApplicationId applicationId) { - synchronized (monitor) { - duperModel.remove(applicationId); - } + lockedRunnable(() -> duperModel.remove(applicationId)); } @Override public void notifyOfCompleteness(SuperModel superModel) { - synchronized (monitor) { + lockedRunnable(() -> { if (!superModelIsComplete) { superModelIsComplete = true; logger.log(LogLevel.INFO, "All bootstrap tenant applications have been activated"); maybeSetDuperModelAsComplete(); } - } + }); } }); } @@ -114,9 +116,7 @@ public class DuperModelManager implements DuperModelProvider, DuperModelInfraApi */ @Override public void registerListener(DuperModelListener listener) { - synchronized (monitor) { - duperModel.registerListener(listener); - } + lockedRunnable(() -> duperModel.registerListener(listener)); } @Override @@ -138,9 +138,7 @@ public class DuperModelManager implements DuperModelProvider, DuperModelInfraApi @Override public boolean infraApplicationIsActive(ApplicationId applicationId) { - synchronized (monitor) { - return activeInfraInfos.contains(applicationId); - } + return lockedSupplier(() -> activeInfraInfos.contains(applicationId)); } @Override @@ -150,10 +148,10 @@ public class DuperModelManager implements DuperModelProvider, DuperModelInfraApi throw new IllegalArgumentException("There is no infrastructure application with ID '" + applicationId + "'"); } - synchronized (monitor) { + lockedRunnable(() -> { activeInfraInfos.add(applicationId); duperModel.add(application.makeApplicationInfo(hostnames)); - } + }); } @Override @@ -162,39 +160,41 @@ public class DuperModelManager implements DuperModelProvider, DuperModelInfraApi throw new IllegalArgumentException("There is no infrastructure application with ID '" + applicationId + "'"); } - synchronized (monitor) { + lockedRunnable(() -> { activeInfraInfos.remove(applicationId); duperModel.remove(applicationId); - } + }); } @Override public void infraApplicationsIsNowComplete() { - synchronized (monitor) { + lockedRunnable(() -> { if (!infraApplicationsIsComplete) { infraApplicationsIsComplete = true; logger.log(LogLevel.INFO, "All infrastructure applications have been activated"); maybeSetDuperModelAsComplete(); } - } + }); } public Optional<ApplicationInfo> getApplicationInfo(ApplicationId applicationId) { - synchronized (monitor) { - return duperModel.getApplicationInfo(applicationId); - } + return lockedSupplier(() -> duperModel.getApplicationInfo(applicationId)); } public Optional<ApplicationInfo> getApplicationInfo(HostName hostname) { - synchronized (monitor) { - return duperModel.getApplicationInfo(hostname); - } + return lockedSupplier(() -> duperModel.getApplicationInfo(hostname)); } public List<ApplicationInfo> getApplicationInfos() { - synchronized (monitor) { - return duperModel.getApplicationInfos(); - } + return lockedSupplier(() -> duperModel.getApplicationInfos()); + } + + /** + * Within the region, trying to accesss the duper model (without already having the lock) + * will cause an IllegalStateException to be thrown. + */ + public CriticalRegion disallowDuperModelLockAcquisition(String regionDescription) { + return disallowedDuperModeLockAcquisitionRegions.startCriticalRegion(regionDescription); } private void maybeSetDuperModelAsComplete() { @@ -202,4 +202,36 @@ public class DuperModelManager implements DuperModelProvider, DuperModelInfraApi duperModel.setComplete(); } } + + private <T> T lockedSupplier(Supplier<T> supplier) { + if (lock.isHeldByCurrentThread()) { + return supplier.get(); + } + + disallowedDuperModeLockAcquisitionRegions + .assertOutsideCriticalRegions("acquiring duper model lock"); + + lock.lock(); + try { + return supplier.get(); + } finally { + lock.unlock(); + } + } + + private void lockedRunnable(Runnable runnable) { + if (lock.isHeldByCurrentThread()) { + runnable.run(); + } + + disallowedDuperModeLockAcquisitionRegions + .assertOutsideCriticalRegions("acquiring duper model lock"); + + lock.lock(); + try { + runnable.run(); + } finally { + lock.unlock(); + } + } } diff --git a/service-monitor/src/main/java/com/yahoo/vespa/service/model/ServiceMonitorImpl.java b/service-monitor/src/main/java/com/yahoo/vespa/service/model/ServiceMonitorImpl.java index 7c0f291e121..678b0d3d5ba 100644 --- a/service-monitor/src/main/java/com/yahoo/vespa/service/model/ServiceMonitorImpl.java +++ b/service-monitor/src/main/java/com/yahoo/vespa/service/model/ServiceMonitorImpl.java @@ -14,6 +14,8 @@ import com.yahoo.vespa.applicationmodel.ServiceInstance; import com.yahoo.vespa.service.duper.DuperModelManager; import com.yahoo.vespa.service.manager.MonitorManager; import com.yahoo.vespa.service.manager.UnionMonitorManager; +import com.yahoo.vespa.service.monitor.AntiServiceMonitor; +import com.yahoo.vespa.service.monitor.CriticalRegion; import com.yahoo.vespa.service.monitor.ServiceHostListener; import com.yahoo.vespa.service.monitor.ServiceModel; import com.yahoo.vespa.service.monitor.ServiceMonitor; @@ -24,7 +26,7 @@ import java.util.Map; import java.util.Optional; import java.util.Set; -public class ServiceMonitorImpl implements ServiceMonitor { +public class ServiceMonitorImpl implements ServiceMonitor, AntiServiceMonitor { private final ServiceMonitorMetrics metrics; private final DuperModelManager duperModelManager; @@ -105,6 +107,11 @@ public class ServiceMonitorImpl implements ServiceMonitor { duperModelManager.registerListener(duperModelListener); } + @Override + public CriticalRegion disallowDuperModelLockAcquisition(String regionDescription) { + return duperModelManager.disallowDuperModelLockAcquisition(regionDescription); + } + private Optional<ApplicationInfo> getApplicationInfo(ApplicationInstanceReference reference) { ApplicationId applicationId = ApplicationInstanceGenerator.toApplicationId(reference); return duperModelManager.getApplicationInfo(applicationId); diff --git a/service-monitor/src/main/java/com/yahoo/vespa/service/monitor/AntiServiceMonitor.java b/service-monitor/src/main/java/com/yahoo/vespa/service/monitor/AntiServiceMonitor.java new file mode 100644 index 00000000000..1c0460c3797 --- /dev/null +++ b/service-monitor/src/main/java/com/yahoo/vespa/service/monitor/AntiServiceMonitor.java @@ -0,0 +1,25 @@ +// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.vespa.service.monitor; + +import com.yahoo.vespa.service.duper.DuperModelManager; + +/** + * Interface that allows a thread to declare it must NOT access certain parts of the ServiceMonitor. + * + * @author hakonhall + */ +public interface AntiServiceMonitor { + /** + * Disallow the current thread to acquire the "duper model lock" (see {@link DuperModelManager}), + * necessarily acquired by most of the {@link ServiceMonitor} methods, starting from now and + * up until the returned region is closed. + * + * <p>For instance, if an application is activated the duper model is notified: The duper model + * will acquire a lock to update the model atomically, and while having that lock notify + * the status service. The status service will typically acquire an application lock and prune + * hosts no longer part of the application. If a thread were to try to acquire these locks + * in the reverse order, it might become deadlocked. This method allows one to detect this + * and throw an exception, causing it to be caught earlier or at least easier to debug.</p> + */ + CriticalRegion disallowDuperModelLockAcquisition(String regionDescription); +} diff --git a/service-monitor/src/main/java/com/yahoo/vespa/service/monitor/CriticalRegion.java b/service-monitor/src/main/java/com/yahoo/vespa/service/monitor/CriticalRegion.java new file mode 100644 index 00000000000..ec674f133a1 --- /dev/null +++ b/service-monitor/src/main/java/com/yahoo/vespa/service/monitor/CriticalRegion.java @@ -0,0 +1,18 @@ +// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.vespa.service.monitor; + +/** + * Represents a region of execution, e.g. the block of a try-with-resources. + * + * @author hakonhall + */ +@FunctionalInterface +public interface CriticalRegion extends AutoCloseable { + /** + * Ends the critical region. + * + * @throws IllegalStateException if the current thread is different from the one that started + * the critical region. + */ + @Override void close() throws IllegalStateException; +} |