diff options
author | Håkon Hallingstad <hakon@verizonmedia.com> | 2020-03-07 14:58:35 +0100 |
---|---|---|
committer | Håkon Hallingstad <hakon@verizonmedia.com> | 2020-03-07 14:58:35 +0100 |
commit | 92a2c10c684ee128b25e2128433451db438fc14a (patch) | |
tree | 2e154f542cea745afca461898d98af626d1dca68 /orchestrator | |
parent | b241838f43794aa6a84df55a0f6e9e6877060021 (diff) |
Throw if accessing duper model while holding status service application lock
When the duper model is updated, ZooKeeper is atomically updated to e.g. remove
extraneous hosts. This is done by acquiring the duper model lock first, then
the relevant application lock in the status service.
Acquiring these two locks in the reverse order may lead to a deadlock.
This PR throws an IllegalStateException when detecting the current thread is
about to acquire the duper model lock when the current thread has acquired the
application lock.
Diffstat (limited to 'orchestrator')
8 files changed, 113 insertions, 21 deletions
diff --git a/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/status/ZkStatusService.java b/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/status/ZkStatusService.java index 9874903e976..3bf0d886a41 100644 --- a/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/status/ZkStatusService.java +++ b/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/status/ZkStatusService.java @@ -16,6 +16,8 @@ import com.yahoo.vespa.flags.FlagSource; import com.yahoo.vespa.flags.Flags; import com.yahoo.vespa.orchestrator.OrchestratorContext; import com.yahoo.vespa.orchestrator.OrchestratorUtil; +import com.yahoo.vespa.service.monitor.AntiServiceMonitor; +import com.yahoo.vespa.service.monitor.CriticalRegion; import org.apache.zookeeper.data.Stat; import javax.inject.Inject; @@ -46,6 +48,7 @@ public class ZkStatusService implements StatusService { private final Metric metric; private final Timer timer; private final boolean doCleanup; + private final AntiServiceMonitor antiServiceMonitor; /** * A cache of metric contexts for each possible dimension map. In practice, there is one dimension map @@ -58,21 +61,25 @@ public class ZkStatusService implements StatusService { @Component Curator curator, @Component Metric metric, @Component Timer timer, - @Component FlagSource flagSource) { + @Component FlagSource flagSource, + @Component AntiServiceMonitor antiServiceMonitor) { this(curator, metric, timer, new HostInfosCache(curator, new HostInfosServiceImpl(curator, timer)), - Flags.CLEANUP_STATUS_SERVICE.bindTo(flagSource).value()); + Flags.CLEANUP_STATUS_SERVICE.bindTo(flagSource).value(), + antiServiceMonitor); } /** Non-private for testing only. */ - ZkStatusService(Curator curator, Metric metric, Timer timer, HostInfosCache hostInfosCache, boolean doCleanup) { + ZkStatusService(Curator curator, Metric metric, Timer timer, HostInfosCache hostInfosCache, + boolean doCleanup, AntiServiceMonitor antiServiceMonitor) { this.curator = curator; this.metric = metric; this.timer = timer; this.hostInfosCache = hostInfosCache; this.doCleanup = doCleanup; + this.antiServiceMonitor = antiServiceMonitor; } @Override @@ -186,6 +193,9 @@ public class ZkStatusService implements StatusService { metric.add(acquireResultMetricName, 1, metricContext); } + CriticalRegion inaccessibleDuperModelRegion = antiServiceMonitor + .disallowDuperModelLockAcquisition(ZkStatusService.class.getSimpleName() + " application lock"); + return () -> { try { lock.close(); @@ -197,6 +207,8 @@ public class ZkStatusService implements StatusService { e); } + inaccessibleDuperModelRegion.close(); + Instant lockReleasedTime = timer.currentTime(); double seconds = durationInSeconds(acquireEndTime, lockReleasedTime); metric.set("orchestrator.lock.hold-latency", seconds, metricContext); diff --git a/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/DummyAntiServiceMonitor.java b/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/DummyAntiServiceMonitor.java new file mode 100644 index 00000000000..d22d99b5c9c --- /dev/null +++ b/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/DummyAntiServiceMonitor.java @@ -0,0 +1,15 @@ +// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.vespa.orchestrator; + +import com.yahoo.vespa.service.monitor.AntiServiceMonitor; +import com.yahoo.vespa.service.monitor.CriticalRegion; + +/** + * @author hakonhall + */ +public class DummyAntiServiceMonitor implements AntiServiceMonitor { + @Override + public CriticalRegion disallowDuperModelLockAcquisition(String regionDescription) { + return () -> {}; + } +} diff --git a/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/DummyServiceMonitor.java b/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/DummyServiceMonitor.java index 0da4c48d408..a784187fb62 100644 --- a/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/DummyServiceMonitor.java +++ b/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/DummyServiceMonitor.java @@ -15,6 +15,8 @@ import com.yahoo.vespa.applicationmodel.ServiceType; import com.yahoo.vespa.applicationmodel.TenantId; import com.yahoo.vespa.orchestrator.model.NodeGroup; import com.yahoo.vespa.orchestrator.model.VespaModelUtil; +import com.yahoo.vespa.service.monitor.AntiServiceMonitor; +import com.yahoo.vespa.service.monitor.CriticalRegion; import com.yahoo.vespa.service.monitor.ServiceModel; import com.yahoo.vespa.service.monitor.ServiceMonitor; @@ -31,7 +33,7 @@ import java.util.stream.Collectors; * @author oyving * @author smorgrav */ -public class DummyServiceMonitor implements ServiceMonitor { +public class DummyServiceMonitor implements ServiceMonitor, AntiServiceMonitor { public static final HostName TEST1_HOST_NAME = new HostName("test1.hostname.tld"); public static final HostName TEST3_HOST_NAME = new HostName("test3.hostname.tld"); @@ -165,6 +167,11 @@ public class DummyServiceMonitor implements ServiceMonitor { throw new UnsupportedOperationException(); } + @Override + public CriticalRegion disallowDuperModelLockAcquisition(String regionDescription) { + return () -> {}; + } + public static Set<HostName> getContentHosts(ApplicationInstanceReference appRef) { Set<HostName> hosts = apps.stream() .filter(application -> application.reference().equals(appRef)) diff --git a/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/OrchestratorImplTest.java b/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/OrchestratorImplTest.java index 44e11fb794b..f49dc490fe6 100644 --- a/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/OrchestratorImplTest.java +++ b/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/OrchestratorImplTest.java @@ -76,7 +76,12 @@ public class OrchestratorImplTest { private final ApplicationApiFactory applicationApiFactory = new ApplicationApiFactory(3); private final InMemoryFlagSource flagSource = new InMemoryFlagSource(); private final MockCurator curator = new MockCurator(); - private ZkStatusService statusService = new ZkStatusService(curator, mock(Metric.class), new TestTimer(), flagSource); + private ZkStatusService statusService = new ZkStatusService( + curator, + mock(Metric.class), + new TestTimer(), + flagSource, + new DummyAntiServiceMonitor()); private ApplicationId app1; private ApplicationId app2; @@ -394,7 +399,12 @@ public class OrchestratorImplTest { @Test public void testGetHost() throws Exception { ClusterControllerClientFactory clusterControllerClientFactory = new ClusterControllerClientFactoryMock(); - StatusService statusService = new ZkStatusService(new MockCurator(), mock(Metric.class), new TestTimer(), flagSource); + StatusService statusService = new ZkStatusService( + new MockCurator(), + mock(Metric.class), + new TestTimer(), + flagSource, + new DummyAntiServiceMonitor()); HostName hostName = new HostName("host.yahoo.com"); TenantId tenantId = new TenantId("tenant"); diff --git a/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/model/ModelTestUtils.java b/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/model/ModelTestUtils.java index e6a14f75b24..88e9d1c90fe 100644 --- a/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/model/ModelTestUtils.java +++ b/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/model/ModelTestUtils.java @@ -17,6 +17,7 @@ import com.yahoo.vespa.applicationmodel.ServiceType; import com.yahoo.vespa.applicationmodel.TenantId; import com.yahoo.vespa.curator.mock.MockCurator; import com.yahoo.vespa.flags.InMemoryFlagSource; +import com.yahoo.vespa.orchestrator.DummyAntiServiceMonitor; import com.yahoo.vespa.orchestrator.OrchestrationException; import com.yahoo.vespa.orchestrator.Orchestrator; import com.yahoo.vespa.orchestrator.OrchestratorContext; @@ -57,9 +58,13 @@ class ModelTestUtils { private final Map<ApplicationInstanceReference, ApplicationInstance> applications = new HashMap<>(); private final ClusterControllerClientFactory clusterControllerClientFactory = new ClusterControllerClientFactoryMock(); private final Map<HostName, HostStatus> hostStatusMap = new HashMap<>(); - private final StatusService statusService = new ZkStatusService(new MockCurator(), mock(Metric.class), new TestTimer(), flagSource); - private final TestTimer timer = new TestTimer(); private final ServiceMonitor serviceMonitor = () -> new ServiceModel(applications); + private final StatusService statusService = new ZkStatusService( + new MockCurator(), + mock(Metric.class), + new TestTimer(), + flagSource, + new DummyAntiServiceMonitor()); private final Orchestrator orchestrator = new OrchestratorImpl(new HostedVespaPolicy(new HostedVespaClusterPolicy(), clusterControllerClientFactory, applicationApiFactory()), clusterControllerClientFactory, statusService, diff --git a/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/resources/HostResourceTest.java b/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/resources/HostResourceTest.java index 1bf3b267a05..ffb79b4edf6 100644 --- a/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/resources/HostResourceTest.java +++ b/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/resources/HostResourceTest.java @@ -19,6 +19,7 @@ import com.yahoo.vespa.curator.mock.MockCurator; import com.yahoo.vespa.flags.InMemoryFlagSource; import com.yahoo.vespa.orchestrator.BatchHostNameNotFoundException; import com.yahoo.vespa.orchestrator.BatchInternalErrorException; +import com.yahoo.vespa.orchestrator.DummyAntiServiceMonitor; import com.yahoo.vespa.orchestrator.Host; import com.yahoo.vespa.orchestrator.HostNameNotFoundException; import com.yahoo.vespa.orchestrator.OrchestrationException; @@ -79,11 +80,12 @@ public class HostResourceTest { private static final int SERVICE_MONITOR_CONVERGENCE_LATENCY_SECONDS = 0; private static final TenantId TENANT_ID = new TenantId("tenantId"); private static final ApplicationInstanceId APPLICATION_INSTANCE_ID = new ApplicationInstanceId("applicationId"); + private static final ServiceMonitor serviceMonitor = mock(ServiceMonitor.class); private static final StatusService EVERY_HOST_IS_UP_HOST_STATUS_SERVICE = new ZkStatusService( - new MockCurator(), mock(Metric.class), new TestTimer(), new InMemoryFlagSource()); + new MockCurator(), mock(Metric.class), new TestTimer(), new InMemoryFlagSource(), + new DummyAntiServiceMonitor()); private static final ApplicationApiFactory applicationApiFactory = new ApplicationApiFactory(3); - private static final ServiceMonitor serviceMonitor = mock(ServiceMonitor.class); static { when(serviceMonitor.getApplication(any(HostName.class))) .thenReturn(Optional.of( diff --git a/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/status/ZkStatusService2Test.java b/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/status/ZkStatusService2Test.java index 4c01572f787..e8cfe15611e 100644 --- a/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/status/ZkStatusService2Test.java +++ b/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/status/ZkStatusService2Test.java @@ -9,6 +9,8 @@ import com.yahoo.vespa.applicationmodel.ApplicationInstanceReference; import com.yahoo.vespa.applicationmodel.TenantId; import com.yahoo.vespa.curator.Curator; import com.yahoo.vespa.orchestrator.OrchestratorContext; +import com.yahoo.vespa.service.monitor.AntiServiceMonitor; +import com.yahoo.vespa.service.monitor.CriticalRegion; import org.apache.curator.framework.recipes.locks.InterProcessMutex; import org.junit.Test; import org.mockito.ArgumentCaptor; @@ -32,7 +34,10 @@ public class ZkStatusService2Test { private final Timer timer = new TestTimer(); private final Metric metric = mock(Metric.class); private final HostInfosCache cache = mock(HostInfosCache.class); - private final ZkStatusService zkStatusService = new ZkStatusService(curator, metric, timer, cache, false); + private final CriticalRegion criticalRegion = mock(CriticalRegion.class); + private final AntiServiceMonitor antiServiceMonitor = mock(AntiServiceMonitor.class); + private final ZkStatusService zkStatusService = + new ZkStatusService(curator, metric, timer, cache, false, antiServiceMonitor); private final OrchestratorContext context = mock(OrchestratorContext.class); private final InterProcessMutex mutex = mock(InterProcessMutex.class); @@ -47,34 +52,43 @@ public class ZkStatusService2Test { when(curator.createMutex(any())).thenReturn(mutex); when(mutex.acquire(anyLong(), any())).thenReturn(true); + when(antiServiceMonitor.disallowDuperModelLockAcquisition(any())).thenReturn(criticalRegion); when(context.getTimeLeft()).thenReturn(Duration.ofSeconds(12)); + verify(antiServiceMonitor, times(0)).disallowDuperModelLockAcquisition(any()); try (ApplicationLock lock = zkStatusService.lockApplication(context, reference)) { - // nothing + verify(antiServiceMonitor, times(1)).disallowDuperModelLockAcquisition(any()); + + verify(criticalRegion, times(0)).close(); } + verify(criticalRegion, times(1)).close(); verify(curator, times(1)).createMutex(any()); verify(mutex, times(1)).acquire(anyLong(), any()); verify(mutex, times(1)).release(); verify(context, times(1)).hasLock(any()); verify(context, times(1)).registerLockAcquisition(any(), any()); - verifyNoMoreInteractions(mutex); + verifyNoMoreInteractions(mutex, antiServiceMonitor, criticalRegion); // Now the non-probe suspension when(context.isProbe()).thenReturn(false); + verify(antiServiceMonitor, times(1)).disallowDuperModelLockAcquisition(any()); try (ApplicationLock lock = zkStatusService.lockApplication(context, reference)) { - // nothing + verify(antiServiceMonitor, times(2)).disallowDuperModelLockAcquisition(any()); + + verify(criticalRegion, times(1)).close(); } + verify(criticalRegion, times(2)).close(); verify(mutex, times(2)).acquire(anyLong(), any()); verify(mutex, times(2)).release(); ArgumentCaptor<Runnable> runnableCaptor = ArgumentCaptor.forClass(Runnable.class); verify(context, times(2)).hasLock(any()); verify(context, times(2)).registerLockAcquisition(any(), any()); - verifyNoMoreInteractions(mutex); + verifyNoMoreInteractions(mutex, antiServiceMonitor, criticalRegion); } @Test @@ -85,6 +99,7 @@ public class ZkStatusService2Test { when(curator.createMutex(any())).thenReturn(mutex); when(mutex.acquire(anyLong(), any())).thenReturn(true); + when(antiServiceMonitor.disallowDuperModelLockAcquisition(any())).thenReturn(criticalRegion); when(context.getTimeLeft()).thenReturn(Duration.ofSeconds(12)); @@ -97,7 +112,9 @@ public class ZkStatusService2Test { verify(mutex, times(0)).release(); verify(context, times(1)).hasLock(any()); verify(context, times(1)).registerLockAcquisition(any(), any()); - verifyNoMoreInteractions(mutex); + verify(antiServiceMonitor, times(1)).disallowDuperModelLockAcquisition(any()); + verify(criticalRegion, times(0)).close(); + verifyNoMoreInteractions(mutex, antiServiceMonitor, criticalRegion); // Now the non-probe suspension @@ -114,7 +131,9 @@ public class ZkStatusService2Test { verify(mutex, times(0)).release(); verify(context, times(2)).hasLock(any()); verify(context, times(1)).registerLockAcquisition(any(), any()); - verifyNoMoreInteractions(mutex); + verify(antiServiceMonitor, times(1)).disallowDuperModelLockAcquisition(any()); + verify(criticalRegion, times(0)).close(); + verifyNoMoreInteractions(mutex, antiServiceMonitor, criticalRegion); // Verify the context runnable releases the mutex ArgumentCaptor<Runnable> runnableCaptor = ArgumentCaptor.forClass(Runnable.class); @@ -123,5 +142,7 @@ public class ZkStatusService2Test { runnableCaptor.getAllValues().forEach(Runnable::run); verify(mutex, times(1)).acquire(anyLong(), any()); verify(mutex, times(1)).release(); + verify(criticalRegion, times(1)).close(); + verifyNoMoreInteractions(mutex, antiServiceMonitor, criticalRegion); } }
\ No newline at end of file diff --git a/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/status/ZkStatusServiceTest.java b/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/status/ZkStatusServiceTest.java index 3c3c337f4e1..9fe910079cf 100644 --- a/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/status/ZkStatusServiceTest.java +++ b/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/status/ZkStatusServiceTest.java @@ -16,6 +16,9 @@ import com.yahoo.vespa.flags.InMemoryFlagSource; import com.yahoo.vespa.orchestrator.OrchestratorContext; import com.yahoo.vespa.orchestrator.OrchestratorUtil; import com.yahoo.vespa.orchestrator.TestIds; +import com.yahoo.vespa.service.monitor.AntiServiceMonitor; +import com.yahoo.vespa.service.monitor.CriticalRegion; +import com.yahoo.vespa.service.monitor.ServiceMonitor; import com.yahoo.yolean.Exceptions; import org.apache.curator.framework.CuratorFramework; import org.apache.curator.test.KillSession; @@ -57,6 +60,7 @@ import static org.junit.Assert.fail; import static org.mockito.ArgumentMatchers.any; import static org.mockito.ArgumentMatchers.eq; import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.times; import static org.mockito.Mockito.verify; import static org.mockito.Mockito.when; @@ -68,7 +72,10 @@ public class ZkStatusServiceTest { private final Timer timer = mock(Timer.class); private final Metric metric = mock(Metric.class); private final OrchestratorContext context = mock(OrchestratorContext.class); - private InMemoryFlagSource flagSource = new InMemoryFlagSource(); + private final InMemoryFlagSource flagSource = new InMemoryFlagSource(); + private final ServiceMonitor serviceMonitor = mock(ServiceMonitor.class); + private final CriticalRegion criticalRegion = mock(CriticalRegion.class); + private final AntiServiceMonitor antiServiceMonitor = mock(AntiServiceMonitor.class); @Captor private ArgumentCaptor<Map<String, String>> captor; @@ -78,11 +85,12 @@ public class ZkStatusServiceTest { testingServer = new TestingServer(); curator = createConnectedCurator(testingServer); - statusService = new ZkStatusService(curator, metric, timer, flagSource); + statusService = new ZkStatusService(curator, metric, timer, flagSource, antiServiceMonitor); when(context.getTimeLeft()).thenReturn(Duration.ofSeconds(10)); when(context.isProbe()).thenReturn(false); when(timer.currentTime()).thenReturn(Instant.ofEpochMilli(1)); when(timer.toUtcClock()).thenReturn(new ManualClock(Instant.ofEpochMilli(1))); + when(antiServiceMonitor.disallowDuperModelLockAcquisition(any())).thenReturn(criticalRegion); } private static Curator createConnectedCurator(TestingServer server) throws InterruptedException { @@ -151,7 +159,7 @@ public class ZkStatusServiceTest { public void locks_are_exclusive() throws Exception { setUp(); ZkStatusService zkStatusService2 = - new ZkStatusService(curator, mock(Metric.class), new TestTimer(), flagSource); + new ZkStatusService(curator, mock(Metric.class), new TestTimer(), flagSource, antiServiceMonitor); final CompletableFuture<Void> lockedSuccessfullyFuture; try (ApplicationLock lock = statusService @@ -178,7 +186,7 @@ public class ZkStatusServiceTest { public void failing_to_get_lock_closes_SessionFailRetryLoop() throws Exception { setUp(); ZkStatusService zkStatusService2 = - new ZkStatusService(curator, mock(Metric.class), new TestTimer(), flagSource); + new ZkStatusService(curator, mock(Metric.class), new TestTimer(), flagSource, antiServiceMonitor); try (ApplicationLock lock = statusService .lockApplication(context, TestIds.APPLICATION_INSTANCE_REFERENCE)) { @@ -309,16 +317,28 @@ public class ZkStatusServiceTest { HostName strayHostname = new HostName("stray1.com"); + verify(antiServiceMonitor, times(0)).disallowDuperModelLockAcquisition(any()); try (ApplicationLock lock = statusService.lockApplication(context, TestIds.APPLICATION_INSTANCE_REFERENCE)) { + verify(antiServiceMonitor, times(1)).disallowDuperModelLockAcquisition(any()); + lock.setApplicationInstanceStatus(ApplicationInstanceStatus.ALLOWED_TO_BE_DOWN); lock.setHostState(TestIds.HOST_NAME1, HostStatus.ALLOWED_TO_BE_DOWN); lock.setHostState(strayHostname, HostStatus.PERMANENTLY_DOWN); + + verify(criticalRegion, times(0)).close(); } + verify(criticalRegion, times(1)).close(); + verify(antiServiceMonitor, times(1)).disallowDuperModelLockAcquisition(any()); try (ApplicationLock lock = statusService.lockApplication(context, TestIds.APPLICATION_INSTANCE_REFERENCE2)) { + verify(antiServiceMonitor, times(2)).disallowDuperModelLockAcquisition(any()); + lock.setApplicationInstanceStatus(ApplicationInstanceStatus.ALLOWED_TO_BE_DOWN); + + verify(criticalRegion, times(1)).close(); } + verify(criticalRegion, times(2)).close(); ApplicationId applicationId = OrchestratorUtil.toApplicationId(TestIds.APPLICATION_INSTANCE_REFERENCE); assertEquals("test-tenant:test-application:test-environment:test-region:test-instance-key", |