summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--CMakeLists.txt1
-rw-r--r--clustercontroller-apps/src/main/java/com/yahoo/vespa/clustercontroller/apps/clustercontroller/ClusterControllerClusterConfigurer.java1
-rw-r--r--clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ActivateClusterStateVersionRequest.java13
-rw-r--r--clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ClusterStateBundle.java80
-rw-r--r--clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ClusterStateVersionSpecificRequest.java67
-rw-r--r--clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/Communicator.java2
-rw-r--r--clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ContentCluster.java2
-rw-r--r--clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetController.java32
-rw-r--r--clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetControllerOptions.java3
-rw-r--r--clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeInfo.java89
-rw-r--r--clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/SetClusterStateRequest.java44
-rw-r--r--clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/SystemStateBroadcaster.java258
-rw-r--r--clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/listeners/SystemStateListener.java8
-rw-r--r--clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/rpc/RPCActivateClusterStateVersionRequest.java20
-rw-r--r--clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/rpc/RPCActivateClusterStateVersionWaiter.java47
-rw-r--r--clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/rpc/RPCCommunicator.java32
-rw-r--r--clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/rpc/SlimeClusterStateBundleCodec.java6
-rw-r--r--clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/status/LegacyIndexPageRequestHandler.java2
-rw-r--r--clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/status/statuspage/VdsClusterHtmlRendrer.java265
-rw-r--r--clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/ClusterStateBundleTest.java83
-rw-r--r--clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/ClusterStateBundleUtil.java6
-rw-r--r--clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/ContentClusterHtmlRendrerTest.java7
-rw-r--r--clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/DatabaseTest.java4
-rw-r--r--clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/DistributionBitCountTest.java2
-rw-r--r--clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/DummyCommunicator.java19
-rw-r--r--clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/DummyVdsNode.java70
-rw-r--r--clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/DummyVdsNodeOptions.java4
-rw-r--r--clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/FleetControllerTest.java12
-rw-r--r--clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/GroupAutoTakedownLiveConfigTest.java2
-rw-r--r--clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/MasterElectionTest.java20
-rw-r--r--clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/NoZooKeeperTest.java2
-rw-r--r--clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/NodeSlobrokConfigurationMembershipTest.java2
-rw-r--r--clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/RpcServerTest.java30
-rw-r--r--clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/RpcVersionAutoDowngradeTest.java13
-rw-r--r--clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/SlobrokTest.java4
-rw-r--r--clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/StateChangeTest.java66
-rw-r--r--clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/StateGatherTest.java2
-rw-r--r--clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/StatusPagesTest.java12
-rw-r--r--clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/SystemStateBroadcasterTest.java223
-rw-r--r--clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/WantedStateTest.java4
-rw-r--r--clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/rpc/RPCCommunicatorTest.java31
-rw-r--r--clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/rpc/SlimeClusterStateBundleCodecTest.java16
-rw-r--r--clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/testutils/WaitCondition.java13
-rw-r--r--config-model/src/main/java/com/yahoo/searchdefinition/MapEvaluationTypeContext.java20
-rw-r--r--config-model/src/main/java/com/yahoo/searchdefinition/RankProfile.java2
-rw-r--r--config-model/src/main/java/com/yahoo/vespa/model/ml/ConvertedModel.java8
-rw-r--r--config-model/src/test/java/com/yahoo/searchdefinition/derived/GeminiTestCase.java4
-rw-r--r--configdefinitions/src/vespa/fleetcontroller.def12
-rw-r--r--container-core/src/main/java/com/yahoo/restapi/Path.java2
-rw-r--r--controller-api/src/main/java/com/yahoo/vespa/hosted/controller/api/integration/zone/ZoneRegistry.java3
-rw-r--r--controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/Upgrader.java7
-rw-r--r--controller-server/src/main/java/com/yahoo/vespa/hosted/controller/proxy/ConfigServerRestExecutorImpl.java2
-rw-r--r--controller-server/src/main/java/com/yahoo/vespa/hosted/controller/restapi/application/ApplicationApiHandler.java4
-rw-r--r--controller-server/src/main/java/com/yahoo/vespa/hosted/controller/role/PathGroup.java9
-rw-r--r--controller-server/src/main/java/com/yahoo/vespa/hosted/controller/versions/OsVersion.java12
-rw-r--r--controller-server/src/test/java/com/yahoo/vespa/hosted/controller/integration/ZoneRegistryMock.java19
-rw-r--r--controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/application/responses/deployment.json1
-rw-r--r--controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/application/responses/prod-us-central-1.json1
-rw-r--r--controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/user/UserApiTest.java2
-rw-r--r--controller-server/src/test/java/com/yahoo/vespa/hosted/controller/role/PathGroupTest.java7
-rw-r--r--controller-server/src/test/java/com/yahoo/vespa/hosted/controller/role/RoleMembershipTest.java3
-rw-r--r--defaults/abi-spec.json1
-rw-r--r--defaults/src/main/java/com/yahoo/vespa/defaults/Defaults.java17
-rw-r--r--defaults/src/test/java/com/yahoo/vespa/defaults/DefaultsTestCase.java18
-rw-r--r--docker-api/src/main/java/com/yahoo/vespa/hosted/dockerapi/DockerImpl.java20
-rw-r--r--document/src/vespa/document/select/operator.cpp10
-rw-r--r--flags/src/main/java/com/yahoo/vespa/flags/Flags.java5
-rw-r--r--logd/CMakeLists.txt1
-rw-r--r--logd/src/logd/CMakeLists.txt4
-rw-r--r--logd/src/logd/legacy_forwarder.cpp121
-rw-r--r--logd/src/logd/legacy_forwarder.h11
-rw-r--r--logd/src/logd/log_protocol_proto.h11
-rw-r--r--logd/src/logd/proto_converter.cpp65
-rw-r--r--logd/src/logd/proto_converter.h20
-rw-r--r--logd/src/tests/proto_converter/CMakeLists.txt9
-rw-r--r--logd/src/tests/proto_converter/proto_converter_test.cpp88
-rw-r--r--metrics/src/main/java/com/yahoo/metrics/Metric.java2
-rw-r--r--metrics/src/main/java/com/yahoo/metrics/MetricSet.java6
-rw-r--r--node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/docker/DockerOperationsImpl.java15
-rw-r--r--node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/StorageMaintainer.java2
-rw-r--r--node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminImpl.java10
-rw-r--r--pom.xml3
-rw-r--r--searchcore/src/tests/proton/docsummary/docsummary.cpp4
-rw-r--r--searchlib/src/main/java/com/yahoo/searchlib/rankingexpression/Reference.java12
-rw-r--r--simplemetrics/src/main/java/com/yahoo/metrics/simple/Bucket.java2
-rw-r--r--simplemetrics/src/main/java/com/yahoo/metrics/simple/DimensionCache.java14
-rw-r--r--simplemetrics/src/main/java/com/yahoo/metrics/simple/Measurement.java2
-rw-r--r--simplemetrics/src/main/java/com/yahoo/metrics/simple/MetricAggregator.java8
-rw-r--r--simplemetrics/src/main/java/com/yahoo/metrics/simple/MetricReceiver.java2
-rw-r--r--simplemetrics/src/main/java/com/yahoo/metrics/simple/Point.java1
-rw-r--r--simplemetrics/src/main/java/com/yahoo/metrics/simple/Sample.java3
-rw-r--r--simplemetrics/src/main/java/com/yahoo/metrics/simple/jdisc/SimpleMetricConsumer.java2
-rw-r--r--storage/src/tests/distributor/bucketdbupdatertest.cpp304
-rw-r--r--storage/src/tests/distributor/distributortestutil.cpp10
-rw-r--r--storage/src/tests/distributor/distributortestutil.h2
-rw-r--r--storage/src/tests/distributor/externaloperationhandlertest.cpp162
-rw-r--r--storage/src/tests/storageserver/bouncertest.cpp14
-rw-r--r--storage/src/tests/storageserver/fnet_listener_test.cpp88
-rw-r--r--storage/src/tests/storageserver/statemanagertest.cpp30
-rw-r--r--storage/src/vespa/storage/config/distributorconfiguration.cpp2
-rw-r--r--storage/src/vespa/storage/config/distributorconfiguration.h8
-rw-r--r--storage/src/vespa/storage/config/stor-distributormanager.def7
-rw-r--r--storage/src/vespa/storage/distributor/bucketdbupdater.cpp127
-rw-r--r--storage/src/vespa/storage/distributor/bucketdbupdater.h21
-rw-r--r--storage/src/vespa/storage/distributor/distributor.cpp19
-rw-r--r--storage/src/vespa/storage/distributor/distributor.h11
-rw-r--r--storage/src/vespa/storage/distributor/distributorcomponent.cpp8
-rw-r--r--storage/src/vespa/storage/distributor/distributorcomponent.h15
-rw-r--r--storage/src/vespa/storage/distributor/distributormetricsset.cpp2
-rw-r--r--storage/src/vespa/storage/distributor/distributormetricsset.h2
-rw-r--r--storage/src/vespa/storage/distributor/externaloperationhandler.cpp136
-rw-r--r--storage/src/vespa/storage/distributor/externaloperationhandler.h15
-rw-r--r--storage/src/vespa/storage/distributor/idealstatemanager.cpp3
-rw-r--r--storage/src/vespa/storage/distributor/idealstatemanager.h1
-rw-r--r--storage/src/vespa/storage/distributor/operations/external/getoperation.h3
-rw-r--r--storage/src/vespa/storage/distributor/pendingclusterstate.cpp12
-rw-r--r--storage/src/vespa/storage/distributor/pendingclusterstate.h55
-rw-r--r--storage/src/vespa/storage/storageserver/bouncer.cpp1
-rw-r--r--storage/src/vespa/storage/storageserver/communicationmanager.cpp9
-rw-r--r--storage/src/vespa/storage/storageserver/fnetlistener.cpp22
-rw-r--r--storage/src/vespa/storage/storageserver/fnetlistener.h1
-rw-r--r--storage/src/vespa/storage/storageserver/slime_cluster_state_bundle_codec.cpp9
-rw-r--r--storage/src/vespa/storage/storageserver/statemanager.cpp13
-rw-r--r--storage/src/vespa/storage/storageserver/statemanager.h1
-rw-r--r--storageapi/src/vespa/storageapi/message/state.cpp36
-rw-r--r--storageapi/src/vespa/storageapi/message/state.h23
-rw-r--r--storageapi/src/vespa/storageapi/messageapi/messagehandler.h8
-rw-r--r--storageapi/src/vespa/storageapi/messageapi/storagemessage.cpp2
-rw-r--r--storageapi/src/vespa/storageapi/messageapi/storagemessage.h4
-rw-r--r--vdslib/src/vespa/vdslib/state/cluster_state_bundle.cpp33
-rw-r--r--vdslib/src/vespa/vdslib/state/cluster_state_bundle.h7
-rw-r--r--vespa-testrunner-components/CMakeLists.txt3
-rw-r--r--vespa-testrunner-components/OWNERS1
-rw-r--r--vespa-testrunner-components/README.md4
-rw-r--r--vespa-testrunner-components/pom.xml91
-rw-r--r--vespa-testrunner-components/src/main/java/com/yahoo/vespa/hosted/testrunner/PomXmlGenerator.java110
-rw-r--r--vespa-testrunner-components/src/main/java/com/yahoo/vespa/hosted/testrunner/TestProfile.java29
-rw-r--r--vespa-testrunner-components/src/main/java/com/yahoo/vespa/hosted/testrunner/TestRunner.java195
-rw-r--r--vespa-testrunner-components/src/main/java/com/yahoo/vespa/hosted/testrunner/TestRunnerHandler.java166
-rw-r--r--vespa-testrunner-components/src/main/resources/configdefinitions/test-runner.def4
-rw-r--r--vespa-testrunner-components/src/test/java/com/yahoo/vespa/hosted/testrunner/PomXmlGeneratorTest.java33
-rw-r--r--vespa-testrunner-components/src/test/java/com/yahoo/vespa/hosted/testrunner/TestRunnerHandlerTest.java37
-rw-r--r--vespa-testrunner-components/src/test/java/com/yahoo/vespa/hosted/testrunner/TestRunnerTest.java127
-rw-r--r--vespa-testrunner-components/src/test/resources/pom.xml_system_tests72
-rwxr-xr-xvespabase/src/rhel-prestart.sh1
-rw-r--r--vespalog/src/vespa/log/log_message.cpp19
-rw-r--r--vespalog/src/vespa/log/log_message.h8
147 files changed, 3590 insertions, 762 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8456b79f124..eba69f6ed02 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -110,6 +110,7 @@ add_subdirectory(vdslib)
add_subdirectory(vdstestlib)
add_subdirectory(vespa-athenz)
add_subdirectory(vespa-http-client)
+add_subdirectory(vespa-testrunner-components)
add_subdirectory(vespa_feed_perf)
add_subdirectory(vespa_jersey2)
add_subdirectory(vespabase)
diff --git a/clustercontroller-apps/src/main/java/com/yahoo/vespa/clustercontroller/apps/clustercontroller/ClusterControllerClusterConfigurer.java b/clustercontroller-apps/src/main/java/com/yahoo/vespa/clustercontroller/apps/clustercontroller/ClusterControllerClusterConfigurer.java
index b15cb2ad399..c95d814eb99 100644
--- a/clustercontroller-apps/src/main/java/com/yahoo/vespa/clustercontroller/apps/clustercontroller/ClusterControllerClusterConfigurer.java
+++ b/clustercontroller-apps/src/main/java/com/yahoo/vespa/clustercontroller/apps/clustercontroller/ClusterControllerClusterConfigurer.java
@@ -75,6 +75,7 @@ public class ClusterControllerClusterConfigurer {
options.setMaxDeferredTaskVersionWaitTime(Duration.ofMillis((int)(config.max_deferred_task_version_wait_time_sec() * 1000)));
options.clusterHasGlobalDocumentTypes = config.cluster_has_global_document_types();
options.minMergeCompletionRatio = config.min_merge_completion_ratio();
+ options.enableTwoPhaseClusterStateActivation = config.enable_two_phase_cluster_state_transitions();
}
private void configure(SlobroksConfig config) {
diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ActivateClusterStateVersionRequest.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ActivateClusterStateVersionRequest.java
new file mode 100644
index 00000000000..26d63f7ba60
--- /dev/null
+++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ActivateClusterStateVersionRequest.java
@@ -0,0 +1,13 @@
+// Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.vespa.clustercontroller.core;
+
+/**
+ * Wrapper for a cluster state activation request towards a single node.
+ */
+public class ActivateClusterStateVersionRequest extends ClusterStateVersionSpecificRequest {
+
+ public ActivateClusterStateVersionRequest(NodeInfo nodeInfo, int systemStateVersion) {
+ super(nodeInfo, systemStateVersion);
+ }
+
+}
diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ClusterStateBundle.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ClusterStateBundle.java
index 76177e8f1c1..fc06fef5b30 100644
--- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ClusterStateBundle.java
+++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ClusterStateBundle.java
@@ -19,16 +19,23 @@ import java.util.stream.Collectors;
*
* The baseline state is identical to the legacy, global cluster state that the
* cluster controller has historically produced as its only output.
+ *
+ * The bundle also contains an additional "deferred activation" flag which tells
+ * the recipient if the cluster state transition should complete immediately or
+ * await an explicit activation RPC from the cluster controller.
*/
public class ClusterStateBundle {
private final AnnotatedClusterState baselineState;
private final Map<String, AnnotatedClusterState> derivedBucketSpaceStates;
+ private final boolean deferredActivation;
public static class Builder {
private final AnnotatedClusterState baselineState;
+ private Map<String, AnnotatedClusterState> explicitDerivedStates;
private ClusterStateDeriver stateDeriver;
private Set<String> bucketSpaces;
+ private boolean deferredActivation = false;
public Builder(AnnotatedClusterState baselineState) {
this.baselineState = baselineState;
@@ -40,30 +47,59 @@ public class ClusterStateBundle {
}
public Builder bucketSpaces(Set<String> bucketSpaces) {
+ if (this.explicitDerivedStates != null) {
+ throw new IllegalStateException("Cannot set bucket spaces on Builder that already " +
+ "has explicit derived states set");
+ }
this.bucketSpaces = bucketSpaces;
return this;
}
public Builder bucketSpaces(String... bucketSpaces) {
- this.bucketSpaces = new TreeSet<>(Arrays.asList(bucketSpaces));
+ return bucketSpaces(new TreeSet<>(Arrays.asList(bucketSpaces)));
+ }
+
+ public Builder explicitDerivedStates(Map<String, AnnotatedClusterState> derivedStates) {
+ if (this.bucketSpaces != null || this.stateDeriver != null) {
+ throw new IllegalStateException("Cannot set explicitly derived states on Builder " +
+ "that already has bucket spaces or deriver set");
+ }
+ this.explicitDerivedStates = derivedStates;
+ return this;
+ }
+
+ public Builder deferredActivation(boolean deferred) {
+ this.deferredActivation = deferred;
return this;
}
public ClusterStateBundle deriveAndBuild() {
- if (stateDeriver == null || bucketSpaces == null || bucketSpaces.isEmpty()) {
- return ClusterStateBundle.ofBaselineOnly(baselineState);
+ if ((stateDeriver == null || bucketSpaces == null || bucketSpaces.isEmpty()) && explicitDerivedStates == null) {
+ return ClusterStateBundle.ofBaselineOnly(baselineState, deferredActivation);
+ }
+ Map<String, AnnotatedClusterState> derived;
+ if (explicitDerivedStates != null) {
+ derived = explicitDerivedStates;
+ } else {
+ derived = bucketSpaces.stream()
+ .collect(Collectors.toMap(
+ Function.identity(),
+ s -> stateDeriver.derivedFrom(baselineState, s)));
}
- Map<String, AnnotatedClusterState> derived = bucketSpaces.stream()
- .collect(Collectors.toMap(
- Function.identity(),
- s -> stateDeriver.derivedFrom(baselineState, s)));
- return new ClusterStateBundle(baselineState, derived);
+ return new ClusterStateBundle(baselineState, derived, deferredActivation);
}
}
private ClusterStateBundle(AnnotatedClusterState baselineState, Map<String, AnnotatedClusterState> derivedBucketSpaceStates) {
+ this(baselineState, derivedBucketSpaceStates, false);
+ }
+
+ private ClusterStateBundle(AnnotatedClusterState baselineState, Map<String,
+ AnnotatedClusterState> derivedBucketSpaceStates,
+ boolean deferredActivation) {
this.baselineState = baselineState;
this.derivedBucketSpaceStates = Collections.unmodifiableMap(derivedBucketSpaceStates);
+ this.deferredActivation = deferredActivation;
}
public static Builder builder(AnnotatedClusterState baselineState) {
@@ -74,6 +110,16 @@ public class ClusterStateBundle {
return new ClusterStateBundle(baselineState, derivedBucketSpaceStates);
}
+ public static ClusterStateBundle of(AnnotatedClusterState baselineState,
+ Map<String, AnnotatedClusterState> derivedBucketSpaceStates,
+ boolean deferredActivation) {
+ return new ClusterStateBundle(baselineState, derivedBucketSpaceStates, deferredActivation);
+ }
+
+ public static ClusterStateBundle ofBaselineOnly(AnnotatedClusterState baselineState, boolean deferredActivation) {
+ return new ClusterStateBundle(baselineState, Collections.emptyMap(), deferredActivation);
+ }
+
public static ClusterStateBundle ofBaselineOnly(AnnotatedClusterState baselineState) {
return new ClusterStateBundle(baselineState, Collections.emptyMap());
}
@@ -94,13 +140,15 @@ public class ClusterStateBundle {
return derivedBucketSpaceStates;
}
+ public boolean deferredActivation() { return this.deferredActivation; }
+
public ClusterStateBundle cloneWithMapper(Function<ClusterState, ClusterState> mapper) {
AnnotatedClusterState clonedBaseline = baselineState.cloneWithClusterState(
mapper.apply(baselineState.getClusterState().clone()));
Map<String, AnnotatedClusterState> clonedDerived = derivedBucketSpaceStates.entrySet().stream()
.collect(Collectors.toMap(e -> e.getKey(), e -> e.getValue().cloneWithClusterState(
mapper.apply(e.getValue().getClusterState().clone()))));
- return new ClusterStateBundle(clonedBaseline, clonedDerived);
+ return new ClusterStateBundle(clonedBaseline, clonedDerived, deferredActivation);
}
public ClusterStateBundle clonedWithVersionSet(int version) {
@@ -127,12 +175,14 @@ public class ClusterStateBundle {
@Override
public String toString() {
if (derivedBucketSpaceStates.isEmpty()) {
- return String.format("ClusterStateBundle('%s')", baselineState);
+ return String.format("ClusterStateBundle('%s'%s)", baselineState,
+ deferredActivation ? " (deferred activation)" : "");
}
Map<String, AnnotatedClusterState> orderedStates = new TreeMap<>(derivedBucketSpaceStates);
- return String.format("ClusterStateBundle('%s', %s)", baselineState, orderedStates.entrySet().stream()
+ return String.format("ClusterStateBundle('%s', %s%s)", baselineState, orderedStates.entrySet().stream()
.map(e -> String.format("%s '%s'", e.getKey(), e.getValue()))
- .collect(Collectors.joining(", ")));
+ .collect(Collectors.joining(", ")),
+ deferredActivation ? " (deferred activation)" : "");
}
@Override
@@ -140,13 +190,13 @@ public class ClusterStateBundle {
if (this == o) return true;
if (o == null || getClass() != o.getClass()) return false;
ClusterStateBundle that = (ClusterStateBundle) o;
- return Objects.equals(baselineState, that.baselineState) &&
+ return deferredActivation == that.deferredActivation &&
+ Objects.equals(baselineState, that.baselineState) &&
Objects.equals(derivedBucketSpaceStates, that.derivedBucketSpaceStates);
}
@Override
public int hashCode() {
- return Objects.hash(baselineState, derivedBucketSpaceStates);
+ return Objects.hash(baselineState, derivedBucketSpaceStates, deferredActivation);
}
-
}
diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ClusterStateVersionSpecificRequest.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ClusterStateVersionSpecificRequest.java
new file mode 100644
index 00000000000..9e8abc0608e
--- /dev/null
+++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ClusterStateVersionSpecificRequest.java
@@ -0,0 +1,67 @@
+// Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.vespa.clustercontroller.core;
+
+/**
+ * Base class for distributor/content node node RPC requests that are bound
+ * to a particular cluster state version.
+ */
+public abstract class ClusterStateVersionSpecificRequest {
+
+ private final NodeInfo nodeInfo;
+ private final int clusterStateVersion;
+ private Reply reply;
+
+ public ClusterStateVersionSpecificRequest(NodeInfo nodeInfo, int clusterStateVersion) {
+ this.nodeInfo = nodeInfo;
+ this.clusterStateVersion = clusterStateVersion;
+ }
+
+ public NodeInfo getNodeInfo() { return nodeInfo; }
+
+ public int getClusterStateVersion() { return clusterStateVersion; }
+
+ public void setReply(Reply reply) { this.reply = reply; }
+
+ public Reply getReply() { return reply; }
+
+ public static class Reply {
+
+ final int returnCode;
+ final String returnMessage;
+ final int actualVersion;
+
+ public Reply() {
+ this(0, null);
+ }
+
+ public Reply(int returnCode, String returnMessage) {
+ this.returnCode = returnCode;
+ this.returnMessage = returnMessage;
+ this.actualVersion = -1;
+ }
+
+ private Reply(int actualVersion) {
+ this.returnCode = 0;
+ this.returnMessage = null;
+ this.actualVersion = actualVersion;
+ }
+
+ public static Reply withActualVersion(int version) {
+ return new Reply(version);
+ }
+
+ /** Returns whether this is an error response */
+ public boolean isError() { return returnCode != 0; }
+
+ /** Returns the return code, which is 0 if this request was successful */
+ public int getReturnCode() { return returnCode; }
+
+ /** Returns the message returned, or null if none */
+ public String getReturnMessage() { return returnMessage; }
+
+ /** Returns actual cluster state version active on node, or -1 if reply does not contain this information */
+ public int getActualVersion() { return actualVersion; }
+
+ }
+
+}
diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/Communicator.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/Communicator.java
index 450513343b0..900eee54cd3 100644
--- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/Communicator.java
+++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/Communicator.java
@@ -20,6 +20,8 @@ public interface Communicator {
void setSystemState(ClusterStateBundle states, NodeInfo node, Waiter<SetClusterStateRequest> waiter);
+ void activateClusterStateVersion(int clusterStateVersion, NodeInfo node, Waiter<ActivateClusterStateVersionRequest> waiter);
+
void shutdown();
}
diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ContentCluster.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ContentCluster.java
index 6ff297b4a31..43412311436 100644
--- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ContentCluster.java
+++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ContentCluster.java
@@ -45,7 +45,7 @@ public class ContentCluster {
final VdsClusterHtmlRendrer vdsClusterHtmlRendrer,
final StringBuilder sb,
final Timer timer,
- final ClusterState state,
+ final ClusterStateBundle state,
final ClusterStatsAggregator statsAggregator,
final Distribution distribution,
final FleetControllerOptions options,
diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetController.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetController.java
index 005bf7971a5..ba35243c14d 100644
--- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetController.java
+++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetController.java
@@ -76,6 +76,7 @@ public class FleetController implements NodeStateOrHostInfoChangeHandler, NodeAd
private boolean waitingForCycle = false;
private StatusPageServer.PatternRequestRouter statusRequestRouter = new StatusPageServer.PatternRequestRouter();
private final List<ClusterStateBundle> newStates = new ArrayList<>();
+ private final List<ClusterStateBundle> convergedStates = new ArrayList<>();
private long configGeneration = -1;
private long nextConfigGeneration = -1;
private Queue<RemoteClusterControllerTask> remoteTasks = new LinkedList<>();
@@ -253,6 +254,10 @@ public class FleetController implements NodeStateOrHostInfoChangeHandler, NodeAd
throw new NullPointerException("Cluster state should never be null at this point");
}
listener.handleNewPublishedState(ClusterStateBundle.ofBaselineOnly(AnnotatedClusterState.withoutAnnotations(state)));
+ ClusterStateBundle convergedState = systemStateBroadcaster.getLastClusterStateBundleConverged();
+ if (convergedState != null) {
+ listener.handleStateConvergedInCluster(convergedState);
+ }
}
public FleetControllerOptions getOptions() {
@@ -435,9 +440,11 @@ public class FleetController implements NodeStateOrHostInfoChangeHandler, NodeAd
/** Called when all distributors have acked newest cluster state version. */
public void handleAllDistributorsInSync(DatabaseHandler database, DatabaseHandler.Context context) throws InterruptedException {
Set<ConfiguredNode> nodes = new HashSet<>(cluster.clusterInfo().getConfiguredNodes().values());
- ClusterState currentState = stateVersionTracker.getVersionedClusterState();
- log.fine(() -> String.format("All distributors have ACKed cluster state version %d", currentState.getVersion()));
- stateChangeHandler.handleAllDistributorsInSync(currentState, nodes, database, context);
+ // TODO wouldn't it be better to always get bundle information from the state broadcaster?
+ var currentBundle = stateVersionTracker.getVersionedClusterStateBundle();
+ log.fine(() -> String.format("All distributors have ACKed cluster state version %d", currentBundle.getVersion()));
+ stateChangeHandler.handleAllDistributorsInSync(currentBundle.getBaselineClusterState(), nodes, database, context);
+ convergedStates.add(currentBundle);
}
private boolean changesConfiguredNodeSet(Collection<ConfiguredNode> newNodes) {
@@ -666,12 +673,14 @@ public class FleetController implements NodeStateOrHostInfoChangeHandler, NodeAd
// Reset timer to only see warning once.
firstAllowedStateBroadcast = currentTime;
}
- sentAny = systemStateBroadcaster.broadcastNewState(databaseContext, communicator);
+ sentAny = systemStateBroadcaster.broadcastNewStateBundleIfRequired(databaseContext, communicator);
if (sentAny) {
// FIXME won't this inhibit resending to unresponsive nodes?
nextStateSendTime = currentTime + options.minTimeBetweenNewSystemStates;
}
}
+ // Always allow activations if we've already broadcasted a state
+ sentAny |= systemStateBroadcaster.broadcastStateActivationsIfRequired(databaseContext, communicator);
return sentAny;
}
@@ -679,13 +688,23 @@ public class FleetController implements NodeStateOrHostInfoChangeHandler, NodeAd
if ( ! newStates.isEmpty()) {
synchronized (systemStateListeners) {
for (ClusterStateBundle stateBundle : newStates) {
- for(SystemStateListener listener : systemStateListeners) {
+ for (SystemStateListener listener : systemStateListeners) {
listener.handleNewPublishedState(stateBundle);
}
}
newStates.clear();
}
}
+ if ( ! convergedStates.isEmpty()) {
+ synchronized (systemStateListeners) {
+ for (ClusterStateBundle stateBundle : convergedStates) {
+ for (SystemStateListener listener : systemStateListeners) {
+ listener.handleStateConvergedInCluster(stateBundle);
+ }
+ }
+ convergedStates.clear();
+ }
+ }
}
private boolean processNextQueuedRemoteTask() {
@@ -822,6 +841,7 @@ public class FleetController implements NodeStateOrHostInfoChangeHandler, NodeAd
final ClusterStateBundle candidateBundle = ClusterStateBundle.builder(candidate)
.bucketSpaces(configuredBucketSpaces)
.stateDeriver(createBucketSpaceStateDeriver())
+ .deferredActivation(options.enableTwoPhaseClusterStateActivation)
.deriveAndBuild();
stateVersionTracker.updateLatestCandidateStateBundle(candidateBundle);
invokeCandidateStateListeners(candidateBundle);
@@ -1046,7 +1066,7 @@ public class FleetController implements NodeStateOrHostInfoChangeHandler, NodeAd
while (true) {
int ackedNodes = 0;
for (NodeInfo node : cluster.getNodeInfo()) {
- if (node.getSystemStateVersionAcknowledged() >= version) {
+ if (node.getClusterStateVersionBundleAcknowledged() >= version) {
++ackedNodes;
}
}
diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetControllerOptions.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetControllerOptions.java
index e069dde1901..f49b626d347 100644
--- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetControllerOptions.java
+++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetControllerOptions.java
@@ -121,6 +121,8 @@ public class FleetControllerOptions implements Cloneable {
public boolean clusterHasGlobalDocumentTypes = false;
+ public boolean enableTwoPhaseClusterStateActivation = false;
+
// TODO: Choose a default value
public double minMergeCompletionRatio = 1.0;
@@ -231,6 +233,7 @@ public class FleetControllerOptions implements Cloneable {
sb.append("<tr><td><nobr>Wanted distribution bits</nobr></td><td align=\"right\">").append(distributionBits).append("</td></tr>");
sb.append("<tr><td><nobr>Max deferred task version wait time</nobr></td><td align=\"right\">").append(maxDeferredTaskVersionWaitTime.toMillis()).append("ms</td></tr>");
sb.append("<tr><td><nobr>Cluster has global document types configured</nobr></td><td align=\"right\">").append(clusterHasGlobalDocumentTypes).append("</td></tr>");
+ sb.append("<tr><td><nobr>Enable 2-phase cluster state activation protocol</nobr></td><td align=\"right\">").append(enableTwoPhaseClusterStateActivation).append("</td></tr>");
sb.append("</table>");
}
diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeInfo.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeInfo.java
index 54cf2dad00a..82d13e2d9ef 100644
--- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeInfo.java
+++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeInfo.java
@@ -14,7 +14,6 @@ import java.io.PrintWriter;
import java.io.StringWriter;
import java.util.LinkedList;
import java.util.List;
-import java.util.Map;
import java.util.TreeMap;
import java.util.logging.Logger;
@@ -74,11 +73,16 @@ abstract public class NodeInfo implements Comparable<NodeInfo> {
* Version 1 is for the getnodestate2 command ((legacy, not supported).
* Version 2 is for the getnodestate3 command
* Version 3 adds support for setdistributionstates
+ * Version 4 adds support for explicit cluster state version bundle activation
*/
private int version;
- private Map<Integer, ClusterState> systemStateVersionSent = new TreeMap<>();
- private ClusterState systemStateVersionAcknowledged;
+ // Mapping of cluster state version -> cluster state bundle instance
+ private TreeMap<Integer, ClusterStateBundle> clusterStateVersionBundleSent = new TreeMap<>();
+ private ClusterStateBundle clusterStateVersionBundleAcknowledged;
+
+ private int clusterStateVersionActivationSent = -1;
+ private int clusterStateVersionActivationAcked = -1;
/**
* When a node goes from an up state to a down state, update this flag with the start timestamp the node had before going down.
* The cluster state broadcaster will use this to identify whether distributors have restarted.
@@ -102,7 +106,9 @@ abstract public class NodeInfo implements Comparable<NodeInfo> {
// NOTE: See update(node) below
NodeInfo(ContentCluster cluster, Node n, boolean configuredRetired, String rpcAddress, Distribution distribution) {
- if (cluster == null) throw new IllegalArgumentException("Cluster not set");
+ if (cluster == null) {
+ throw new IllegalArgumentException("Cluster not set");
+ }
reportedState = new NodeState(n.getType(), State.DOWN);
wantedState = new NodeState(n.getType(), State.UP);
this.cluster = cluster;
@@ -238,7 +244,7 @@ abstract public class NodeInfo implements Comparable<NodeInfo> {
public ContentCluster getCluster() { return cluster; }
- /** Returns true if the node is currentl registered in slobrok */
+ /** Returns true if the node is currently registered in slobrok */
// FIXME why is this called "isRpcAddressOutdated" then???
public boolean isRpcAddressOutdated() { return lastSeenInSlobrok != null; }
@@ -353,12 +359,13 @@ abstract public class NodeInfo implements Comparable<NodeInfo> {
/** Sets the wanted state. The wanted state is taken as UP if a null argument is given */
public void setWantedState(NodeState state) {
- if (state == null)
+ if (state == null) {
state = new NodeState(node.getType(), State.UP);
+ }
NodeState newWanted = new NodeState(node.getType(), state.getState());
newWanted.setDescription(state.getDescription());
if (!newWanted.equals(state)) {
- try{
+ try {
throw new Exception();
} catch (Exception e) {
StringWriter sw = new StringWriter();
@@ -408,50 +415,70 @@ abstract public class NodeInfo implements Comparable<NodeInfo> {
}
public int getVersion() { return version; }
- public int getConnectionVersion() { return connectionVersion; }
- public void setConnectionVersion(int version) { connectionVersion = version; }
public ClusterState getNewestSystemStateSent() {
- ClusterState last = null;
- for (ClusterState s : systemStateVersionSent.values()) {
- if (last == null || last.getVersion() < s.getVersion()) {
- last = s;
- }
+ if (clusterStateVersionBundleSent.isEmpty()) {
+ return null;
}
- return last;
+ return clusterStateVersionBundleSent.lastEntry().getValue().getBaselineClusterState();
}
public int getNewestSystemStateVersionSent() {
ClusterState last = getNewestSystemStateSent();
return last == null ? -1 : last.getVersion();
}
- public int getSystemStateVersionAcknowledged() {
- return (systemStateVersionAcknowledged == null ? -1 : systemStateVersionAcknowledged.getVersion());
+
+ public int getClusterStateVersionBundleAcknowledged() {
+ return (clusterStateVersionBundleAcknowledged == null ? -1 : clusterStateVersionBundleAcknowledged.getVersion());
}
- public void setSystemStateVersionSent(ClusterState state) {
- if (state == null) throw new Error("Should not clear info for last version sent");
- if (systemStateVersionSent.containsKey(state.getVersion())) {
- throw new IllegalStateException("We have already sent cluster state version " + state.getVersion() + " to " + node);
+ public void setClusterStateVersionBundleSent(ClusterStateBundle stateBundle) {
+ if (stateBundle == null) {
+ throw new Error("Should not clear info for last version sent");
}
- systemStateVersionSent.put(state.getVersion(), state);
+ if (clusterStateVersionBundleSent.containsKey(stateBundle.getVersion())) {
+ throw new IllegalStateException("We have already sent cluster state version " + stateBundle.getVersion() + " to " + node);
+ }
+ clusterStateVersionBundleSent.put(stateBundle.getVersion(), stateBundle);
}
- public void setSystemStateVersionAcknowledged(Integer version, boolean success) {
- if (version == null) throw new Error("Should not clear info for last version acked");
- if (!systemStateVersionSent.containsKey(version)) {
+ public void setClusterStateBundleVersionAcknowledged(Integer version, boolean success) {
+ if (version == null) {
+ throw new Error("Should not clear info for last version acked");
+ }
+ if (!clusterStateVersionBundleSent.containsKey(version)) {
throw new IllegalStateException("Got response for cluster state " + version + " which is not tracked as pending for node " + node);
}
- ClusterState state = systemStateVersionSent.remove(version);
- if (success && (systemStateVersionAcknowledged == null || systemStateVersionAcknowledged.getVersion() < state.getVersion())) {
- systemStateVersionAcknowledged = state;
+ var stateBundle = clusterStateVersionBundleSent.remove(version);
+ if (success && (clusterStateVersionBundleAcknowledged == null || clusterStateVersionBundleAcknowledged.getVersion() < stateBundle.getVersion())) {
+ clusterStateVersionBundleAcknowledged = stateBundle;
if (wentDownWithStartTime != 0
- && (wentDownAtClusterState == null || wentDownAtClusterState.getVersion() < state.getVersion())
- && !state.getNodeState(node).getState().oneOf("dsm"))
+ && (wentDownAtClusterState == null || wentDownAtClusterState.getVersion() < stateBundle.getVersion())
+ && !stateBundle.getBaselineClusterState().getNodeState(node).getState().oneOf("dsm"))
{
- log.log(LogLevel.DEBUG, "Clearing going down timestamp of node " + node + " after receiving ack of cluster state " + state);
+ log.log(LogLevel.DEBUG, () -> String.format("Clearing going down timestamp of node %s after " +
+ "receiving ack of cluster state bundle %s", node, stateBundle));
wentDownWithStartTime = 0;
}
}
}
+ public void setClusterStateVersionActivationSent(int version) {
+ clusterStateVersionActivationSent = version;
+ }
+ public int getClusterStateVersionActivationSent() {
+ return clusterStateVersionActivationSent;
+ }
+
+ public int getClusterStateVersionActivationAcked() {
+ return clusterStateVersionActivationAcked;
+ }
+ public void setSystemStateVersionActivationAcked(Integer version, boolean success) {
+ if (success && (version > clusterStateVersionActivationAcked)) {
+ clusterStateVersionActivationAcked = version;
+ } else if (!success) {
+ clusterStateVersionActivationSent = -1; // Trigger resend
+ }
+ }
+
+
public void setHostInfo(HostInfo hostInfo) {
// Note: This will blank out any hostInfo we already had, if the parsing fails.
// This is intentional, to make sure we're never left with stale data.
diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/SetClusterStateRequest.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/SetClusterStateRequest.java
index 836876b5642..d4e79a4f2b2 100644
--- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/SetClusterStateRequest.java
+++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/SetClusterStateRequest.java
@@ -1,48 +1,10 @@
// Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.vespa.clustercontroller.core;
-public abstract class SetClusterStateRequest {
-
- private final NodeInfo nodeInfo;
- private final int systemStateVersion;
- private Reply reply;
-
- public SetClusterStateRequest(NodeInfo nodeInfo, int systemStateVersion) {
- this.nodeInfo = nodeInfo;
- this.systemStateVersion = systemStateVersion;
- }
-
- public NodeInfo getNodeInfo() { return nodeInfo; }
-
- public int getSystemStateVersion() { return systemStateVersion; }
-
- public void setReply(Reply reply) { this.reply = reply; }
-
- public Reply getReply() { return reply; }
-
- public static class Reply {
-
- final int returnCode;
- final String returnMessage;
-
- public Reply() {
- this(0, null);
- }
-
- public Reply(int returnCode, String returnMessage) {
- this.returnCode = returnCode;
- this.returnMessage = returnMessage;
- }
-
- /** Returns whether this is an error response */
- public boolean isError() { return returnCode != 0; }
-
- /** Returns the return code, which is 0 if this request was successful */
- public int getReturnCode() { return returnCode; }
-
- /** Returns the message returned, or null if none */
- public String getReturnMessage() { return returnMessage; }
+public abstract class SetClusterStateRequest extends ClusterStateVersionSpecificRequest {
+ public SetClusterStateRequest(NodeInfo nodeInfo, int clusterStateVersion) {
+ super(nodeInfo, clusterStateVersion);
}
}
diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/SystemStateBroadcaster.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/SystemStateBroadcaster.java
index 629800fb13c..5ecb57a1c76 100644
--- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/SystemStateBroadcaster.java
+++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/SystemStateBroadcaster.java
@@ -1,6 +1,7 @@
// Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.vespa.clustercontroller.core;
+import com.yahoo.jrt.ErrorCode;
import com.yahoo.log.LogLevel;
import com.yahoo.vdslib.state.*;
import com.yahoo.vespa.clustercontroller.core.database.DatabaseHandler;
@@ -19,13 +20,18 @@ public class SystemStateBroadcaster {
private final Timer timer;
private final Object monitor;
private ClusterStateBundle clusterStateBundle;
- private final List<SetClusterStateRequest> replies = new LinkedList<>();
+ private final List<SetClusterStateRequest> setClusterStateReplies = new LinkedList<>();
+ private final List<ActivateClusterStateVersionRequest> activateClusterStateVersionReplies = new LinkedList<>();
private final static long minTimeBetweenNodeErrorLogging = 10 * 60 * 1000;
private final Map<Node, Long> lastErrorReported = new TreeMap<>();
- private int lastClusterStateInSync = 0;
- private final ClusterStateWaiter waiter = new ClusterStateWaiter();
+ private int lastStateVersionBundleAcked = 0;
+ private int lastClusterStateVersionConverged = 0;
+ private ClusterStateBundle lastClusterStateBundleConverged;
+
+ private final SetClusterStateWaiter setClusterStateWaiter = new SetClusterStateWaiter();
+ private final ActivateClusterStateVersionWaiter activateClusterStateVersionWaiter = new ActivateClusterStateVersionWaiter();
public SystemStateBroadcaster(Timer timer, Object monitor) {
this.timer = timer;
@@ -52,95 +58,204 @@ public class SystemStateBroadcaster {
return clusterStateBundle;
}
+ public ClusterStateBundle getLastClusterStateBundleConverged() {
+ return lastClusterStateBundleConverged;
+ }
+
private void reportNodeError(boolean nodeOk, NodeInfo info, String message) {
long time = timer.getCurrentTimeInMillis();
Long lastReported = lastErrorReported.get(info.getNode());
boolean alreadySeen = (lastReported != null && time - lastReported < minTimeBetweenNodeErrorLogging);
- log.log(nodeOk && !alreadySeen ? LogLevel.WARNING : LogLevel.DEBUG, message);
- if (!alreadySeen) lastErrorReported.put(info.getNode(), time);
+ log.log((nodeOk && !alreadySeen) ? LogLevel.WARNING : LogLevel.DEBUG, message);
+ if (!alreadySeen) {
+ lastErrorReported.put(info.getNode(), time);
+ }
}
public boolean processResponses() {
boolean anyResponsesFound = false;
synchronized(monitor) {
- for(SetClusterStateRequest req : replies) {
- anyResponsesFound = true;
-
- NodeInfo info = req.getNodeInfo();
- boolean nodeOk = info.getReportedState().getState().oneOf("uir");
- int version = req.getSystemStateVersion();
-
- if (req.getReply().isError()) {
- info.setSystemStateVersionAcknowledged(version, false);
- if (req.getReply().getReturnCode() != Communicator.TRANSIENT_ERROR) {
- if (info.getNewestSystemStateVersionSent() == version) {
- reportNodeError(nodeOk, info,
- "Got error response " + req.getReply().getReturnCode() + ": " + req.getReply().getReturnMessage()
- + " from " + info + " setsystemstate request.");
- }
- }
+ anyResponsesFound = !setClusterStateReplies.isEmpty() || !activateClusterStateVersionReplies.isEmpty();
+ processSetClusterStateResponses();
+ processActivateClusterStateVersionResponses();
+ }
+ return anyResponsesFound;
+ }
+
+ private void processActivateClusterStateVersionResponses() {
+ for (var req : activateClusterStateVersionReplies) {
+ NodeInfo info = req.getNodeInfo();
+ int version = req.getClusterStateVersion();
+ boolean success = true;
+ var reply = req.getReply();
+ if (reply.isError()) {
+ // NO_SUCH_METHOD implies node is on a version that does not understand explicit activations
+ // and it has already merrily started using the state version. Treat as if it had been ACKed.
+ if (reply.getReturnCode() != ErrorCode.NO_SUCH_METHOD) {
+ log.log(LogLevel.DEBUG, () -> String.format("Activation NACK for node %s with version %d, message %s",
+ info, version, reply.getReturnMessage()));
+ success = false;
} else {
- info.setSystemStateVersionAcknowledged(version, true);
- log.log(LogLevel.DEBUG, "Node " + info + " acked system state version " + version + ".");
- lastErrorReported.remove(info.getNode());
+ log.log(LogLevel.DEBUG, () -> String.format("Node %s did not understand state activation RPC; " +
+ "implicitly treating state %d as activated on node", info, version));
}
+ } else if (reply.getActualVersion() != version) {
+ boolean nodeOk = nodeReportsSelfAsAvailable(info);
+ // Avoid spamming the logs since this will happen on all resends until (presumably) the controller
+ // loses election status.
+ // TODO this should trigger a loss of current controller's leadership!
+ reportNodeError(nodeOk, info, String.format("Activation of version %d did not take effect, node %s " +
+ "reports it has an actual pending version of %d. Racing with another controller?",
+ version, info, reply.getActualVersion()));
+ success = false;
+ } else {
+ log.log(LogLevel.DEBUG, () -> String.format("Node %s reports successful activation of state " +
+ "version %d", info, version));
}
- replies.clear();
+ info.setSystemStateVersionActivationAcked(version, success);
+ // TODO we currently don't invoke reportNodeError here.. We assume that node errors will be reported
+ // as part of processSetClusterStateResponses anyway, but can add it here as well if deemed necessary.
}
- return anyResponsesFound;
+ activateClusterStateVersionReplies.clear();
}
- private boolean nodeNeedsClusterState(NodeInfo node) {
- if (node.getSystemStateVersionAcknowledged() == clusterStateBundle.getVersion()) {
- return false; // No point in sending if node already has updated system state
+ private static boolean nodeReportsSelfAsAvailable(NodeInfo info) {
+ return info.getReportedState().getState().oneOf("uir");
+ }
+
+ private void processSetClusterStateResponses() {
+ for (SetClusterStateRequest req : setClusterStateReplies) {
+ NodeInfo info = req.getNodeInfo();
+ int version = req.getClusterStateVersion();
+
+ if (req.getReply().isError()) {
+ info.setClusterStateBundleVersionAcknowledged(version, false);
+ if (req.getReply().getReturnCode() != Communicator.TRANSIENT_ERROR) {
+ if (info.getNewestSystemStateVersionSent() == version) {
+ boolean nodeOk = nodeReportsSelfAsAvailable(info);
+ reportNodeError(nodeOk, info,
+ String.format("Got error response %d: %s from %s setdistributionstates request.",
+ req.getReply().getReturnCode(), req.getReply().getReturnMessage(), info));
+ }
+ }
+ } else {
+ info.setClusterStateBundleVersionAcknowledged(version, true);
+ log.log(LogLevel.DEBUG, () -> String.format("Node %s ACKed system state version %d.", info, version));
+ lastErrorReported.remove(info.getNode());
+ }
}
+ setClusterStateReplies.clear();
+ }
+
+ private static boolean nodeIsReachable(NodeInfo node) {
if (node.getRpcAddress() == null || node.isRpcAddressOutdated()) {
return false; // Can't set state on nodes we don't know where are
}
if (node.getReportedState().getState() == State.MAINTENANCE ||
- node.getReportedState().getState() == State.DOWN ||
- node.getReportedState().getState() == State.STOPPING)
+ node.getReportedState().getState() == State.DOWN ||
+ node.getReportedState().getState() == State.STOPPING)
{
return false; // No point in sending system state to nodes that can't receive messages or don't want them
}
return true;
}
+ private boolean nodeNeedsClusterStateBundle(NodeInfo node) {
+ if (node.getClusterStateVersionBundleAcknowledged() == clusterStateBundle.getVersion()) {
+ return false; // No point in sending if node already has updated system state
+ }
+ return nodeIsReachable(node);
+ }
+
+ private boolean nodeNeedsClusterStateActivation(NodeInfo node) {
+ if (node.getClusterStateVersionActivationAcked() == clusterStateBundle.getVersion()) {
+ return false; // No point in sending if node already has activated cluster state version
+ }
+ return nodeIsReachable(node);
+ }
+
private List<NodeInfo> resolveStateVersionSendSet(DatabaseHandler.Context dbContext) {
return dbContext.getCluster().getNodeInfo().stream()
- .filter(this::nodeNeedsClusterState)
- .filter(node -> !newestStateAlreadySentToNode(node))
+ .filter(this::nodeNeedsClusterStateBundle)
+ .filter(node -> !newestStateBundleAlreadySentToNode(node))
+ .collect(Collectors.toList());
+ }
+
+ // Precondition: no nodes in the cluster need to receive the current cluster state version bundle
+ private List<NodeInfo> resolveStateActivationSendSet(DatabaseHandler.Context dbContext) {
+ return dbContext.getCluster().getNodeInfo().stream()
+ .filter(this::nodeNeedsClusterStateActivation)
+ .filter(node -> !newestStateActivationAlreadySentToNode(node))
.collect(Collectors.toList());
}
- private boolean newestStateAlreadySentToNode(NodeInfo node) {
+ private boolean newestStateBundleAlreadySentToNode(NodeInfo node) {
return (node.getNewestSystemStateVersionSent() == clusterStateBundle.getVersion());
}
+ private boolean newestStateActivationAlreadySentToNode(NodeInfo node) {
+ return (node.getClusterStateVersionActivationSent() == clusterStateBundle.getVersion());
+ }
+
/**
- * Checks if all distributor nodes have ACKed the most recent cluster state. Iff this
- * is the case, triggers handleAllDistributorsInSync() on the provided FleetController
+ * Checks if all distributor nodes have ACKed (and activated) the most recent cluster state.
+ * Iff this is the case, triggers handleAllDistributorsInSync() on the provided FleetController
* object and updates the broadcaster's last known in-sync cluster state version.
*/
void checkIfClusterStateIsAckedByAllDistributors(DatabaseHandler database,
- DatabaseHandler.Context dbContext,
- FleetController fleetController) throws InterruptedException {
- if ((clusterStateBundle == null) || (lastClusterStateInSync == clusterStateBundle.getVersion())) {
+ DatabaseHandler.Context dbContext,
+ FleetController fleetController) throws InterruptedException {
+ if ((clusterStateBundle == null) || currentClusterStateIsConverged()) {
return; // Nothing to do for the current state
}
final int currentStateVersion = clusterStateBundle.getVersion();
- boolean anyOutdatedDistributorNodes = dbContext.getCluster().getNodeInfo().stream()
+ boolean anyDistributorsNeedStateBundle = dbContext.getCluster().getNodeInfo().stream()
.filter(NodeInfo::isDistributor)
- .anyMatch(this::nodeNeedsClusterState);
+ .anyMatch(this::nodeNeedsClusterStateBundle);
- if (!anyOutdatedDistributorNodes && (currentStateVersion > lastClusterStateInSync)) {
- log.log(LogLevel.DEBUG, "All distributors have newest clusterstate, updating start timestamps in zookeeper and clearing them from cluster state");
- lastClusterStateInSync = currentStateVersion;
- fleetController.handleAllDistributorsInSync(database, dbContext);
+ if (!anyDistributorsNeedStateBundle && (currentStateVersion > lastStateVersionBundleAcked)) {
+ markCurrentClusterStateBundleAsReceivedByAllDistributors();
+ if (clusterStateBundle.deferredActivation()) {
+ log.log(LogLevel.DEBUG, () -> String.format("All distributors have ACKed cluster state " +
+ "version %d, sending activation", currentStateVersion));
+ } else {
+ markCurrentClusterStateAsConverged(database, dbContext, fleetController);
+ }
+ return; // Either converged (no two-phase) or activations must be sent before we can continue.
+ }
+
+ if (anyDistributorsNeedStateBundle || !clusterStateBundle.deferredActivation()) {
+ return;
}
+
+ boolean anyDistributorsNeedActivation = dbContext.getCluster().getNodeInfo().stream()
+ .filter(NodeInfo::isDistributor)
+ .anyMatch(this::nodeNeedsClusterStateActivation);
+
+ if (!anyDistributorsNeedActivation && (currentStateVersion > lastClusterStateVersionConverged)) {
+ markCurrentClusterStateAsConverged(database, dbContext, fleetController);
+ } else {
+ log.log(LogLevel.DEBUG, () -> String.format("distributors still need activation in state %d (last converged: %d)",
+ currentStateVersion, lastClusterStateVersionConverged));
+ }
+ }
+
+ private void markCurrentClusterStateBundleAsReceivedByAllDistributors() {
+ lastStateVersionBundleAcked = clusterStateBundle.getVersion();
}
- public boolean broadcastNewState(DatabaseHandler.Context dbContext, Communicator communicator) {
+ private void markCurrentClusterStateAsConverged(DatabaseHandler database, DatabaseHandler.Context dbContext, FleetController fleetController) throws InterruptedException {
+ log.log(LogLevel.DEBUG, "All distributors have newest clusterstate, updating start timestamps in zookeeper and clearing them from cluster state");
+ lastClusterStateVersionConverged = clusterStateBundle.getVersion();
+ lastClusterStateBundleConverged = clusterStateBundle;
+ fleetController.handleAllDistributorsInSync(database, dbContext);
+ }
+
+ private boolean currentClusterStateIsConverged() {
+ return lastClusterStateVersionConverged == clusterStateBundle.getVersion();
+ }
+
+ public boolean broadcastNewStateBundleIfRequired(DatabaseHandler.Context dbContext, Communicator communicator) {
if (clusterStateBundle == null) {
return false;
}
@@ -157,20 +272,44 @@ public class SystemStateBroadcaster {
if (nodeNeedsToObserveStartupTimestamps(node)) {
// TODO this is the same for all nodes, compute only once
ClusterStateBundle modifiedBundle = clusterStateBundle.cloneWithMapper(state -> buildModifiedClusterState(state, dbContext));
- log.log(LogLevel.DEBUG, "Sending modified cluster state version " + baselineState.getVersion()
- + " to node " + node + ": " + modifiedBundle);
- communicator.setSystemState(modifiedBundle, node, waiter);
+ log.log(LogLevel.DEBUG, () -> String.format("Sending modified cluster state version %d" +
+ " to node %s: %s", baselineState.getVersion(), node, modifiedBundle));
+ communicator.setSystemState(modifiedBundle, node, setClusterStateWaiter);
} else {
- log.log(LogLevel.DEBUG, "Sending system state version " + baselineState.getVersion() + " to node " + node
- + ". (went down time " + node.getWentDownWithStartTime() + ", node start time " + node.getStartTimestamp() + ")");
- communicator.setSystemState(clusterStateBundle, node, waiter);
+ log.log(LogLevel.DEBUG, () -> String.format("Sending system state version %d to node %s. " +
+ "(went down time %d, node start time %d)", baselineState.getVersion(), node,
+ node.getWentDownWithStartTime(), node.getStartTimestamp()));
+ communicator.setSystemState(clusterStateBundle, node, setClusterStateWaiter);
}
}
return !recipients.isEmpty();
}
- public int lastClusterStateVersionInSync() { return lastClusterStateInSync; }
+ public boolean broadcastStateActivationsIfRequired(DatabaseHandler.Context dbContext, Communicator communicator) {
+ if (clusterStateBundle == null || !clusterStateBundle.getBaselineClusterState().isOfficial()) {
+ return false;
+ }
+
+ if (!clusterStateBundle.deferredActivation() || !allDistributorsHaveAckedSentClusterStateBundle()) {
+ return false;
+ }
+
+ var recipients = resolveStateActivationSendSet(dbContext);
+ for (NodeInfo node : recipients) {
+ log.log(LogLevel.DEBUG, () -> String.format("Sending cluster state activation to node %s for version %d",
+ node, clusterStateBundle.getVersion()));
+ communicator.activateClusterStateVersion(clusterStateBundle.getVersion(), node, activateClusterStateVersionWaiter);
+ }
+
+ return !recipients.isEmpty();
+ }
+
+ private boolean allDistributorsHaveAckedSentClusterStateBundle() {
+ return (lastStateVersionBundleAcked == clusterStateBundle.getVersion());
+ }
+
+ public int lastClusterStateVersionInSync() { return lastClusterStateVersionConverged; }
private static boolean nodeNeedsToObserveStartupTimestamps(NodeInfo node) {
return node.getStartTimestamp() != 0 && node.getWentDownWithStartTime() == node.getStartTimestamp();
@@ -188,11 +327,20 @@ public class SystemStateBroadcaster {
return newState;
}
- private class ClusterStateWaiter implements Communicator.Waiter<SetClusterStateRequest> {
+ private class SetClusterStateWaiter implements Communicator.Waiter<SetClusterStateRequest> {
@Override
public void done(SetClusterStateRequest reply) {
synchronized (monitor) {
- replies.add(reply);
+ setClusterStateReplies.add(reply);
+ }
+ }
+ }
+
+ private class ActivateClusterStateVersionWaiter implements Communicator.Waiter<ActivateClusterStateVersionRequest> {
+ @Override
+ public void done(ActivateClusterStateVersionRequest reply) {
+ synchronized (monitor) {
+ activateClusterStateVersionReplies.add(reply);
}
}
}
diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/listeners/SystemStateListener.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/listeners/SystemStateListener.java
index 764bb3a0d92..a0d53e8c93e 100644
--- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/listeners/SystemStateListener.java
+++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/listeners/SystemStateListener.java
@@ -8,6 +8,14 @@ public interface SystemStateListener {
// TODO consider rename to bundle
void handleNewPublishedState(ClusterStateBundle states);
+ /**
+ * Invoked at the edge when all pending cluster state bundles and version activations
+ * have been successfully ACKed by all distributors in the cluster.
+ *
+ * @param states bundle that has converged across all distributors
+ */
+ default void handleStateConvergedInCluster(ClusterStateBundle states) {}
+
default void handleNewCandidateState(ClusterStateBundle states) {}
}
diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/rpc/RPCActivateClusterStateVersionRequest.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/rpc/RPCActivateClusterStateVersionRequest.java
new file mode 100644
index 00000000000..c2f48ccf589
--- /dev/null
+++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/rpc/RPCActivateClusterStateVersionRequest.java
@@ -0,0 +1,20 @@
+// Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.vespa.clustercontroller.core.rpc;
+
+import com.yahoo.jrt.Request;
+import com.yahoo.vespa.clustercontroller.core.ActivateClusterStateVersionRequest;
+import com.yahoo.vespa.clustercontroller.core.NodeInfo;
+
+/**
+ * FRT RPC state implementation of a single cluster state activation request.
+ */
+public class RPCActivateClusterStateVersionRequest extends ActivateClusterStateVersionRequest {
+
+ Request request;
+
+ public RPCActivateClusterStateVersionRequest(NodeInfo nodeInfo, Request request, int clusterStateVersion) {
+ super(nodeInfo, clusterStateVersion);
+ this.request = request;
+ }
+
+}
diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/rpc/RPCActivateClusterStateVersionWaiter.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/rpc/RPCActivateClusterStateVersionWaiter.java
new file mode 100644
index 00000000000..175a0b50cd6
--- /dev/null
+++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/rpc/RPCActivateClusterStateVersionWaiter.java
@@ -0,0 +1,47 @@
+// Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.vespa.clustercontroller.core.rpc;
+
+import com.yahoo.jrt.ErrorCode;
+import com.yahoo.jrt.Request;
+import com.yahoo.jrt.RequestWaiter;
+import com.yahoo.vespa.clustercontroller.core.ActivateClusterStateVersionRequest;
+import com.yahoo.vespa.clustercontroller.core.Communicator;
+import com.yahoo.vespa.clustercontroller.core.NodeInfo;
+import com.yahoo.vespa.clustercontroller.core.Timer;
+
+/**
+ * Binds together the reply received for a particular cluster state activation RPC and
+ * the cluster controller-internal callback handler which expects to receive it.
+ */
+public class RPCActivateClusterStateVersionWaiter implements RequestWaiter {
+
+ private final Communicator.Waiter<ActivateClusterStateVersionRequest> waiter;
+ private ActivateClusterStateVersionRequest request;
+
+ public RPCActivateClusterStateVersionWaiter(Communicator.Waiter<ActivateClusterStateVersionRequest> waiter) {
+ this.waiter = waiter;
+ }
+
+ public void setRequest(RPCActivateClusterStateVersionRequest request) {
+ this.request = request;
+ }
+
+ public ActivateClusterStateVersionRequest.Reply getReply(Request req) {
+ NodeInfo info = request.getNodeInfo();
+ if (req.isError()) {
+ return new ActivateClusterStateVersionRequest.Reply(req.errorCode(), req.errorMessage());
+ } else if (!req.checkReturnTypes("i")) {
+ return new ActivateClusterStateVersionRequest.Reply(ErrorCode.BAD_REPLY, "Got RPC response with invalid return types from " + info);
+ }
+ int actualVersion = req.returnValues().get(0).asInt32();
+ return ActivateClusterStateVersionRequest.Reply.withActualVersion(actualVersion);
+ }
+
+ @Override
+ public void handleRequestDone(Request request) {
+ ActivateClusterStateVersionRequest.Reply reply = getReply(request);
+ this.request.setReply(reply);
+ waiter.done(this.request);
+ }
+
+}
diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/rpc/RPCCommunicator.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/rpc/RPCCommunicator.java
index 9089da68e10..c3c5c9e3b98 100644
--- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/rpc/RPCCommunicator.java
+++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/rpc/RPCCommunicator.java
@@ -15,6 +15,7 @@ import com.yahoo.vdslib.state.NodeState;
import com.yahoo.vdslib.state.ClusterState;
import com.yahoo.vdslib.state.State;
import com.yahoo.log.LogLevel;
+import com.yahoo.vespa.clustercontroller.core.ActivateClusterStateVersionRequest;
import com.yahoo.vespa.clustercontroller.core.ClusterStateBundle;
import com.yahoo.vespa.clustercontroller.core.Communicator;
import com.yahoo.vespa.clustercontroller.core.FleetControllerOptions;
@@ -35,6 +36,9 @@ public class RPCCommunicator implements Communicator {
public static final Logger log = Logger.getLogger(RPCCommunicator.class.getName());
+ public static final int ACTIVATE_CLUSTER_STATE_VERSION_RPC_VERSION = 4;
+ public static final String ACTIVATE_CLUSTER_STATE_VERSION_RPC_METHOD_NAME = "activate_cluster_state_version";
+
public static final int SET_DISTRIBUTION_STATES_RPC_VERSION = 3;
public static final String SET_DISTRIBUTION_STATES_RPC_METHOD_NAME = "setdistributionstates";
@@ -106,7 +110,7 @@ public class RPCCommunicator implements Communicator {
public void getNodeState(NodeInfo node, Waiter<GetNodeStateRequest> externalWaiter) {
Target connection = getConnection(node);
if ( ! connection.isValid()) {
- log.log(LogLevel.DEBUG, "Connection to " + node.getRpcAddress() + " could not be created.");
+ log.log(LogLevel.DEBUG, () -> String.format("Connection to '%s' could not be created.", node.getRpcAddress()));
}
NodeState currentState = node.getReportedState();
Request req = new Request("getnodestate3");
@@ -134,7 +138,7 @@ public class RPCCommunicator implements Communicator {
Target connection = getConnection(node);
if ( ! connection.isValid()) {
- log.log(LogLevel.DEBUG, "Connection to " + node.getRpcAddress() + " could not be created.");
+ log.log(LogLevel.DEBUG, () -> String.format("Connection to '%s' could not be created.", node.getRpcAddress()));
return;
}
int nodeVersion = node.getVersion();
@@ -158,7 +162,29 @@ public class RPCCommunicator implements Communicator {
waiter.setRequest(stateRequest);
connection.invokeAsync(req, 60, waiter);
- node.setSystemStateVersionSent(baselineState);
+ node.setClusterStateVersionBundleSent(stateBundle);
+ }
+
+ /**
+  * Sends an activation RPC for the given cluster state version to the node and records on
+  * the node that the activation was sent. Only logs and returns if the connection is invalid.
+  */
+ @Override
+ public void activateClusterStateVersion(int clusterStateVersion, NodeInfo node, Waiter<ActivateClusterStateVersionRequest> externalWaiter) {
+     RPCActivateClusterStateVersionWaiter replyWaiter = new RPCActivateClusterStateVersionWaiter(externalWaiter);
+
+     Target target = getConnection(node);
+     if ( ! target.isValid()) {
+         log.log(LogLevel.DEBUG, () -> String.format("Connection to '%s' could not be created.", node.getRpcAddress()));
+         return;
+     }
+
+     Request rpc = new Request(ACTIVATE_CLUSTER_STATE_VERSION_RPC_METHOD_NAME);
+     rpc.parameters().add(new Int32Value(clusterStateVersion));
+
+     log.log(LogLevel.DEBUG, () -> String.format("Sending '%s' RPC to %s for state version %d",
+             rpc.methodName(), node.getRpcAddress(), clusterStateVersion));
+     // The waiter needs the request so it can attach the reply when the RPC completes.
+     replyWaiter.setRequest(new RPCActivateClusterStateVersionRequest(node, rpc, clusterStateVersion));
+
+     target.invokeAsync(rpc, 60, replyWaiter);
+     node.setClusterStateVersionActivationSent(clusterStateVersion);
 }
// protected for testing.
diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/rpc/SlimeClusterStateBundleCodec.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/rpc/SlimeClusterStateBundleCodec.java
index 1c391f9aacf..cb76f67038c 100644
--- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/rpc/SlimeClusterStateBundleCodec.java
+++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/rpc/SlimeClusterStateBundleCodec.java
@@ -30,6 +30,9 @@ public class SlimeClusterStateBundleCodec implements ClusterStateBundleCodec, En
public EncodedClusterStateBundle encode(ClusterStateBundle stateBundle) {
Slime slime = new Slime();
Cursor root = slime.setObject();
+ if (stateBundle.deferredActivation()) {
+ root.setBool("deferred-activation", stateBundle.deferredActivation());
+ }
Cursor states = root.setObject("states");
// TODO add another function that is not toString for this..!
states.setString("baseline", stateBundle.getBaselineClusterState().toString());
@@ -55,8 +58,9 @@ public class SlimeClusterStateBundleCodec implements ClusterStateBundleCodec, En
spaces.traverse(((ObjectTraverser)(key, value) -> {
derivedStates.put(key, AnnotatedClusterState.withoutAnnotations(ClusterState.stateFromString(value.asString())));
}));
+ boolean deferredActivation = root.field("deferred-activation").asBool(); // defaults to false if not present
- return ClusterStateBundle.of(AnnotatedClusterState.withoutAnnotations(baseline), derivedStates);
+ return ClusterStateBundle.of(AnnotatedClusterState.withoutAnnotations(baseline), derivedStates, deferredActivation);
}
// Technically the Slime enveloping could be its own class that is bundle codec independent, but
diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/status/LegacyIndexPageRequestHandler.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/status/LegacyIndexPageRequestHandler.java
index 07195d05aa8..6d11c4f1239 100644
--- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/status/LegacyIndexPageRequestHandler.java
+++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/status/LegacyIndexPageRequestHandler.java
@@ -65,7 +65,7 @@ public class LegacyIndexPageRequestHandler implements StatusPageServer.RequestHa
new VdsClusterHtmlRendrer(),
content,
timer,
- stateVersionTracker.getVersionedClusterState(),
+ stateVersionTracker.getVersionedClusterStateBundle(),
stateVersionTracker.getAggregatedClusterStats(),
data.getOptions().storageDistribution,
data.getOptions(),
diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/status/statuspage/VdsClusterHtmlRendrer.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/status/statuspage/VdsClusterHtmlRendrer.java
index 551eb34f8fa..0daa9ff7d37 100644
--- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/status/statuspage/VdsClusterHtmlRendrer.java
+++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/status/statuspage/VdsClusterHtmlRendrer.java
@@ -64,7 +64,7 @@ public class VdsClusterHtmlRendrer {
final TreeMap<Integer, NodeInfo> storageNodeInfos,
final TreeMap<Integer, NodeInfo> distributorNodeInfos,
final Timer timer,
- final ClusterState state,
+ final ClusterStateBundle state,
final ClusterStatsAggregator statsAggregator,
final double minMergeCompletionRatio,
final int maxPrematureCrashes,
@@ -161,7 +161,7 @@ public class VdsClusterHtmlRendrer {
final TreeMap<Integer, NodeInfo> nodeInfos,
final NodeType nodeType,
final Timer timer,
- final ClusterState state,
+ final ClusterStateBundle stateBundle,
final ClusterStatsAggregator statsAggregator,
final double minMergeCompletionRatio,
final int maxPrematureCrashes,
@@ -169,145 +169,180 @@ public class VdsClusterHtmlRendrer {
final String pathPrefix,
final String dominantVtag,
final String name) {
+ final ClusterState state = stateBundle.getBaselineClusterState();
final long currentTime = timer.getCurrentTimeInMillis();
addTableHeader(name, nodeType);
for (final NodeInfo nodeInfo : nodeInfos.values()) {
HtmlTable.Row row = new HtmlTable.Row();
+ long timeSinceContact = nodeInfo.getTimeOfFirstFailingConnectionAttempt() == 0
+ ? 0 : currentTime - nodeInfo.getTimeOfFirstFailingConnectionAttempt();
- // Add node index
- row.addCell(new HtmlTable.Cell("<a href=\"" + pathPrefix + "/node=" + nodeInfo.getNode()
- + "\">" + nodeInfo.getNodeIndex() + "</a>"));
+ addNodeIndex(pathPrefix, nodeInfo, row);
+ addReportedState(nodeInfo, row);
+ addWantedState(nodeInfo, row);
+ addCurrentState(state, nodeInfo, row);
+ addBuildTagVersion(dominantVtag, nodeInfo, row);
+ addFailedConnectionAttemptCount(nodeInfo, row, timeSinceContact);
+ addTimeSinceFirstFailing(nodeInfo, row, timeSinceContact);
+ addStatePendingTime(currentTime, nodeInfo, row);
+ addClusterStateVersion(stateBundle, nodeInfo, row);
+ addPrematureCrashes(maxPrematureCrashes, nodeInfo, row);
+ addEventsLastWeek(eventLog, currentTime, nodeInfo, row);
+ addBucketSpacesStats(nodeType, statsAggregator, minMergeCompletionRatio, nodeInfo, row);
+ addStartTime(nodeInfo, row);
+ addRpcAddress(nodeInfo, row);
- // Add reported state
- NodeState reportedState = nodeInfo.getReportedState().clone().setStartTimestamp(0);
- row.addCell(new HtmlTable.Cell(HtmlTable.escape(reportedState.toString(true))));
- if (!nodeInfo.getReportedState().getState().equals(State.UP)) {
+ table.addRow(row);
+ }
+ }
+
+ private void addRpcAddress(NodeInfo nodeInfo, HtmlTable.Row row) {
+ if (nodeInfo.getRpcAddress() == null) {
+ row.addCell(new HtmlTable.Cell("-").addProperties(ERROR_PROPERTY));
+ } else {
+ row.addCell(new HtmlTable.Cell(HtmlTable.escape(nodeInfo.getRpcAddress())));
+ if (nodeInfo.isRpcAddressOutdated()) {
row.getLastCell().addProperties(WARNING_PROPERTY);
}
+ }
+ }
- // Add wanted state
- if (nodeInfo.getWantedState() == null || nodeInfo.getWantedState().getState().equals(State.UP)) {
- row.addCell(new HtmlTable.Cell("-").addProperties(CENTERED_PROPERTY));
- } else {
- row.addCell(new HtmlTable.Cell(HtmlTable.escape(nodeInfo.getWantedState().toString(true))));
- if (nodeInfo.getWantedState().toString(true).indexOf("Disabled by fleet controller") != -1) {
- row.getLastCell().addProperties(ERROR_PROPERTY);
- } else {
- row.getLastCell().addProperties(WARNING_PROPERTY);
- }
- }
+ private void addStartTime(NodeInfo nodeInfo, HtmlTable.Row row) {
+ if (nodeInfo.getStartTimestamp() == 0) {
+ row.addCell(new HtmlTable.Cell("-").addProperties(ERROR_PROPERTY).addProperties(CENTERED_PROPERTY));
+ } else {
+ String startTime = RealTimer.printDateNoMilliSeconds(
+ 1000 * nodeInfo.getStartTimestamp(), utcTimeZone);
+ row.addCell(new HtmlTable.Cell(HtmlTable.escape(startTime)));
+ }
+ }
- // Add current state
- NodeState ns = state.getNodeState(nodeInfo.getNode()).clone().setDescription("").setMinUsedBits(16);
- if (state.getClusterState().oneOf("uir")) {
- row.addCell(new HtmlTable.Cell(HtmlTable.escape(ns.toString(true))));
- if (ns.getState().equals(State.DOWN)) {
- row.getLastCell().addProperties(ERROR_PROPERTY);
- } else if (ns.getState().oneOf("mi")) {
- row.getLastCell().addProperties(WARNING_PROPERTY);
- }
- } else {
- row.addCell(new HtmlTable.Cell("Cluster " +
- state.getClusterState().name().toLowerCase()).addProperties(ERROR_PROPERTY));
- }
+ /**
+  * Adds bucket stats cells for the 'default' and then the 'global' bucket space, using the
+  * content node stats lookup for storage nodes and the distributor lookup otherwise.
+  */
+ private void addBucketSpacesStats(NodeType nodeType, ClusterStatsAggregator statsAggregator, double minMergeCompletionRatio, NodeInfo nodeInfo, HtmlTable.Row row) {
+     if ( ! nodeType.equals(NodeType.STORAGE)) {
+         addBucketStats(row,
+                 getStatsForDistributorNode(statsAggregator, nodeInfo, FixedBucketSpaces.defaultSpace()),
+                 minMergeCompletionRatio);
+         addBucketStats(row,
+                 getStatsForDistributorNode(statsAggregator, nodeInfo, FixedBucketSpaces.globalSpace()),
+                 minMergeCompletionRatio);
+         return;
+     }
+     addBucketStats(row,
+             getStatsForContentNode(statsAggregator, nodeInfo, FixedBucketSpaces.defaultSpace()),
+             minMergeCompletionRatio);
+     addBucketStats(row,
+             getStatsForContentNode(statsAggregator, nodeInfo, FixedBucketSpaces.globalSpace()),
+             minMergeCompletionRatio);
+ }
- // Add build tag version.
- final String buildTagText =
- nodeInfo.getVtag() != null
- ? nodeInfo.getVtag()
- : TAG_NOT_SET;
- row.addCell(new HtmlTable.Cell(buildTagText));
- if (! dominantVtag.equals(nodeInfo.getVtag())) {
- row.getLastCell().addProperties(WARNING_PROPERTY);
- }
+ private void addEventsLastWeek(EventLog eventLog, long currentTime, NodeInfo nodeInfo, HtmlTable.Row row) {
+ int nodeEvents = eventLog.getNodeEventsSince(nodeInfo.getNode(),
+ currentTime - eventLog.getRecentTimePeriod());
+ row.addCell(new HtmlTable.Cell("" + nodeEvents));
+ if (nodeEvents > 20) {
+ row.getLastCell().addProperties(ERROR_PROPERTY);
+ } else if (nodeEvents > 3) {
+ row.getLastCell().addProperties(WARNING_PROPERTY);
+ }
+ }
- // Add failed connection attempt count
- row.addCell(new HtmlTable.Cell("" + nodeInfo.getConnectionAttemptCount()));
- long timeSinceContact = nodeInfo.getTimeOfFirstFailingConnectionAttempt() == 0
- ? 0 : currentTime - nodeInfo.getTimeOfFirstFailingConnectionAttempt();
- if (timeSinceContact > 60 * 1000) {
- row.getLastCell().addProperties(ERROR_PROPERTY);
- } else if (nodeInfo.getConnectionAttemptCount() > 0) {
- row.getLastCell().addProperties(WARNING_PROPERTY);
- }
+ /**
+  * Adds the premature crash count cell; reaching maxPrematureCrashes marks it as an error,
+  * any other positive count as a warning.
+  */
+ private void addPrematureCrashes(int maxPrematureCrashes, NodeInfo nodeInfo, HtmlTable.Row row) {
+     row.addCell(new HtmlTable.Cell(String.valueOf(nodeInfo.getPrematureCrashCount())));
+     if (nodeInfo.getPrematureCrashCount() >= maxPrematureCrashes) {
+         row.getLastCell().addProperties(ERROR_PROPERTY);
+         return;
+     }
+     if (nodeInfo.getPrematureCrashCount() > 0) {
+         row.getLastCell().addProperties(WARNING_PROPERTY);
+     }
+ }
- // Add time since first failing
- row.addCell(new HtmlTable.Cell((timeSinceContact / 1000) + " s"));
- if (timeSinceContact > 60 * 1000) {
- row.getLastCell().addProperties(ERROR_PROPERTY);
- } else if (nodeInfo.getConnectionAttemptCount() > 0) {
- row.getLastCell().addProperties(WARNING_PROPERTY);
- }
+ private void addClusterStateVersion(ClusterStateBundle state, NodeInfo nodeInfo, HtmlTable.Row row) {
+ String cellContent = (nodeInfo.getClusterStateVersionActivationAcked() == state.getVersion() || !state.deferredActivation())
+ ? String.format("%d", nodeInfo.getClusterStateVersionBundleAcknowledged())
+ : String.format("%d (%d)", nodeInfo.getClusterStateVersionBundleAcknowledged(),
+ nodeInfo.getClusterStateVersionActivationAcked());
+ row.addCell(new HtmlTable.Cell(cellContent));
+ if (nodeInfo.getClusterStateVersionBundleAcknowledged() < state.getVersion() - 2) {
+ row.getLastCell().addProperties(ERROR_PROPERTY);
+ } else if (nodeInfo.getClusterStateVersionBundleAcknowledged() < state.getVersion()) {
+ row.getLastCell().addProperties(WARNING_PROPERTY);
+ }
+ }
- // State pending time
- if (nodeInfo.getLatestNodeStateRequestTime() == null) {
- row.addCell(new HtmlTable.Cell("-").addProperties(CENTERED_PROPERTY));
- } else {
- row.addCell(new HtmlTable.Cell(HtmlTable.escape(RealTimer.printDuration(
- currentTime - nodeInfo.getLatestNodeStateRequestTime()))));
- }
+ /**
+  * Adds the state pending time cell: the duration since the latest node state request time,
+  * or a centered "-" when that time is null.
+  */
+ private void addStatePendingTime(long currentTime, NodeInfo nodeInfo, HtmlTable.Row row) {
+     if (nodeInfo.getLatestNodeStateRequestTime() == null) {
+         row.addCell(new HtmlTable.Cell("-").addProperties(CENTERED_PROPERTY));
+         return;
+     }
+     row.addCell(new HtmlTable.Cell(HtmlTable.escape(
+             RealTimer.printDuration(currentTime - nodeInfo.getLatestNodeStateRequestTime()))));
+ }
- // System state version
- row.addCell(new HtmlTable.Cell("" + nodeInfo.getSystemStateVersionAcknowledged()));
- if (nodeInfo.getSystemStateVersionAcknowledged() < state.getVersion() - 2) {
- row.getLastCell().addProperties(ERROR_PROPERTY);
- } else if (nodeInfo.getSystemStateVersionAcknowledged() < state.getVersion()) {
- row.getLastCell().addProperties(WARNING_PROPERTY);
- }
+ private void addTimeSinceFirstFailing(NodeInfo nodeInfo, HtmlTable.Row row, long timeSinceContact) {
+ row.addCell(new HtmlTable.Cell((timeSinceContact / 1000) + " s"));
+ if (timeSinceContact > 60 * 1000) {
+ row.getLastCell().addProperties(ERROR_PROPERTY);
+ } else if (nodeInfo.getConnectionAttemptCount() > 0) {
+ row.getLastCell().addProperties(WARNING_PROPERTY);
+ }
+ }
- // Premature crashes
- row.addCell(new HtmlTable.Cell("" + nodeInfo.getPrematureCrashCount()));
- if (nodeInfo.getPrematureCrashCount() >= maxPrematureCrashes) {
- row.getLastCell().addProperties(ERROR_PROPERTY);
- } else if (nodeInfo.getPrematureCrashCount() > 0) {
- row.getLastCell().addProperties(WARNING_PROPERTY);
- }
+ /**
+  * Adds the connection attempt count cell; more than 60 seconds since contact is an error,
+  * otherwise a positive attempt count is a warning.
+  */
+ private void addFailedConnectionAttemptCount(NodeInfo nodeInfo, HtmlTable.Row row, long timeSinceContact) {
+     row.addCell(new HtmlTable.Cell(String.valueOf(nodeInfo.getConnectionAttemptCount())));
+     if (timeSinceContact > 60 * 1000) {
+         row.getLastCell().addProperties(ERROR_PROPERTY);
+         return;
+     }
+     if (nodeInfo.getConnectionAttemptCount() > 0) {
+         row.getLastCell().addProperties(WARNING_PROPERTY);
+     }
+ }
+
+ /**
+  * Adds the build tag (vtag) cell, falling back to TAG_NOT_SET when the node has no vtag.
+  * The cell is marked as a warning when the node's vtag differs from dominantVtag.
+  *
+  * The vtag is HTML-escaped before being placed in the cell, in line with how the other
+  * node-supplied strings (RPC address, states) are rendered in this table.
+  */
+ private void addBuildTagVersion(String dominantVtag, NodeInfo nodeInfo, HtmlTable.Row row) {
+     final String buildTagText =
+             nodeInfo.getVtag() != null
+                     ? HtmlTable.escape(nodeInfo.getVtag())
+                     : TAG_NOT_SET;
+     row.addCell(new HtmlTable.Cell(buildTagText));
+     // Compared against the raw (unescaped) vtag so the dominance check is unaffected by escaping.
+     if (! dominantVtag.equals(nodeInfo.getVtag())) {
+         row.getLastCell().addProperties(WARNING_PROPERTY);
+     }
+ }
- // Events last week
- int nodeEvents = eventLog.getNodeEventsSince(nodeInfo.getNode(),
- currentTime - eventLog.getRecentTimePeriod());
- row.addCell(new HtmlTable.Cell("" + nodeEvents));
- if (nodeEvents > 20) {
+ private void addCurrentState(ClusterState state, NodeInfo nodeInfo, HtmlTable.Row row) {
+ NodeState ns = state.getNodeState(nodeInfo.getNode()).clone().setDescription("").setMinUsedBits(16);
+ if (state.getClusterState().oneOf("uir")) {
+ row.addCell(new HtmlTable.Cell(HtmlTable.escape(ns.toString(true))));
+ if (ns.getState().equals(State.DOWN)) {
row.getLastCell().addProperties(ERROR_PROPERTY);
- } else if (nodeEvents > 3) {
+ } else if (ns.getState().oneOf("mi")) {
row.getLastCell().addProperties(WARNING_PROPERTY);
}
+ } else {
+ row.addCell(new HtmlTable.Cell("Cluster " +
+ state.getClusterState().name().toLowerCase()).addProperties(ERROR_PROPERTY));
+ }
+ }
- // Bucket stats for 'default' and 'global' spaces
- if (nodeType.equals(NodeType.STORAGE)) {
- addBucketStats(row, getStatsForContentNode(statsAggregator, nodeInfo, FixedBucketSpaces.defaultSpace()),
- minMergeCompletionRatio);
- addBucketStats(row, getStatsForContentNode(statsAggregator, nodeInfo, FixedBucketSpaces.globalSpace()),
- minMergeCompletionRatio);
- } else {
- addBucketStats(row, getStatsForDistributorNode(statsAggregator, nodeInfo, FixedBucketSpaces.defaultSpace()),
- minMergeCompletionRatio);
- addBucketStats(row, getStatsForDistributorNode(statsAggregator, nodeInfo, FixedBucketSpaces.globalSpace()),
- minMergeCompletionRatio);
- }
-
- // Start time
- if (nodeInfo.getStartTimestamp() == 0) {
- row.addCell(new HtmlTable.Cell("-").addProperties(ERROR_PROPERTY).addProperties(CENTERED_PROPERTY));
+ private void addWantedState(NodeInfo nodeInfo, HtmlTable.Row row) {
+ if (nodeInfo.getWantedState() == null || nodeInfo.getWantedState().getState().equals(State.UP)) {
+ row.addCell(new HtmlTable.Cell("-").addProperties(CENTERED_PROPERTY));
+ } else {
+ row.addCell(new HtmlTable.Cell(HtmlTable.escape(nodeInfo.getWantedState().toString(true))));
+ if (nodeInfo.getWantedState().toString(true).indexOf("Disabled by fleet controller") != -1) {
+ row.getLastCell().addProperties(ERROR_PROPERTY);
} else {
- String startTime = RealTimer.printDateNoMilliSeconds(
- 1000 * nodeInfo.getStartTimestamp(), utcTimeZone);
- row.addCell(new HtmlTable.Cell(HtmlTable.escape(startTime)));
+ row.getLastCell().addProperties(WARNING_PROPERTY);
}
+ }
+ }
- // RPC address
- if (nodeInfo.getRpcAddress() == null) {
- row.addCell(new HtmlTable.Cell("-").addProperties(ERROR_PROPERTY));
- } else {
- row.addCell(new HtmlTable.Cell(HtmlTable.escape(nodeInfo.getRpcAddress())));
- if (nodeInfo.isRpcAddressOutdated()) {
- row.getLastCell().addProperties(WARNING_PROPERTY);
- }
- }
- table.addRow(row);
+ /**
+  * Adds the reported state cell, rendered from a copy of the reported state whose start
+  * timestamp is set to 0. Any reported state other than UP is marked as a warning.
+  */
+ private void addReportedState(NodeInfo nodeInfo, HtmlTable.Row row) {
+     NodeState withoutStartTime = nodeInfo.getReportedState().clone().setStartTimestamp(0);
+     row.addCell(new HtmlTable.Cell(HtmlTable.escape(withoutStartTime.toString(true))));
+     if (nodeInfo.getReportedState().getState().equals(State.UP)) {
+         return;
+     }
+     row.getLastCell().addProperties(WARNING_PROPERTY);
 }
+ private void addNodeIndex(String pathPrefix, NodeInfo nodeInfo, HtmlTable.Row row) {
+ row.addCell(new HtmlTable.Cell("<a href=\"" + pathPrefix + "/node=" + nodeInfo.getNode()
+ + "\">" + nodeInfo.getNodeIndex() + "</a>"));
+ }
+
private static ContentNodeStats.BucketSpaceStats getStatsForContentNode(ClusterStatsAggregator statsAggregator,
NodeInfo nodeInfo,
String bucketSpace) {
@@ -355,7 +390,7 @@ public class VdsClusterHtmlRendrer {
.append("3) SPT - State pending time - Time the current getNodeState request has been " +
"pending.<br>\n")
.append("4) SSV - System state version - The latest system state version the node has " +
- "acknowledged.<br>\n")
+ "acknowledged (last <em>activated</em> state version in parentheses if this is not equal to SSV).<br>\n")
.append("5) PC - Premature crashes - Number of times node has crashed since last time it had " +
"been stable in up or down state for more than "
+ RealTimer.printDuration(stableStateTimePeriode) + ".<br>\n")
diff --git a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/ClusterStateBundleTest.java b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/ClusterStateBundleTest.java
index 7dccae988df..339d305e823 100644
--- a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/ClusterStateBundleTest.java
+++ b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/ClusterStateBundleTest.java
@@ -4,8 +4,11 @@ package com.yahoo.vespa.clustercontroller.core;
import com.yahoo.vdslib.state.*;
import org.junit.Test;
+import java.util.function.Function;
+
import static org.hamcrest.CoreMatchers.equalTo;
import static org.hamcrest.MatcherAssert.assertThat;
+import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
@@ -19,7 +22,7 @@ public class ClusterStateBundleTest {
return AnnotatedClusterState.withoutAnnotations(stateOf(state));
}
- private static ClusterStateBundle createTestBundle(boolean modifyDefaultSpace) {
+ private static ClusterStateBundle.Builder createTestBundleBuilder(boolean modifyDefaultSpace) {
return ClusterStateBundle
.builder(annotatedStateOf("distributor:2 storage:2"))
.bucketSpaces("default", "global", "narnia")
@@ -33,8 +36,11 @@ public class ClusterStateBundleTest {
.setNodeState(Node.ofDistributor(0), new NodeState(NodeType.DISTRIBUTOR, State.DOWN));
}
return derived;
- })
- .deriveAndBuild();
+ });
+ }
+
+ /** Builds the test bundle directly from the shared test bundle builder. */
+ private static ClusterStateBundle createTestBundle(boolean modifyDefaultSpace) {
+     ClusterStateBundle.Builder builder = createTestBundleBuilder(modifyDefaultSpace);
+     return builder.deriveAndBuild();
 }
private static ClusterStateBundle createTestBundle() {
@@ -96,4 +102,75 @@ public class ClusterStateBundleTest {
"narnia 'distributor:2 .0.s:d storage:2')"));
}
+ @Test
+ public void toString_without_derived_states_specifies_deferred_activation_iff_set() {
+     // Baseline-only bundle with deferred activation enabled.
+     ClusterStateBundle deferred = ClusterStateBundle.ofBaselineOnly(annotatedStateOf("distributor:2 storage:2"), true);
+     String expected = "ClusterStateBundle('distributor:2 storage:2' (deferred activation))";
+     assertThat(deferred.toString(), equalTo(expected));
+ }
+
+ @Test
+ public void toString_without_derived_states_does_not_specify_deferred_activation_iff_not_set() {
+     // Baseline-only bundle with deferred activation disabled.
+     ClusterStateBundle immediate = ClusterStateBundle.ofBaselineOnly(annotatedStateOf("distributor:2 storage:2"), false);
+     String expected = "ClusterStateBundle('distributor:2 storage:2')";
+     assertThat(immediate.toString(), equalTo(expected));
+ }
+
+ @Test
+ public void toString_with_derived_states_specifies_deferred_activation_iff_set() {
+     ClusterStateBundle deferred = createTestBundleBuilder(true).deferredActivation(true).deriveAndBuild();
+     // The deferred activation marker is expected after the last derived state.
+     String expected = "ClusterStateBundle('distributor:2 storage:2', " +
+             "default 'distributor:2 storage:2 .0.s:d', " +
+             "global 'distributor:2 storage:2', " +
+             "narnia 'distributor:2 .0.s:d storage:2' (deferred activation))";
+     assertThat(deferred.toString(), equalTo(expected));
+ }
+
+ @Test
+ public void toString_with_derived_states_does_not_specify_deferred_activation_iff_not_set() {
+     ClusterStateBundle immediate = createTestBundleBuilder(true).deferredActivation(false).deriveAndBuild();
+     // No deferred activation marker is expected in the rendered string.
+     String expected = "ClusterStateBundle('distributor:2 storage:2', " +
+             "default 'distributor:2 storage:2 .0.s:d', " +
+             "global 'distributor:2 storage:2', " +
+             "narnia 'distributor:2 .0.s:d storage:2')";
+     assertThat(immediate.toString(), equalTo(expected));
+ }
+
+ @Test
+ public void deferred_activation_is_disabled_by_default() {
+     // The default test bundle never touches the deferred activation flag.
+     assertFalse(createTestBundle().deferredActivation());
+ }
+
+ @Test
+ public void can_build_bundle_with_deferred_activation_enabled() {
+     ClusterStateBundle.Builder builder = createTestBundleBuilder(false).deferredActivation(true);
+     ClusterStateBundle built = builder.deriveAndBuild();
+     assertTrue(built.deferredActivation());
+ }
+
+ @Test
+ public void can_build_bundle_with_deferred_activation_disabled() {
+     ClusterStateBundle.Builder builder = createTestBundleBuilder(false).deferredActivation(false);
+     ClusterStateBundle built = builder.deriveAndBuild();
+     assertFalse(built.deferredActivation());
+ }
+
+ @Test
+ public void simple_bundle_without_derived_states_propagates_deferred_activation_flag() {
+     ClusterStateBundle.Builder baselineOnly = ClusterStateBundle.builder(annotatedStateOf("distributor:2 storage:2"));
+     // defaults to false
+     ClusterStateBundle built = baselineOnly.deferredActivation(true).deriveAndBuild();
+     assertTrue(built.deferredActivation());
+ }
+
+ @Test
+ public void cloning_preserves_false_deferred_activation_flag() {
+     ClusterStateBundle original = createTestBundleBuilder(true).deferredActivation(false).deriveAndBuild();
+     // An identity mapping must yield a bundle equal to the original, flag included.
+     var cloned = original.cloneWithMapper(Function.identity());
+     assertEquals(original, cloned);
+ }
+
+ @Test
+ public void cloning_preserves_true_deferred_activation_flag() {
+     ClusterStateBundle original = createTestBundleBuilder(true).deferredActivation(true).deriveAndBuild();
+     // An identity mapping must yield a bundle equal to the original, flag included.
+     var cloned = original.cloneWithMapper(Function.identity());
+     assertEquals(original, cloned);
+ }
+
}
diff --git a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/ClusterStateBundleUtil.java b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/ClusterStateBundleUtil.java
index 00c2194205d..cceb6d6f03f 100644
--- a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/ClusterStateBundleUtil.java
+++ b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/ClusterStateBundleUtil.java
@@ -12,6 +12,12 @@ import java.util.stream.Stream;
*/
public class ClusterStateBundleUtil {
+ public static ClusterStateBundle.Builder makeBundleBuilder(String baselineState, StateMapping... bucketSpaceStates) {
+ return ClusterStateBundle.builder(AnnotatedClusterState.withoutAnnotations(ClusterState.stateFromString(baselineState)))
+ .explicitDerivedStates(Stream.of(bucketSpaceStates).collect(Collectors.toMap(sm -> sm.bucketSpace,
+ sm -> AnnotatedClusterState.withoutAnnotations(sm.state))));
+ }
+
public static ClusterStateBundle makeBundle(String baselineState, StateMapping... bucketSpaceStates) {
return ClusterStateBundle.of(AnnotatedClusterState.withoutAnnotations(ClusterState.stateFromString(baselineState)),
Stream.of(bucketSpaceStates).collect(Collectors.toMap(sm -> sm.bucketSpace,
diff --git a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/ContentClusterHtmlRendrerTest.java b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/ContentClusterHtmlRendrerTest.java
index f5adf644c28..9a3edf8e681 100644
--- a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/ContentClusterHtmlRendrerTest.java
+++ b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/ContentClusterHtmlRendrerTest.java
@@ -29,7 +29,10 @@ public class ContentClusterHtmlRendrerTest {
@Before
public void before() throws JSONException, ParseException {
- final ClusterState state = new ClusterState("version:34633 bits:24 distributor:211 storage:211");
+ final ClusterStateBundle stateBundle = ClusterStateBundle.ofBaselineOnly(
+ AnnotatedClusterState.withoutAnnotations(
+ ClusterState.stateFromString("version:34633 bits:24 distributor:211 storage:211")));
+ final ClusterState state = stateBundle.getBaselineClusterState();
final EventLog eventLog = new EventLog(new FakeTimer(), null);
final VdsClusterHtmlRendrer.Table table = rendrer.createNewClusterHtmlTable(clusterName, slobrokGeneration);
@@ -55,7 +58,7 @@ public class ContentClusterHtmlRendrerTest {
storageNodeInfoByIndex,
distributorNodeInfoByIndex,
new FakeTimer(),
- state,
+ stateBundle,
statsAggregator,
1.0,
10,
diff --git a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/DatabaseTest.java b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/DatabaseTest.java
index e54a333a15e..27dcd009c96 100644
--- a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/DatabaseTest.java
+++ b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/DatabaseTest.java
@@ -49,7 +49,7 @@ public class DatabaseTest extends FleetControllerTest {
@Test
public void testWantedStatesInZooKeeper() throws Exception {
startingTest("DatabaseTest::testWantedStatesInZooKeeper");
- FleetControllerOptions options = new FleetControllerOptions("mycluster");
+ FleetControllerOptions options = defaultOptions("mycluster");
options.zooKeeperServerAddress = "127.0.0.1";
setUpFleetController(true, options);
setUpVdsNodes(true, new DummyVdsNodeOptions());
@@ -105,7 +105,7 @@ public class DatabaseTest extends FleetControllerTest {
@Test
public void testWantedStateOfUnknownNode() throws Exception {
startingTest("DatabaseTest::testWantedStatesOfUnknownNode");
- FleetControllerOptions options = new FleetControllerOptions("mycluster");
+ FleetControllerOptions options = defaultOptions("mycluster");
options.minRatioOfDistributorNodesUp = 0;
options.minRatioOfStorageNodesUp = 0;
options.zooKeeperServerAddress = "localhost";
diff --git a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/DistributionBitCountTest.java b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/DistributionBitCountTest.java
index ae59336b5ef..77164b678c6 100644
--- a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/DistributionBitCountTest.java
+++ b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/DistributionBitCountTest.java
@@ -20,7 +20,7 @@ public class DistributionBitCountTest extends FleetControllerTest {
for (int i = 0 ; i < 10; i++) {
configuredNodes.add(new ConfiguredNode(i, false));
}
- FleetControllerOptions options = new FleetControllerOptions("mycluster", configuredNodes);
+ FleetControllerOptions options = defaultOptions("mycluster", configuredNodes);
options.distributionBits = 17;
setUpFleetController(false, options);
startingTest(testName);
diff --git a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/DummyCommunicator.java b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/DummyCommunicator.java
index 5d200d65516..8314839336e 100644
--- a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/DummyCommunicator.java
+++ b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/DummyCommunicator.java
@@ -24,7 +24,7 @@ public class DummyCommunicator implements Communicator, NodeLookup {
this.shouldDeferDistributorClusterStateAcks = shouldDeferDistributorClusterStateAcks;
}
- public class DummyGetNodeStateRequest extends GetNodeStateRequest {
+ class DummyGetNodeStateRequest extends GetNodeStateRequest {
Waiter<GetNodeStateRequest> waiter;
public DummyGetNodeStateRequest(NodeInfo nodeInfo, Waiter<GetNodeStateRequest> waiter) {
@@ -47,6 +47,14 @@ public class DummyCommunicator implements Communicator, NodeLookup {
}
+ /** Activation request used by this dummy communicator; adds nothing beyond its superclass. */
+ public class DummyActivateClusterStateVersionRequest extends ActivateClusterStateVersionRequest {
+
+     public DummyActivateClusterStateVersionRequest(final NodeInfo nodeInfo, final int stateVersion) {
+         super(nodeInfo, stateVersion);
+     }
+
+ }
+
private Map<Node, DummyGetNodeStateRequest> getNodeStateRequests = new TreeMap<>();
public DummyCommunicator(List<Node> nodeList, Timer timer) {
@@ -89,7 +97,7 @@ public class DummyCommunicator implements Communicator, NodeLookup {
public void setSystemState(ClusterStateBundle stateBundle, NodeInfo node, Waiter<SetClusterStateRequest> waiter) {
ClusterState baselineState = stateBundle.getBaselineClusterState();
DummySetClusterStateRequest req = new DummySetClusterStateRequest(node, baselineState);
- node.setSystemStateVersionSent(baselineState);
+ node.setClusterStateVersionBundleSent(stateBundle);
req.setReply(new SetClusterStateRequest.Reply());
if (node.isStorage() || !shouldDeferDistributorClusterStateAcks) {
waiter.done(req);
@@ -98,6 +106,13 @@ public class DummyCommunicator implements Communicator, NodeLookup {
}
}
+ /** Test double: completes the activation at once, echoing the requested version back as the node's actual version. */
+ @Override
+ public void activateClusterStateVersion(int clusterStateVersion, NodeInfo node, Waiter<ActivateClusterStateVersionRequest> waiter) {
+     DummyActivateClusterStateVersionRequest activation = new DummyActivateClusterStateVersionRequest(node, clusterStateVersion);
+     activation.setReply(ActivateClusterStateVersionRequest.Reply.withActualVersion(clusterStateVersion));
+     waiter.done(activation);
+ }
public void sendAllDeferredDistributorClusterStateAcks() {
deferredClusterStateAcks.forEach(reqAndWaiter -> reqAndWaiter.getFirst().done(reqAndWaiter.getSecond()));
deferredClusterStateAcks.clear();
diff --git a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/DummyVdsNode.java b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/DummyVdsNode.java
index 6d59a672e86..bd68f0fa343 100644
--- a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/DummyVdsNode.java
+++ b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/DummyVdsNode.java
@@ -68,6 +68,7 @@ public class DummyVdsNode {
* Any access to this list or to its members must be synchronized on the timer variable.
*/
private List<ClusterStateBundle> clusterStateBundles = new LinkedList<>();
+ private int activatedClusterStateVersion = 0;
private Thread messageResponder = new Thread() {
public void run() {
@@ -220,6 +221,12 @@ public class DummyVdsNode {
}
}
+ /** Returns the version last marked as activated on this dummy node, read while holding the timer monitor. */
+ public int getActivatedClusterStateVersion() {
+     final int activatedVersion;
+     synchronized (timer) { activatedVersion = activatedClusterStateVersion; }
+     return activatedVersion;
+ }
public boolean hasPendingGetNodeStateRequest() {
synchronized (timer) {
return !waitingRequests.isEmpty();
@@ -300,14 +307,21 @@ public class DummyVdsNode {
public ClusterStateBundle getClusterStateBundle() {
synchronized(timer) {
- return (clusterStateBundles.isEmpty() ? null : clusterStateBundles.get(0));
+ // With two-phase activation a bundle is stored in `clusterStateBundles` as soon as it is received,
+ // i.e. before its version has been activated. Callers must only ever observe _activated_ bundles,
+ // so any bundle newer than the activated version is skipped. Without two-phase activation the
+ // newest received bundle is implicitly the activated one, so nothing gets skipped.
+ for (ClusterStateBundle candidate : clusterStateBundles) { // List is ordered most recent first
+     if (candidate.getVersion() <= activatedClusterStateVersion) return candidate;
+ }
+ return null; // No stored bundle is at or below the activated version
}
}
public ClusterState getClusterState() {
- synchronized(timer) {
- return (clusterStateBundles.isEmpty() ? null : clusterStateBundles.get(0).getBaselineClusterState());
- }
+ return Optional.ofNullable(getClusterStateBundle())
+ .map(b -> b.getBaselineClusterState())
+ .orElse(null);
}
public String getSlobrokName() {
@@ -369,6 +383,13 @@ public class DummyVdsNode {
m.paramDesc(2, "payload", "Slime format payload");
supervisor.addMethod(m);
}
+ if (stateCommunicationVersion >= RPCCommunicator.ACTIVATE_CLUSTER_STATE_VERSION_RPC_VERSION) {
+ m = new Method(RPCCommunicator.ACTIVATE_CLUSTER_STATE_VERSION_RPC_METHOD_NAME, "i", "i", this, "rpc_activateClusterStateVersion");
+ m.methodDesc("Activate a given cluster state version");
+ m.paramDesc(0, "stateVersion", "Cluster state version to activate");
+ m.returnDesc(0, "actualVersion", "Actual cluster state version on node");
+ supervisor.addMethod(m);
+ }
}
public void rpc_storageConnect(Request req) {
@@ -439,7 +460,7 @@ public class DummyVdsNode {
}
}
} catch (Exception e) {
- log.log(LogLevel.ERROR, "Dummy node " + this + ": An error occured when answering " + req.methodName() + " request: " + e.getMessage());
+ log.log(LogLevel.ERROR, "Dummy node " + this + ": An error occurred when answering " + req.methodName() + " request: " + e.getMessage());
e.printStackTrace(System.err);
req.setError(ErrorCode.METHOD_FAILED, e.getMessage());
}
@@ -499,7 +520,7 @@ public class DummyVdsNode {
req.returnValues().add(new StringValue("OK"));
log.log(LogLevel.DEBUG, "Dummy node " + this + ": Got new system state (through old setsystemstate call) " + newState);
} catch (Exception e) {
- log.log(LogLevel.ERROR, "Dummy node " + this + ": An error occured when answering setsystemstate request: " + e.getMessage());
+ log.log(LogLevel.ERROR, "Dummy node " + this + ": An error occurred when answering setsystemstate request: " + e.getMessage());
e.printStackTrace(System.err);
req.returnValues().add(new Int32Value(ErrorCode.METHOD_FAILED));
req.returnValues().add(new StringValue(e.getMessage()));
@@ -516,11 +537,14 @@ public class DummyVdsNode {
synchronized(timer) {
updateStartTimestamps(newState);
clusterStateBundles.add(0, ClusterStateBundle.ofBaselineOnly(AnnotatedClusterState.withoutAnnotations(newState)));
+ if (stateCommunicationVersion < RPCCommunicator.ACTIVATE_CLUSTER_STATE_VERSION_RPC_VERSION) {
+ activatedClusterStateVersion = newState.getVersion(); // Simulate node that does not know of activation
+ }
timer.notifyAll();
}
log.log(LogLevel.DEBUG, "Dummy node " + this + ": Got new system state " + newState);
} catch (Exception e) {
- log.log(LogLevel.ERROR, "Dummy node " + this + ": An error occured when answering setsystemstate request: " + e.getMessage());
+ log.log(LogLevel.ERROR, "Dummy node " + this + ": An error occurred when answering setsystemstate request: " + e.getMessage());
e.printStackTrace(System.err);
req.setError(ErrorCode.METHOD_FAILED, e.getMessage());
}
@@ -536,11 +560,41 @@ public class DummyVdsNode {
synchronized(timer) {
updateStartTimestamps(stateBundle.getBaselineClusterState());
clusterStateBundles.add(0, stateBundle);
+ if (stateCommunicationVersion < RPCCommunicator.ACTIVATE_CLUSTER_STATE_VERSION_RPC_VERSION) {
+ activatedClusterStateVersion = stateBundle.getVersion(); // Simulate node that does not know of activation
+ }
timer.notifyAll();
}
log.log(LogLevel.DEBUG, "Dummy node " + this + ": Got new cluster state " + stateBundle);
} catch (Exception e) {
- log.log(LogLevel.ERROR, "Dummy node " + this + ": An error occured when answering setdistributionstates request: " + e.getMessage());
+ log.log(LogLevel.ERROR, "Dummy node " + this + ": An error occurred when answering setdistributionstates request: " + e.getMessage());
+ e.printStackTrace(System.err);
+ req.setError(ErrorCode.METHOD_FAILED, e.getMessage());
+ }
+ }
+
+ /** RPC handler: activates the requested version only if it equals the latest received one; always returns the latter. */
+ public void rpc_activateClusterStateVersion(Request req) {
+     try {
+         if (shouldFailSetSystemStateRequests()) {
+             // We assume that failing setDistributionStates also implies failing version activations
+             req.setError(ErrorCode.GENERAL_ERROR, "Dummy node configured to fail activateClusterStateVersion() calls");
+             return;
+         }
+         int activateVersion = req.parameters().get(0).asInt32();
+         synchronized(timer) {
+             int actualVersion = getLatestSystemStateVersion().orElse(0);
+             req.returnValues().add(new Int32Value(actualVersion));
+             if (activateVersion == actualVersion) { // Activation, and its log line, only happen on an exact version match
+                 activatedClusterStateVersion = activateVersion;
+                 timer.notifyAll();
+                 log.log(LogLevel.DEBUG, "Dummy node " + this + ": Activating cluster state version " + activateVersion);
+             } else {
+                 log.log(LogLevel.DEBUG, () -> String.format("Dummy node %s: got a mismatching activation (request version %d, actual %d), not marking version as active", this, activateVersion, actualVersion));
+             }
+         }
+     } catch (Exception e) {
+         log.log(LogLevel.ERROR, "Dummy node " + this + ": An error occurred when answering activate_cluster_state_version request: " + e.getMessage());
e.printStackTrace(System.err);
req.setError(ErrorCode.METHOD_FAILED, e.getMessage());
}
diff --git a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/DummyVdsNodeOptions.java b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/DummyVdsNodeOptions.java
index bda06248d9e..bf63aebe022 100644
--- a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/DummyVdsNodeOptions.java
+++ b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/DummyVdsNodeOptions.java
@@ -8,6 +8,6 @@ import com.yahoo.vespa.clustercontroller.core.rpc.RPCCommunicator;
* over regular RPC.
*/
public class DummyVdsNodeOptions {
- // 0 - 4.1, 1 - 4.2-5.0.10, 2 - 5.0.11+, 3 - 6.220+
- public int stateCommunicationVersion = RPCCommunicator.SET_DISTRIBUTION_STATES_RPC_VERSION;
+ // State RPC version the dummy node speaks; list presumably maps version -> releases using it (TODO confirm): 0 - 4.1, 1 - 4.2-5.0.10, 2 - 5.0.11+, 3 - 6.220+, 4 - 7.24+
+ public int stateCommunicationVersion = RPCCommunicator.ACTIVATE_CLUSTER_STATE_VERSION_RPC_VERSION;
}
diff --git a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/FleetControllerTest.java b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/FleetControllerTest.java
index 21d9b0a7a1f..5ecc33ae964 100644
--- a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/FleetControllerTest.java
+++ b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/FleetControllerTest.java
@@ -119,6 +119,18 @@ public abstract class FleetControllerTest implements Waiter {
testName = name;
}
+ /** Creates options for the named cluster with two-phase cluster state activation switched on. */
+ protected static FleetControllerOptions defaultOptions(String clusterName) {
+     FleetControllerOptions defaults = new FleetControllerOptions(clusterName);
+     defaults.enableTwoPhaseClusterStateActivation = true; // Enable by default, tests can explicitly disable.
+     return defaults;
+ }
+ /** Creates options for the named cluster and node set with two-phase cluster state activation switched on. */
+ protected static FleetControllerOptions defaultOptions(String clusterName, Collection<ConfiguredNode> nodes) {
+     FleetControllerOptions defaults = new FleetControllerOptions(clusterName, nodes);
+     defaults.enableTwoPhaseClusterStateActivation = true; // Enable by default, tests can explicitly disable.
+     return defaults;
+ }
protected void setUpSystem(boolean useFakeTimer, FleetControllerOptions options) throws Exception {
log.log(LogLevel.DEBUG, "Setting up system");
slobrok = new Slobrok();
diff --git a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/GroupAutoTakedownLiveConfigTest.java b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/GroupAutoTakedownLiveConfigTest.java
index 4a3eef559aa..4805257ea7a 100644
--- a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/GroupAutoTakedownLiveConfigTest.java
+++ b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/GroupAutoTakedownLiveConfigTest.java
@@ -18,7 +18,7 @@ public class GroupAutoTakedownLiveConfigTest extends FleetControllerTest {
private static FleetControllerOptions createOptions(
DistributionBuilder.GroupBuilder groupBuilder, double minNodeRatio)
{
- FleetControllerOptions options = new FleetControllerOptions("mycluster");
+ FleetControllerOptions options = defaultOptions("mycluster");
options.setStorageDistribution(DistributionBuilder.forHierarchicCluster(groupBuilder));
options.nodes = DistributionBuilder.buildConfiguredNodes(groupBuilder.totalNodeCount())
.stream().collect(Collectors.toSet());
diff --git a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/MasterElectionTest.java b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/MasterElectionTest.java
index 9d6e39f244a..23389de3fad 100644
--- a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/MasterElectionTest.java
+++ b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/MasterElectionTest.java
@@ -119,7 +119,7 @@ public class MasterElectionTest extends FleetControllerTest {
public void testMasterElection() throws Exception {
startingTest("MasterElectionTest::testMasterElection");
log.log(LogLevel.INFO, "STARTING TEST: MasterElectionTest::testMasterElection()");
- FleetControllerOptions options = new FleetControllerOptions("mycluster");
+ FleetControllerOptions options = defaultOptions("mycluster");
options.masterZooKeeperCooldownPeriod = 1;
setUpFleetController(5, true, options);
waitForMaster(0);
@@ -223,7 +223,7 @@ public class MasterElectionTest extends FleetControllerTest {
@Test
public void testClusterStateVersionIncreasesAcrossMasterElections() throws Exception {
startingTest("MasterElectionTest::testClusterStateVersionIncreasesAcrossMasterElections");
- FleetControllerOptions options = new FleetControllerOptions("mycluster");
+ FleetControllerOptions options = defaultOptions("mycluster");
options.masterZooKeeperCooldownPeriod = 1;
setUpFleetController(5, true, options);
// Currently need to have content nodes present for the cluster controller to even bother
@@ -248,7 +248,7 @@ public class MasterElectionTest extends FleetControllerTest {
@Test
public void testVotingCorrectnessInFaceOfZKDisconnect() throws Exception {
startingTest("MasterElectionTest::testVotingCorrectnessInFaceOfZKDisconnect");
- FleetControllerOptions options = new FleetControllerOptions("mycluster");
+ FleetControllerOptions options = defaultOptions("mycluster");
// "Magic" port value is in range allocated to module for testing.
zooKeeperServer = ZooKeeperTestServer.createWithFixedPort(18342);
options.zooKeeperSessionTimeout = 100;
@@ -272,7 +272,7 @@ public class MasterElectionTest extends FleetControllerTest {
@Test
public void testZooKeeperUnavailable() throws Exception {
startingTest("MasterElectionTest::testZooKeeperUnavailable");
- FleetControllerOptions options = new FleetControllerOptions("mycluster");
+ FleetControllerOptions options = defaultOptions("mycluster");
options.zooKeeperSessionTimeout = 100;
options.masterZooKeeperCooldownPeriod = 100;
options.zooKeeperServerAddress = "localhost";
@@ -308,7 +308,7 @@ public class MasterElectionTest extends FleetControllerTest {
@Ignore
public void testMasterZooKeeperCooldown() throws Exception {
startingTest("MasterElectionTest::testMasterZooKeeperCooldown");
- FleetControllerOptions options = new FleetControllerOptions("mycluster");
+ FleetControllerOptions options = defaultOptions("mycluster");
options.masterZooKeeperCooldownPeriod = 3600 * 1000; // An hour
setUpFleetController(3, true, options);
waitForMaster(0);
@@ -349,7 +349,7 @@ public class MasterElectionTest extends FleetControllerTest {
@Ignore
public void testGetMaster() throws Exception {
startingTest("MasterElectionTest::testGetMaster");
- FleetControllerOptions options = new FleetControllerOptions("mycluster");
+ FleetControllerOptions options = defaultOptions("mycluster");
options.masterZooKeeperCooldownPeriod = 3600 * 1000; // An hour
setUpFleetController(3, true, options);
waitForMaster(0);
@@ -429,7 +429,7 @@ public class MasterElectionTest extends FleetControllerTest {
@Test
public void testReconfigure() throws Exception {
startingTest("MasterElectionTest::testReconfigure");
- FleetControllerOptions options = new FleetControllerOptions("mycluster");
+ FleetControllerOptions options = defaultOptions("mycluster");
options.masterZooKeeperCooldownPeriod = 1;
setUpFleetController(3, true, options);
waitForMaster(0);
@@ -454,7 +454,7 @@ public class MasterElectionTest extends FleetControllerTest {
@Test
public void cluster_state_version_written_to_zookeeper_even_with_empty_send_set() throws Exception {
startingTest("MasterElectionTest::cluster_state_version_written_to_zookeeper_even_with_empty_send_set");
- FleetControllerOptions options = new FleetControllerOptions("mycluster");
+ FleetControllerOptions options = defaultOptions("mycluster");
options.masterZooKeeperCooldownPeriod = 1;
options.minRatioOfDistributorNodesUp = 0;
options.minRatioOfStorageNodesUp = 0;
@@ -500,7 +500,7 @@ public class MasterElectionTest extends FleetControllerTest {
@Test
public void previously_published_state_is_taken_into_account_for_default_space_when_controller_bootstraps() throws Exception {
startingTest("MasterElectionTest::previously_published_state_is_taken_into_account_for_default_space_when_controller_bootstraps");
- FleetControllerOptions options = new FleetControllerOptions("mycluster");
+ FleetControllerOptions options = defaultOptions("mycluster");
options.clusterHasGlobalDocumentTypes = true;
options.masterZooKeeperCooldownPeriod = 1;
options.minTimeBeforeFirstSystemStateBroadcast = 100000;
@@ -543,7 +543,7 @@ public class MasterElectionTest extends FleetControllerTest {
@Test
public void default_space_nodes_not_marked_as_maintenance_when_cluster_has_no_global_document_types() throws Exception {
startingTest("MasterElectionTest::default_space_nodes_not_marked_as_maintenance_when_cluster_has_no_global_document_types");
- FleetControllerOptions options = new FleetControllerOptions("mycluster");
+ FleetControllerOptions options = defaultOptions("mycluster");
options.clusterHasGlobalDocumentTypes = false;
options.masterZooKeeperCooldownPeriod = 1;
options.minTimeBeforeFirstSystemStateBroadcast = 100000;
diff --git a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/NoZooKeeperTest.java b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/NoZooKeeperTest.java
index 22831a04527..dff33d7a6fc 100644
--- a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/NoZooKeeperTest.java
+++ b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/NoZooKeeperTest.java
@@ -9,7 +9,7 @@ public class NoZooKeeperTest extends FleetControllerTest {
@Test
public void testWantedStatesInZooKeeper() throws Exception {
startingTest("NoZooKeeperTest::testWantedStatesInZooKeeper");
- FleetControllerOptions options = new FleetControllerOptions("mycluster");
+ FleetControllerOptions options = defaultOptions("mycluster");
options.zooKeeperServerAddress = null;
setUpFleetController(true, options);
setUpVdsNodes(true, new DummyVdsNodeOptions());
diff --git a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/NodeSlobrokConfigurationMembershipTest.java b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/NodeSlobrokConfigurationMembershipTest.java
index a2b21f10741..9003e369b59 100644
--- a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/NodeSlobrokConfigurationMembershipTest.java
+++ b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/NodeSlobrokConfigurationMembershipTest.java
@@ -28,7 +28,7 @@ public class NodeSlobrokConfigurationMembershipTest extends FleetControllerTest
}
private FleetControllerOptions optionsForConfiguredNodes(Set<ConfiguredNode> configuredNodes) {
- FleetControllerOptions options = new FleetControllerOptions("mycluster", configuredNodes);
+ FleetControllerOptions options = defaultOptions("mycluster", configuredNodes);
options.maxSlobrokDisconnectGracePeriod = 60 * 1000;
options.nodeStateRequestTimeoutMS = 10000 * 60 * 1000;
options.maxTransitionTime = transitionTimes(0);
diff --git a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/RpcServerTest.java b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/RpcServerTest.java
index fadafdc7d32..214ccfa1c5b 100644
--- a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/RpcServerTest.java
+++ b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/RpcServerTest.java
@@ -85,7 +85,7 @@ public class RpcServerTest extends FleetControllerTest {
public void testFailOccasionallyAndIgnoreToSeeIfOtherTestsThenWork() {
try{
startingTest("RpcServerTest::testFailOccasionallyAndIgnoreToSeeIfOtherTestsThenWork");
- setUpFleetController(true, new FleetControllerOptions("mycluster"));
+ setUpFleetController(true, defaultOptions("mycluster"));
setUpVdsNodes(true, new DummyVdsNodeOptions());
waitForStableSystem();
} catch (Throwable t) {}
@@ -95,7 +95,7 @@ public class RpcServerTest extends FleetControllerTest {
public void testGetSystemState() throws Exception {
LogFormatter.initializeLogging();
startingTest("RpcServerTest::testGetSystemState");
- FleetControllerOptions options = new FleetControllerOptions("mycluster");
+ FleetControllerOptions options = defaultOptions("mycluster");
setUpFleetController(true, options);
setUpVdsNodes(true, new DummyVdsNodeOptions());
waitForStableSystem();
@@ -165,7 +165,7 @@ public class RpcServerTest extends FleetControllerTest {
Set<ConfiguredNode> configuredNodes = new TreeSet<>();
for (int i = 0; i < 10; i++)
configuredNodes.add(new ConfiguredNode(i, false));
- FleetControllerOptions options = new FleetControllerOptions("mycluster", configuredNodes);
+ FleetControllerOptions options = defaultOptions("mycluster", configuredNodes);
options.minRatioOfStorageNodesUp = 0;
options.maxInitProgressTime = 30000;
options.stableStateTimePeriod = 60000;
@@ -284,7 +284,7 @@ public class RpcServerTest extends FleetControllerTest {
for (int i = 0; i < 9; i++)
configuredNodes.add(new ConfiguredNode(i, false));
configuredNodes.add(new ConfiguredNode(9, true)); // Last node is configured retired
- FleetControllerOptions options = new FleetControllerOptions("mycluster", configuredNodes);
+ FleetControllerOptions options = defaultOptions("mycluster", configuredNodes);
options.minRatioOfStorageNodesUp = 0;
options.maxInitProgressTime = 30000;
options.stableStateTimePeriod = 60000;
@@ -319,7 +319,7 @@ public class RpcServerTest extends FleetControllerTest {
List<ConfiguredNode> configuredNodes = new ArrayList<>();
for (int i = 0; i < 5; i++)
configuredNodes.add(new ConfiguredNode(i, false));
- FleetControllerOptions options = new FleetControllerOptions("mycluster", configuredNodes);
+ FleetControllerOptions options = defaultOptions("mycluster", configuredNodes);
options.maxInitProgressTime = 30000;
options.stableStateTimePeriod = 60000;
setUpFleetController(true, options);
@@ -343,7 +343,7 @@ public class RpcServerTest extends FleetControllerTest {
configuredNodes.add(new ConfiguredNode(i, true));
configuredNodes.add(new ConfiguredNode(5, false));
configuredNodes.add(new ConfiguredNode(6, false));
- FleetControllerOptions options = new FleetControllerOptions("mycluster", configuredNodes);
+ FleetControllerOptions options = defaultOptions("mycluster", configuredNodes);
options.slobrokConnectionSpecs = this.options.slobrokConnectionSpecs;
this.options.maxInitProgressTime = 30000;
this.options.stableStateTimePeriod = 60000;
@@ -373,7 +373,7 @@ public class RpcServerTest extends FleetControllerTest {
Set<ConfiguredNode> configuredNodes = new TreeSet<>();
for (int i = 0; i < 7; i++)
configuredNodes.add(new ConfiguredNode(i, false));
- FleetControllerOptions options = new FleetControllerOptions("mycluster", configuredNodes);
+ FleetControllerOptions options = defaultOptions("mycluster", configuredNodes);
options.slobrokConnectionSpecs = this.options.slobrokConnectionSpecs;
this.options.maxInitProgressTime = 30000;
this.options.stableStateTimePeriod = 60000;
@@ -400,7 +400,7 @@ public class RpcServerTest extends FleetControllerTest {
List<ConfiguredNode> configuredNodes = new ArrayList<>();
for (int i = 0; i < 5; i++)
configuredNodes.add(new ConfiguredNode(i, false));
- FleetControllerOptions options = new FleetControllerOptions("mycluster", configuredNodes);
+ FleetControllerOptions options = defaultOptions("mycluster", configuredNodes);
options.maxInitProgressTime = 30000;
options.stableStateTimePeriod = 60000;
setUpFleetController(true, options);
@@ -412,7 +412,7 @@ public class RpcServerTest extends FleetControllerTest {
Set<ConfiguredNode> configuredNodes = new TreeSet<>();
for (int i = 0; i < 5; i++)
configuredNodes.add(new ConfiguredNode(i, false));
- FleetControllerOptions options = new FleetControllerOptions("mycluster", configuredNodes);
+ FleetControllerOptions options = defaultOptions("mycluster", configuredNodes);
options.slobrokConnectionSpecs = this.options.slobrokConnectionSpecs;
this.options.maxInitProgressTime = 30000;
this.options.stableStateTimePeriod = 60000;
@@ -427,7 +427,7 @@ public class RpcServerTest extends FleetControllerTest {
configuredNodes.add(new ConfiguredNode(i, true));
configuredNodes.add(new ConfiguredNode(5, false));
configuredNodes.add(new ConfiguredNode(6, false));
- FleetControllerOptions options = new FleetControllerOptions("mycluster", configuredNodes);
+ FleetControllerOptions options = defaultOptions("mycluster", configuredNodes);
options.slobrokConnectionSpecs = this.options.slobrokConnectionSpecs;
this.options.maxInitProgressTime = 30000;
this.options.stableStateTimePeriod = 60000;
@@ -441,7 +441,7 @@ public class RpcServerTest extends FleetControllerTest {
configuredNodes.add(new ConfiguredNode(i, true));
configuredNodes.add(new ConfiguredNode(5, false));
configuredNodes.add(new ConfiguredNode(6, false));
- FleetControllerOptions options = new FleetControllerOptions("mycluster", configuredNodes);
+ FleetControllerOptions options = defaultOptions("mycluster", configuredNodes);
options.slobrokConnectionSpecs = this.options.slobrokConnectionSpecs;
this.options.maxInitProgressTime = 30000;
this.options.stableStateTimePeriod = 60000;
@@ -489,7 +489,7 @@ public class RpcServerTest extends FleetControllerTest {
@Test
public void testSetNodeState() throws Exception {
startingTest("RpcServerTest::testSetNodeState");
- FleetControllerOptions options = new FleetControllerOptions("mycluster");
+ FleetControllerOptions options = defaultOptions("mycluster");
Set<Integer> nodeIndexes = new TreeSet<>(Arrays.asList(new Integer[]{4, 6, 9, 10, 14, 16, 21, 22, 23, 25}));
options.setStorageDistribution(new Distribution(getDistConfig(nodeIndexes)));
setUpFleetController(true, options);
@@ -535,7 +535,7 @@ public class RpcServerTest extends FleetControllerTest {
@Test
public void testSetNodeStateOutOfRange() throws Exception {
startingTest("RpcServerTest::testSetNodeStateOutOfRange");
- FleetControllerOptions options = new FleetControllerOptions("mycluster");
+ FleetControllerOptions options = defaultOptions("mycluster");
options.setStorageDistribution(new Distribution(Distribution.getDefaultDistributionConfig(2, 10)));
setUpFleetController(true, options);
setUpVdsNodes(true, new DummyVdsNodeOptions());
@@ -572,7 +572,7 @@ public class RpcServerTest extends FleetControllerTest {
@Test
public void testGetMaster() throws Exception {
startingTest("RpcServerTest::testGetMaster");
- FleetControllerOptions options = new FleetControllerOptions("mycluster");
+ FleetControllerOptions options = defaultOptions("mycluster");
options.setStorageDistribution(new Distribution(Distribution.getDefaultDistributionConfig(2, 10)));
setUpFleetController(true, options);
setUpVdsNodes(true, new DummyVdsNodeOptions());
@@ -594,7 +594,7 @@ public class RpcServerTest extends FleetControllerTest {
@Test
public void testGetNodeList() throws Exception {
startingTest("RpcServerTest::testGetNodeList");
- setUpFleetController(true, new FleetControllerOptions("mycluster"));
+ setUpFleetController(true, defaultOptions("mycluster"));
setUpVdsNodes(true, new DummyVdsNodeOptions());
waitForStableSystem();
diff --git a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/RpcVersionAutoDowngradeTest.java b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/RpcVersionAutoDowngradeTest.java
index d59dbb4933a..b76f1fc20bf 100644
--- a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/RpcVersionAutoDowngradeTest.java
+++ b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/RpcVersionAutoDowngradeTest.java
@@ -4,6 +4,7 @@ package com.yahoo.vespa.clustercontroller.core;
import com.yahoo.vdslib.distribution.ConfiguredNode;
import com.yahoo.vdslib.state.NodeState;
import com.yahoo.vdslib.state.State;
+import com.yahoo.vespa.clustercontroller.core.testutils.StateWaiter;
import org.junit.Test;
import java.util.ArrayList;
@@ -16,7 +17,7 @@ public class RpcVersionAutoDowngradeTest extends FleetControllerTest {
for (int i = 0 ; i < 10; i++) {
configuredNodes.add(new ConfiguredNode(i, false));
}
- FleetControllerOptions options = new FleetControllerOptions("mycluster", configuredNodes);
+ FleetControllerOptions options = defaultOptions("mycluster", configuredNodes);
setUpFleetController(false, options);
DummyVdsNodeOptions nodeOptions = new DummyVdsNodeOptions();
nodeOptions.stateCommunicationVersion = nodeRpcVersion;
@@ -29,8 +30,16 @@ public class RpcVersionAutoDowngradeTest extends FleetControllerTest {
@Test
public void cluster_state_rpc_version_is_auto_downgraded_and_retried_for_older_nodes() throws Exception {
- setUpFakeCluster(2); // HEAD is at v3
+ setUpFakeCluster(2); // HEAD is at v4
waitForState("version:\\d+ distributor:10 storage:10");
}
+ @Test
+ public void implicit_activation_for_nodes_that_return_not_found_for_version_activation_rpc() throws Exception {
+     final int rpcVersionOfSimulatedNodes = 3; // HEAD is at v4
+     setUpFakeCluster(rpcVersionOfSimulatedNodes);
+     waitForState("version:\\d+ distributor:10 storage:10");
+ }
+ // TODO partial version setup for simulating upgrades
+
}
diff --git a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/SlobrokTest.java b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/SlobrokTest.java
index 1de5848fc06..209da523705 100644
--- a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/SlobrokTest.java
+++ b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/SlobrokTest.java
@@ -33,7 +33,7 @@ public class SlobrokTest extends FleetControllerTest {
@Test
public void testSingleSlobrokRestart() throws Exception {
startingTest("SlobrokTest::testSingleSlobrokRestart");
- FleetControllerOptions options = new FleetControllerOptions("mycluster");
+ FleetControllerOptions options = defaultOptions("mycluster");
options.nodeStateRequestTimeoutMS = 60 * 60 * 1000;
options.maxSlobrokDisconnectGracePeriod = 60 * 60 * 1000;
setUpFleetController(true, options);
@@ -87,7 +87,7 @@ public class SlobrokTest extends FleetControllerTest {
@Test
public void testNodeTooLongOutOfSlobrok() throws Exception {
startingTest("SlobrokTest::testNodeTooLongOutOfSlobrok");
- FleetControllerOptions options = new FleetControllerOptions("mycluster");
+ FleetControllerOptions options = defaultOptions("mycluster");
options.maxSlobrokDisconnectGracePeriod = 60 * 1000;
options.nodeStateRequestTimeoutMS = 10000 * 60 * 1000;
setUpFleetController(true, options);
diff --git a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/StateChangeTest.java b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/StateChangeTest.java
index 32de3591f2d..ca246b3549f 100644
--- a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/StateChangeTest.java
+++ b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/StateChangeTest.java
@@ -95,7 +95,7 @@ public class StateChangeTest extends FleetControllerTest {
@Test
public void testNormalStartup() throws Exception {
- FleetControllerOptions options = new FleetControllerOptions("mycluster", createNodes(10));
+ FleetControllerOptions options = defaultOptions("mycluster", createNodes(10));
options.maxInitProgressTime = 50000;
initialize(options);
@@ -167,10 +167,13 @@ public class StateChangeTest extends FleetControllerTest {
@Test
public void testNodeGoingDownAndUp() throws Exception {
- FleetControllerOptions options = new FleetControllerOptions("mycluster", createNodes(10));
+ FleetControllerOptions options = defaultOptions("mycluster", createNodes(10));
options.nodeStateRequestTimeoutMS = 60 * 60 * 1000;
options.minTimeBetweenNewSystemStates = 0;
options.maxInitProgressTime = 50000;
+ // This test makes very specific assumptions about the amount of work done in a single tick.
+ // Two-phase cluster state activation changes this quite a bit, so disable it. At least for now.
+ options.enableTwoPhaseClusterStateActivation = false;
initialize(options);
@@ -254,7 +257,7 @@ public class StateChangeTest extends FleetControllerTest {
@Test
public void testNodeGoingDownAndUpNotifying() throws Exception {
// Same test as above, but node manages to notify why it is going down first.
- FleetControllerOptions options = new FleetControllerOptions("mycluster", createNodes(10));
+ FleetControllerOptions options = defaultOptions("mycluster", createNodes(10));
options.nodeStateRequestTimeoutMS = 60 * 60 * 1000;
options.maxSlobrokDisconnectGracePeriod = 100000;
@@ -325,7 +328,7 @@ public class StateChangeTest extends FleetControllerTest {
@Test
public void testNodeGoingDownAndUpFast() throws Exception {
- FleetControllerOptions options = new FleetControllerOptions("mycluster", createNodes(10));
+ FleetControllerOptions options = defaultOptions("mycluster", createNodes(10));
options.maxSlobrokDisconnectGracePeriod = 60 * 1000;
initialize(options);
@@ -366,7 +369,7 @@ public class StateChangeTest extends FleetControllerTest {
@Test
public void testMaintenanceWhileNormalStorageNodeRestart() throws Exception {
- FleetControllerOptions options = new FleetControllerOptions("mycluster", createNodes(10));
+ FleetControllerOptions options = defaultOptions("mycluster", createNodes(10));
options.maxSlobrokDisconnectGracePeriod = 60 * 1000;
initialize(options);
@@ -426,7 +429,7 @@ public class StateChangeTest extends FleetControllerTest {
nodes.add(new ConfiguredNode(i, retired));
}
- FleetControllerOptions options = new FleetControllerOptions("mycluster", nodes);
+ FleetControllerOptions options = defaultOptions("mycluster", nodes);
options.maxSlobrokDisconnectGracePeriod = 60 * 1000;
initialize(options);
@@ -485,7 +488,7 @@ public class StateChangeTest extends FleetControllerTest {
nodes.add(new ConfiguredNode(i, retired));
}
- FleetControllerOptions options = new FleetControllerOptions("mycluster", nodes);
+ FleetControllerOptions options = defaultOptions("mycluster", nodes);
options.maxSlobrokDisconnectGracePeriod = 60 * 1000;
initialize(options);
@@ -508,7 +511,7 @@ public class StateChangeTest extends FleetControllerTest {
@Test
public void testDownNodeInitializing() throws Exception {
// Actually report initializing state if node has been down steadily for a while
- FleetControllerOptions options = new FleetControllerOptions("mycluster", createNodes(10));
+ FleetControllerOptions options = defaultOptions("mycluster", createNodes(10));
options.maxTransitionTime.put(NodeType.STORAGE, 5000);
options.maxInitProgressTime = 5000;
options.stableStateTimePeriod = 20000;
@@ -571,7 +574,7 @@ public class StateChangeTest extends FleetControllerTest {
@Test
public void testNodeInitializationStalled() throws Exception {
// Node should eventually be marked down, and not become initializing next time, but stay down until up
- FleetControllerOptions options = new FleetControllerOptions("mycluster", createNodes(10));
+ FleetControllerOptions options = defaultOptions("mycluster", createNodes(10));
options.maxTransitionTime.put(NodeType.STORAGE, 5000);
options.maxInitProgressTime = 5000;
options.stableStateTimePeriod = 1000000;
@@ -657,7 +660,7 @@ public class StateChangeTest extends FleetControllerTest {
@Test
public void testBackwardsInitializationProgress() throws Exception {
// Same as stalled. Mark down, keep down until up
- FleetControllerOptions options = new FleetControllerOptions("mycluster", createNodes(10));
+ FleetControllerOptions options = defaultOptions("mycluster", createNodes(10));
options.maxTransitionTime.put(NodeType.STORAGE, 5000);
options.maxInitProgressTime = 5000;
options.stableStateTimePeriod = 1000000;
@@ -700,7 +703,7 @@ public class StateChangeTest extends FleetControllerTest {
@Test
public void testNodeGoingDownWhileInitializing() throws Exception {
// Same as stalled. Mark down, keep down until up
- FleetControllerOptions options = new FleetControllerOptions("mycluster", createNodes(10));
+ FleetControllerOptions options = defaultOptions("mycluster", createNodes(10));
options.maxTransitionTime.put(NodeType.STORAGE, 5000);
options.maxInitProgressTime = 5000;
options.stableStateTimePeriod = 1000000;
@@ -759,7 +762,7 @@ public class StateChangeTest extends FleetControllerTest {
public void testContinuousCrashRightAfterInit() throws Exception {
startingTest("StateChangeTest::testContinuousCrashRightAfterInit");
// If node does this too many times, take it out of service
- FleetControllerOptions options = new FleetControllerOptions("mycluster", createNodes(10));
+ FleetControllerOptions options = defaultOptions("mycluster", createNodes(10));
options.maxTransitionTime.put(NodeType.STORAGE, 5000);
options.maxInitProgressTime = 5000;
options.maxPrematureCrashes = 2;
@@ -813,7 +816,7 @@ public class StateChangeTest extends FleetControllerTest {
public void testClusterStateMinNodes() throws Exception {
startingTest("StateChangeTest::testClusterStateMinNodes");
// If node does this too many times, take it out of service
- FleetControllerOptions options = new FleetControllerOptions("mycluster", createNodes(10));
+ FleetControllerOptions options = defaultOptions("mycluster", createNodes(10));
options.maxTransitionTime.put(NodeType.STORAGE, 0);
options.maxInitProgressTime = 0;
options.minDistributorNodesUp = 6;
@@ -868,7 +871,7 @@ public class StateChangeTest extends FleetControllerTest {
public void testClusterStateMinFactor() throws Exception {
startingTest("StateChangeTest::testClusterStateMinFactor");
// If node does this too many times, take it out of service
- FleetControllerOptions options = new FleetControllerOptions("mycluster", createNodes(10));
+ FleetControllerOptions options = defaultOptions("mycluster", createNodes(10));
options.maxTransitionTime.put(NodeType.STORAGE, 0);
options.maxInitProgressTime = 0;
options.minDistributorNodesUp = 0;
@@ -941,7 +944,7 @@ public class StateChangeTest extends FleetControllerTest {
@Test
public void testNoSystemStateBeforeInitialTimePeriod() throws Exception {
startingTest("StateChangeTest::testNoSystemStateBeforeInitialTimePeriod()");
- FleetControllerOptions options = new FleetControllerOptions("mycluster", createNodes(10));
+ FleetControllerOptions options = defaultOptions("mycluster", createNodes(10));
options.minTimeBeforeFirstSystemStateBroadcast = 3 * 60 * 1000;
setUpSystem(true, options);
setUpVdsNodes(true, new DummyVdsNodeOptions(), true);
@@ -982,7 +985,7 @@ public class StateChangeTest extends FleetControllerTest {
@Test
public void testSystemStateSentWhenNodesReplied() throws Exception {
startingTest("StateChangeTest::testSystemStateSentWhenNodesReplied()");
- final FleetControllerOptions options = new FleetControllerOptions("mycluster", createNodes(10));
+ final FleetControllerOptions options = defaultOptions("mycluster", createNodes(10));
options.minTimeBeforeFirstSystemStateBroadcast = 300 * 60 * 1000;
setUpSystem(true, options);
@@ -1016,7 +1019,7 @@ public class StateChangeTest extends FleetControllerTest {
@Test
public void testDontTagFailingSetSystemStateOk() throws Exception {
startingTest("StateChangeTest::testDontTagFailingSetSystemStateOk()");
- FleetControllerOptions options = new FleetControllerOptions("mycluster", createNodes(10));
+ FleetControllerOptions options = defaultOptions("mycluster", createNodes(10));
setUpFleetController(true, options);
setUpVdsNodes(true, new DummyVdsNodeOptions());
waitForStableSystem();
@@ -1035,7 +1038,7 @@ public class StateChangeTest extends FleetControllerTest {
// Assert that the failed node has not acknowledged the latest version.
// (The version may still be larger than versionBeforeChange if the fleet controller sends a
// "stable system" update without timestamps in the meantime
- assertTrue(fleetController.getCluster().getNodeInfo(nodes.get(1).getNode()).getSystemStateVersionAcknowledged() < versionAfterChange);
+ assertTrue(fleetController.getCluster().getNodeInfo(nodes.get(1).getNode()).getClusterStateVersionBundleAcknowledged() < versionAfterChange);
// Ensure non-concurrent access to getNewestSystemStateVersionSent
synchronized(timer) {
@@ -1047,7 +1050,7 @@ public class StateChangeTest extends FleetControllerTest {
@Test
public void testAlteringDistributionSplitCount() throws Exception {
startingTest("StateChangeTest::testAlteringDistributionSplitCount");
- FleetControllerOptions options = new FleetControllerOptions("mycluster", createNodes(10));
+ FleetControllerOptions options = defaultOptions("mycluster", createNodes(10));
options.distributionBits = 17;
initialize(options);
@@ -1094,7 +1097,7 @@ public class StateChangeTest extends FleetControllerTest {
@Test
public void testSetAllTimestampsAfterDowntime() throws Exception {
startingTest("StateChangeTest::testSetAllTimestampsAfterDowntime");
- FleetControllerOptions options = new FleetControllerOptions("mycluster", createNodes(10));
+ FleetControllerOptions options = defaultOptions("mycluster", createNodes(10));
setUpFleetController(true, options);
setUpVdsNodes(true, new DummyVdsNodeOptions());
waitForStableSystem();
@@ -1143,7 +1146,7 @@ public class StateChangeTest extends FleetControllerTest {
@Test
public void consolidated_cluster_state_reflects_node_changes_when_cluster_is_down() throws Exception {
- FleetControllerOptions options = new FleetControllerOptions("mycluster", createNodes(10));
+ FleetControllerOptions options = defaultOptions("mycluster", createNodes(10));
options.maxTransitionTime.put(NodeType.STORAGE, 0);
options.minStorageNodesUp = 10;
options.minDistributorNodesUp = 10;
@@ -1177,7 +1180,7 @@ public class StateChangeTest extends FleetControllerTest {
// of previous timer invocations (with subsequent state generation) would not be visible.
@Test
public void timer_events_during_cluster_down_observe_most_recent_node_changes() throws Exception {
- FleetControllerOptions options = new FleetControllerOptions("mycluster", createNodes(10));
+ FleetControllerOptions options = defaultOptions("mycluster", createNodes(10));
options.maxTransitionTime.put(NodeType.STORAGE, 1000);
options.minStorageNodesUp = 10;
options.minDistributorNodesUp = 10;
@@ -1209,7 +1212,7 @@ public class StateChangeTest extends FleetControllerTest {
@Test
public void do_not_emit_multiple_events_when_node_state_does_not_match_versioned_state() throws Exception {
- FleetControllerOptions options = new FleetControllerOptions("mycluster", createNodes(10));
+ FleetControllerOptions options = defaultOptions("mycluster", createNodes(10));
initialize(options);
ctrl.tick();
@@ -1343,11 +1346,19 @@ public class StateChangeTest extends FleetControllerTest {
void sendAllDeferredDistributorClusterStateAcks() throws Exception {
communicator.sendAllDeferredDistributorClusterStateAcks();
- ctrl.tick();
+ ctrl.tick(); // Process cluster state bundle ACKs
+ if (ctrl.getOptions().enableTwoPhaseClusterStateActivation) {
+ ctrl.tick(); // Send activations
+ ctrl.tick(); // Process activation ACKs
+ }
}
void processScheduledTask() throws Exception {
ctrl.tick(); // Cluster state recompute iteration and send
+ if (ctrl.getOptions().enableTwoPhaseClusterStateActivation) {
+ ctrl.tick(); // Send activations
+ ctrl.tick(); // Process activation ACKs
+ }
ctrl.tick(); // Iff ACKs were received, process version dependent task(s)
}
@@ -1373,11 +1384,11 @@ public class StateChangeTest extends FleetControllerTest {
}
private static FleetControllerOptions defaultOptions() {
- return new FleetControllerOptions("mycluster", createNodes(10));
+ return defaultOptions("mycluster", createNodes(10));
}
private static FleetControllerOptions optionsWithZeroTransitionTime() {
- FleetControllerOptions options = new FleetControllerOptions("mycluster", createNodes(10));
+ FleetControllerOptions options = defaultOptions("mycluster", createNodes(10));
options.maxTransitionTime.put(NodeType.STORAGE, 0);
return options;
}
@@ -1440,7 +1451,7 @@ public class StateChangeTest extends FleetControllerTest {
@Test
public void no_op_synchronous_remote_task_waits_until_current_state_is_acked() throws Exception {
- RemoteTaskFixture fixture = createFixtureWith(optionsWithZeroTransitionTime());
+ RemoteTaskFixture fixture = createFixtureWith(optionsWithZeroTransitionTime());
communicator.setShouldDeferDistributorClusterStateAcks(true);
fixture.markStorageNodeDown(0);
@@ -1524,6 +1535,7 @@ public class StateChangeTest extends FleetControllerTest {
FleetControllerOptions options = defaultOptions();
options.minTimeBetweenNewSystemStates = 10_000;
RemoteTaskFixture fixture = createFixtureWith(options);
+
// Have to increment timer here to be able to send state generated by the scheduled task
timer.advanceTime(10_000);
diff --git a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/StateGatherTest.java b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/StateGatherTest.java
index f208003e46e..007bda04f4e 100644
--- a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/StateGatherTest.java
+++ b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/StateGatherTest.java
@@ -27,7 +27,7 @@ public class StateGatherTest extends FleetControllerTest {
public void testAlwaysHavePendingGetNodeStateRequestTowardsNodes() throws Exception {
Logger.getLogger(NodeStateGatherer.class.getName()).setLevel(LogLevel.SPAM);
startingTest("StateGatherTest::testOverlappingGetNodeStateRequests");
- FleetControllerOptions options = new FleetControllerOptions("mycluster");
+ FleetControllerOptions options = defaultOptions("mycluster");
options.nodeStateRequestTimeoutMS = 10 * 60 * 1000;
// Force actual message timeout to be lower than request timeout.
options.nodeStateRequestTimeoutEarliestPercentage = 80;
diff --git a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/StatusPagesTest.java b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/StatusPagesTest.java
index 45f11a2a0f3..fc18a1b0b8b 100644
--- a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/StatusPagesTest.java
+++ b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/StatusPagesTest.java
@@ -78,7 +78,7 @@ public class StatusPagesTest extends FleetControllerTest {
@Test
public void testStatusThroughContainer() throws Exception {
startingTest("StatusPagesTest::testStatusThroughContainer()");
- FleetControllerOptions options = new FleetControllerOptions("mycluster");
+ FleetControllerOptions options = defaultOptions("mycluster");
options.setStorageDistribution(new Distribution(Distribution.getDefaultDistributionConfig(3, 10)));
final StatusHandler.ContainerStatusPageServer statusServer = new StatusHandler.ContainerStatusPageServer();
setUpFleetController(true, options, true, statusServer);
@@ -176,7 +176,7 @@ public class StatusPagesTest extends FleetControllerTest {
// Set this to true temporary if you want to check status page from browser. Should be false in checked in code always.
boolean haltTestToViewStatusPage = false;
startingTest("StatusPagesTest::testSimpleConnectionWithSomeContent()");
- FleetControllerOptions options = new FleetControllerOptions("mycluster");
+ FleetControllerOptions options = defaultOptions("mycluster");
options.setStorageDistribution(new Distribution(Distribution.getDefaultDistributionConfig(3, 10)));
//options.minRatioOfStorageNodesUp = 0.99;
if (haltTestToViewStatusPage) {
@@ -211,7 +211,7 @@ public class StatusPagesTest extends FleetControllerTest {
@Test
public void testNodePage() throws Exception {
startingTest("StatusPagesTest::testNodePage()");
- FleetControllerOptions options = new FleetControllerOptions("mycluster");
+ FleetControllerOptions options = defaultOptions("mycluster");
options.setStorageDistribution(new Distribution(Distribution.getDefaultDistributionConfig(3, 10)));
setUpFleetController(true, options);
setUpVdsNodes(true, new DummyVdsNodeOptions());
@@ -231,7 +231,7 @@ public class StatusPagesTest extends FleetControllerTest {
@Test
public void testErrorResponseCode() throws Exception {
startingTest("StatusPagesTest::testNodePage()");
- FleetControllerOptions options = new FleetControllerOptions("mycluster");
+ FleetControllerOptions options = defaultOptions("mycluster");
options.setStorageDistribution(new Distribution(Distribution.getDefaultDistributionConfig(3, 10)));
setUpFleetController(true, options);
setUpVdsNodes(true, new DummyVdsNodeOptions());
@@ -348,7 +348,7 @@ public class StatusPagesTest extends FleetControllerTest {
@Test
public void testStateServing() throws Exception {
startingTest("StatusPagesTest::testStateServing()");
- FleetControllerOptions options = new FleetControllerOptions("mycluster");
+ FleetControllerOptions options = defaultOptions("mycluster");
setUpFleetController(true, options);
fleetController.updateOptions(options, 5);
waitForCompleteCycle();
@@ -376,7 +376,7 @@ public class StatusPagesTest extends FleetControllerTest {
@Test
public void testClusterStateServing() throws Exception {
startingTest("StatusPagesTest::testClusterStateServing()");
- FleetControllerOptions options = new FleetControllerOptions("mycluster");
+ FleetControllerOptions options = defaultOptions("mycluster");
setUpFleetController(true, options);
fleetController.updateOptions(options, 5);
waitForCompleteCycle();
diff --git a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/SystemStateBroadcasterTest.java b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/SystemStateBroadcasterTest.java
index aa5219147ce..f99df6a25b2 100644
--- a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/SystemStateBroadcasterTest.java
+++ b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/SystemStateBroadcasterTest.java
@@ -6,12 +6,17 @@ import com.yahoo.vespa.clustercontroller.core.database.DatabaseHandler;
import com.yahoo.vespa.clustercontroller.core.listeners.NodeAddedOrRemovedListener;
import com.yahoo.vespa.clustercontroller.core.listeners.NodeStateOrHostInfoChangeHandler;
import org.junit.Test;
+import org.mockito.ArgumentCaptor;
import java.util.stream.Stream;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotEquals;
+import static org.junit.Assert.assertNull;
import static org.mockito.Matchers.any;
import static org.mockito.Matchers.eq;
import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.times;
import static org.mockito.Mockito.verify;
public class SystemStateBroadcasterTest {
@@ -21,6 +26,8 @@ public class SystemStateBroadcasterTest {
final Object monitor = new Object();
SystemStateBroadcaster broadcaster = new SystemStateBroadcaster(timer, monitor);
Communicator mockCommunicator = mock(Communicator.class);
+ DatabaseHandler mockDatabaseHandler = mock(DatabaseHandler.class);
+ FleetController mockFleetController = mock(FleetController.class);
void simulateNodePartitionedAwaySilently(ClusterFixture cf) {
cf.cluster().getNodeInfo(Node.ofStorage(0)).setStartTimestamp(600);
@@ -32,6 +39,18 @@ public class SystemStateBroadcasterTest {
cf.cluster().getNodeInfo(Node.ofDistributor(0)).setReportedState(new NodeState(NodeType.DISTRIBUTOR, State.DOWN).setStartTimestamp(500), 2000);
cf.cluster().getNodeInfo(Node.ofDistributor(0)).setReportedState(new NodeState(NodeType.DISTRIBUTOR, State.UP).setStartTimestamp(500), 3000);
}
+
+ void simulateBroadcastTick(ClusterFixture cf) {
+ broadcaster.processResponses();
+ broadcaster.broadcastNewStateBundleIfRequired(dbContextFrom(cf.cluster()), mockCommunicator);
+ try {
+ broadcaster.checkIfClusterStateIsAckedByAllDistributors(
+ mockDatabaseHandler, dbContextFrom(cf.cluster()), mockFleetController);
+ } catch (Exception e) {
+ throw new RuntimeException(e);
+ }
+ broadcaster.broadcastStateActivationsIfRequired(dbContextFrom(cf.cluster()), mockCommunicator);
+ }
}
private static DatabaseHandler.Context dbContextFrom(ContentCluster cluster) {
@@ -68,7 +87,7 @@ public class SystemStateBroadcasterTest {
ClusterStateBundle stateBundle = ClusterStateBundleUtil.makeBundle("distributor:2 storage:2");
ClusterFixture cf = ClusterFixture.forFlatCluster(2).bringEntireClusterUp().assignDummyRpcAddresses();
f.broadcaster.handleNewClusterStates(stateBundle);
- f.broadcaster.broadcastNewState(dbContextFrom(cf.cluster()), f.mockCommunicator);
+ f.broadcaster.broadcastNewStateBundleIfRequired(dbContextFrom(cf.cluster()), f.mockCommunicator);
cf.cluster().getNodeInfo().forEach(nodeInfo -> verify(f.mockCommunicator).setSystemState(eq(stateBundle), eq(nodeInfo), any()));
}
@@ -79,7 +98,7 @@ public class SystemStateBroadcasterTest {
ClusterFixture cf = ClusterFixture.forFlatCluster(2).bringEntireClusterUp().assignDummyRpcAddresses();
f.simulateNodePartitionedAwaySilently(cf);
f.broadcaster.handleNewClusterStates(stateBundle);
- f.broadcaster.broadcastNewState(dbContextFrom(cf.cluster()), f.mockCommunicator);
+ f.broadcaster.broadcastNewStateBundleIfRequired(dbContextFrom(cf.cluster()), f.mockCommunicator);
clusterNodeInfos(cf.cluster(), Node.ofDistributor(1), Node.ofStorage(0), Node.ofStorage(1)).forEach(nodeInfo -> {
// Only distributor 0 should observe startup timestamps
@@ -97,7 +116,7 @@ public class SystemStateBroadcasterTest {
StateMapping.of("upsidedown", "distributor:2 .0.s:d storage:2"));
ClusterFixture cf = ClusterFixture.forFlatCluster(2).bringEntireClusterUp().assignDummyRpcAddresses();
f.broadcaster.handleNewClusterStates(stateBundle);
- f.broadcaster.broadcastNewState(dbContextFrom(cf.cluster()), f.mockCommunicator);
+ f.broadcaster.broadcastNewStateBundleIfRequired(dbContextFrom(cf.cluster()), f.mockCommunicator);
cf.cluster().getNodeInfo().forEach(nodeInfo -> verify(f.mockCommunicator).setSystemState(eq(stateBundle), eq(nodeInfo), any()));
}
@@ -111,7 +130,7 @@ public class SystemStateBroadcasterTest {
ClusterFixture cf = ClusterFixture.forFlatCluster(2).bringEntireClusterUp().assignDummyRpcAddresses();
f.simulateNodePartitionedAwaySilently(cf);
f.broadcaster.handleNewClusterStates(stateBundle);
- f.broadcaster.broadcastNewState(dbContextFrom(cf.cluster()), f.mockCommunicator);
+ f.broadcaster.broadcastNewStateBundleIfRequired(dbContextFrom(cf.cluster()), f.mockCommunicator);
clusterNodeInfos(cf.cluster(), Node.ofDistributor(1), Node.ofStorage(0), Node.ofStorage(1)).forEach(nodeInfo -> {
// Only distributor 0 should observe startup timestamps
@@ -122,4 +141,200 @@ public class SystemStateBroadcasterTest {
StateMapping.of("upsidedown", "distributor:2 .0.s:d storage:2 .0.t:600 .1.t:700"));
verify(f.mockCommunicator).setSystemState(eq(expectedDistr0Bundle), eq(cf.cluster().getNodeInfo(Node.ofDistributor(0))), any());
}
+
+ private static class MockSetClusterStateRequest extends SetClusterStateRequest {
+ public MockSetClusterStateRequest(NodeInfo nodeInfo, int clusterStateVersion) {
+ super(nodeInfo, clusterStateVersion);
+ }
+ }
+
+ private static class MockActivateClusterStateVersionRequest extends ActivateClusterStateVersionRequest {
+ public MockActivateClusterStateVersionRequest(NodeInfo nodeInfo, int systemStateVersion) {
+ super(nodeInfo, systemStateVersion);
+ }
+ }
+
+ private static void respondToSetClusterStateBundle(NodeInfo nodeInfo,
+ ClusterStateBundle stateBundle,
+ Communicator.Waiter<SetClusterStateRequest> waiter) {
+ // Have to patch in that we've actually sent the bundle in the first place...
+ nodeInfo.setClusterStateVersionBundleSent(stateBundle);
+
+ var req = new MockSetClusterStateRequest(nodeInfo, stateBundle.getVersion());
+ req.setReply(new ClusterStateVersionSpecificRequest.Reply());
+ waiter.done(req);
+ }
+
+ private static void respondToActivateClusterStateVersion(NodeInfo nodeInfo,
+ ClusterStateBundle stateBundle,
+ int actualVersion,
+ Communicator.Waiter<ActivateClusterStateVersionRequest> waiter) {
+ // Have to patch in that we've actually sent the bundle in the first place...
+ nodeInfo.setClusterStateVersionActivationSent(stateBundle.getVersion());
+
+ var req = new MockActivateClusterStateVersionRequest(nodeInfo, stateBundle.getVersion());
+ req.setReply(ClusterStateVersionSpecificRequest.Reply.withActualVersion(actualVersion));
+ waiter.done(req);
+ }
+
+ private static void respondToActivateClusterStateVersion(NodeInfo nodeInfo,
+ ClusterStateBundle stateBundle,
+ Communicator.Waiter<ActivateClusterStateVersionRequest> waiter) {
+ respondToActivateClusterStateVersion(nodeInfo, stateBundle, stateBundle.getVersion(), waiter);
+ }
+
+ private static class StateActivationFixture extends Fixture {
+ ClusterStateBundle stateBundle;
+ ClusterFixture cf;
+
+ @SuppressWarnings("rawtypes") // Java generics <3
+ final ArgumentCaptor<Communicator.Waiter> d0Waiter;
+ @SuppressWarnings("rawtypes")
+ final ArgumentCaptor<Communicator.Waiter> d1Waiter;
+
+ private StateActivationFixture(boolean enableDeferred) {
+ super();
+ stateBundle = ClusterStateBundleUtil
+ .makeBundleBuilder("version:123 distributor:2 storage:2")
+ .deferredActivation(enableDeferred)
+ .deriveAndBuild();
+ cf = ClusterFixture.forFlatCluster(2).bringEntireClusterUp().assignDummyRpcAddresses();
+ broadcaster.handleNewClusterStates(stateBundle);
+ broadcaster.broadcastNewStateBundleIfRequired(dbContextFrom(cf.cluster()), mockCommunicator);
+
+ d0Waiter = ArgumentCaptor.forClass(Communicator.Waiter.class);
+ d1Waiter = ArgumentCaptor.forClass(Communicator.Waiter.class);
+ }
+
+ @SuppressWarnings("unchecked") // Type erasure of Waiter in mocked argument capture
+ void expectSetSystemStateInvocationsToBothDistributors() {
+ clusterNodeInfos(cf.cluster(), Node.ofDistributor(0), Node.ofDistributor(1)).forEach(nodeInfo -> {
+ verify(mockCommunicator).setSystemState(eq(stateBundle), eq(nodeInfo),
+ (nodeInfo.getNodeIndex() == 0 ? d0Waiter : d1Waiter).capture());
+ });
+ }
+
+ @SuppressWarnings("unchecked") // Type erasure of Waiter in mocked argument capture
+ void ackStateBundleFromBothDistributors() {
+ expectSetSystemStateInvocationsToBothDistributors();
+ simulateBroadcastTick(cf);
+
+ respondToSetClusterStateBundle(cf.cluster.getNodeInfo(Node.ofDistributor(0)), stateBundle, d0Waiter.getValue());
+ respondToSetClusterStateBundle(cf.cluster.getNodeInfo(Node.ofDistributor(1)), stateBundle, d1Waiter.getValue());
+ simulateBroadcastTick(cf);
+ }
+
+ static StateActivationFixture withTwoPhaseEnabled() {
+ return new StateActivationFixture(true);
+ }
+
+ static StateActivationFixture withTwoPhaseDisabled() {
+ return new StateActivationFixture(false);
+ }
+ }
+
+ @Test
+ @SuppressWarnings("unchecked") // Type erasure of Waiter in mocked argument capture
+ public void activation_not_sent_before_all_distributors_have_acked_state_bundle() {
+ var f = StateActivationFixture.withTwoPhaseEnabled();
+ var cf = f.cf;
+
+ f.expectSetSystemStateInvocationsToBothDistributors();
+ f.simulateBroadcastTick(cf);
+
+ // Respond from distributor 0, but not yet from distributor 1
+ respondToSetClusterStateBundle(cf.cluster.getNodeInfo(Node.ofDistributor(0)), f.stateBundle, f.d0Waiter.getValue());
+ f.simulateBroadcastTick(cf);
+
+ // No activations should be sent yet
+ cf.cluster().getNodeInfo().forEach(nodeInfo -> {
+ verify(f.mockCommunicator, times(0)).activateClusterStateVersion(eq(123), eq(nodeInfo), any());
+ });
+ assertNull(f.broadcaster.getLastClusterStateBundleConverged());
+
+ respondToSetClusterStateBundle(cf.cluster.getNodeInfo(Node.ofDistributor(1)), f.stateBundle, f.d1Waiter.getValue());
+ f.simulateBroadcastTick(cf);
+
+ // Activation should now be sent to _all_ nodes (distributor and storage)
+ cf.cluster().getNodeInfo().forEach(nodeInfo -> {
+ verify(f.mockCommunicator).activateClusterStateVersion(eq(123), eq(nodeInfo), any());
+ });
+ // But not converged yet, as activations have not been ACKed
+ assertNull(f.broadcaster.getLastClusterStateBundleConverged());
+ }
+
+ @Test
+ @SuppressWarnings("unchecked") // Type erasure of Waiter in mocked argument capture
+ public void state_bundle_not_considered_converged_until_activation_acked_by_all_distributors() {
+ var f = StateActivationFixture.withTwoPhaseEnabled();
+ var cf = f.cf;
+
+ f.ackStateBundleFromBothDistributors();
+
+ final var d0ActivateWaiter = ArgumentCaptor.forClass(Communicator.Waiter.class);
+ final var d1ActivateWaiter = ArgumentCaptor.forClass(Communicator.Waiter.class);
+
+ clusterNodeInfos(cf.cluster(), Node.ofDistributor(0), Node.ofDistributor(1)).forEach(nodeInfo -> {
+ verify(f.mockCommunicator).activateClusterStateVersion(eq(123), eq(nodeInfo),
+ (nodeInfo.getNodeIndex() == 0 ? d0ActivateWaiter : d1ActivateWaiter).capture());
+ });
+
+ respondToActivateClusterStateVersion(cf.cluster.getNodeInfo(Node.ofDistributor(0)),
+ f.stateBundle, d0ActivateWaiter.getValue());
+ f.simulateBroadcastTick(cf);
+
+ assertNull(f.broadcaster.getLastClusterStateBundleConverged()); // Not yet converged
+
+ respondToActivateClusterStateVersion(cf.cluster.getNodeInfo(Node.ofDistributor(1)),
+ f.stateBundle, d1ActivateWaiter.getValue());
+ f.simulateBroadcastTick(cf);
+
+ // Finally, all distributors have ACKed the version! State is marked as converged.
+ assertEquals(f.stateBundle, f.broadcaster.getLastClusterStateBundleConverged());
+ }
+
+ @Test
+ @SuppressWarnings("unchecked") // Type erasure of Waiter in mocked argument capture
+ public void activation_not_sent_if_deferred_activation_is_disabled_in_state_bundle() {
+ var f = StateActivationFixture.withTwoPhaseDisabled();
+ var cf = f.cf;
+
+ f.ackStateBundleFromBothDistributors();
+
+ // At this point the cluster state shall be considered converged.
+ assertEquals(f.stateBundle, f.broadcaster.getLastClusterStateBundleConverged());
+
+ // No activations shall have been sent.
+ clusterNodeInfos(cf.cluster(), Node.ofDistributor(0), Node.ofDistributor(1)).forEach(nodeInfo -> {
+ verify(f.mockCommunicator, times(0)).activateClusterStateVersion(eq(123), eq(nodeInfo), any());
+ });
+ }
+
+ @Test
+ @SuppressWarnings("unchecked") // Type erasure of Waiter in mocked argument capture
+ public void activation_convergence_considers_actual_version_returned_from_node() {
+ var f = StateActivationFixture.withTwoPhaseEnabled();
+ var cf = f.cf;
+
+ f.ackStateBundleFromBothDistributors();
+
+ final var d0ActivateWaiter = ArgumentCaptor.forClass(Communicator.Waiter.class);
+ final var d1ActivateWaiter = ArgumentCaptor.forClass(Communicator.Waiter.class);
+
+ clusterNodeInfos(cf.cluster(), Node.ofDistributor(0), Node.ofDistributor(1)).forEach(nodeInfo -> {
+ verify(f.mockCommunicator).activateClusterStateVersion(eq(123), eq(nodeInfo),
+ (nodeInfo.getNodeIndex() == 0 ? d0ActivateWaiter : d1ActivateWaiter).capture());
+ });
+
+ respondToActivateClusterStateVersion(cf.cluster.getNodeInfo(Node.ofDistributor(0)),
+ f.stateBundle, d0ActivateWaiter.getValue());
+ // Distributor 1 reports higher actual version, should not cause this version to be
+ // considered converged since it's not an exact version match.
+ respondToActivateClusterStateVersion(cf.cluster.getNodeInfo(Node.ofDistributor(1)),
+ f.stateBundle, 124, d1ActivateWaiter.getValue());
+ f.simulateBroadcastTick(cf);
+
+ assertNull(f.broadcaster.getLastClusterStateBundleConverged());
+ }
+
}
diff --git a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/WantedStateTest.java b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/WantedStateTest.java
index 6e5b40aa7d9..bc7ee6adee1 100644
--- a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/WantedStateTest.java
+++ b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/WantedStateTest.java
@@ -10,7 +10,7 @@ public class WantedStateTest extends FleetControllerTest {
@Test
public void testSettingStorageNodeMaintenanceAndBack() throws Exception {
startingTest("WantedStateTest::testSettingStorageNodeMaintenanceAndBack()");
- setUpFleetController(true, new FleetControllerOptions("mycluster"));
+ setUpFleetController(true, defaultOptions("mycluster"));
setUpVdsNodes(true, new DummyVdsNodeOptions());
waitForStableSystem();
@@ -24,7 +24,7 @@ public class WantedStateTest extends FleetControllerTest {
@Test
public void testOverridingWantedStateOtherReason() throws Exception {
startingTest("WantedStateTest::testOverridingWantedStateOtherReason()");
- setUpFleetController(true, new FleetControllerOptions("mycluster"));
+ setUpFleetController(true, defaultOptions("mycluster"));
setUpVdsNodes(true, new DummyVdsNodeOptions());
waitForStableSystem();
diff --git a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/rpc/RPCCommunicatorTest.java b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/rpc/RPCCommunicatorTest.java
index 7602f0c83a2..9eb98f4f045 100644
--- a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/rpc/RPCCommunicatorTest.java
+++ b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/rpc/RPCCommunicatorTest.java
@@ -99,7 +99,7 @@ public class RPCCommunicatorTest {
(RequestWaiter)any());
}
- private static class Fixture {
+ private static class Fixture<RequestType> {
final Supervisor mockSupervisor = mock(Supervisor.class);
final Target mockTarget = mock(Target.class);
final Timer timer = new FakeTimer();
@@ -107,7 +107,7 @@ public class RPCCommunicatorTest {
final AtomicReference<Request> receivedRequest = new AtomicReference<>();
final AtomicReference<RequestWaiter> receivedWaiter = new AtomicReference<>();
@SuppressWarnings("unchecked") // Cannot mock with "compiler-obvious" type safety for generics
- final Communicator.Waiter<SetClusterStateRequest> mockWaiter = mock(Communicator.Waiter.class);
+ final Communicator.Waiter<RequestType> mockWaiter = mock(Communicator.Waiter.class);
Fixture() {
communicator = new RPCCommunicator(
@@ -131,9 +131,9 @@ public class RPCCommunicatorTest {
@Test
public void setSystemState_v3_sends_distribution_states_rpc() {
- Fixture f = new Fixture();
- ClusterFixture cf = ClusterFixture.forFlatCluster(3).bringEntireClusterUp().assignDummyRpcAddresses();
- ClusterStateBundle sentBundle = ClusterStateBundleUtil.makeBundle("distributor:3 storage:3");
+ var f = new Fixture<SetClusterStateRequest>();
+ var cf = ClusterFixture.forFlatCluster(3).bringEntireClusterUp().assignDummyRpcAddresses();
+ var sentBundle = ClusterStateBundleUtil.makeBundle("distributor:3 storage:3");
f.communicator.setSystemState(sentBundle, cf.cluster().getNodeInfo(Node.ofStorage(1)), f.mockWaiter);
Request req = f.receivedRequest.get();
@@ -147,9 +147,9 @@ public class RPCCommunicatorTest {
@Test
public void set_distribution_states_v3_rpc_auto_downgrades_to_v2_on_unknown_method_error() {
- Fixture f = new Fixture();
- ClusterFixture cf = ClusterFixture.forFlatCluster(3).bringEntireClusterUp().assignDummyRpcAddresses();
- ClusterStateBundle sentBundle = ClusterStateBundleUtil.makeBundle("version:123 distributor:3 storage:3");
+ var f = new Fixture<SetClusterStateRequest>();
+ var cf = ClusterFixture.forFlatCluster(3).bringEntireClusterUp().assignDummyRpcAddresses();
+ var sentBundle = ClusterStateBundleUtil.makeBundle("version:123 distributor:3 storage:3");
f.communicator.setSystemState(sentBundle, cf.cluster().getNodeInfo(Node.ofStorage(1)), f.mockWaiter);
RequestWaiter waiter = f.receivedWaiter.get();
@@ -161,7 +161,7 @@ public class RPCCommunicatorTest {
waiter.handleRequestDone(req);
// This would normally be done in processResponses(), but that code path is not invoked in this test.
- cf.cluster().getNodeInfo(Node.ofStorage(1)).setSystemStateVersionAcknowledged(123, false);
+ cf.cluster().getNodeInfo(Node.ofStorage(1)).setClusterStateBundleVersionAcknowledged(123, false);
f.receivedRequest.set(null);
// Now when we try again, we should have been downgraded to the legacy setsystemstate2 RPC
@@ -171,4 +171,17 @@ public class RPCCommunicatorTest {
assertThat(req.methodName(), equalTo(RPCCommunicator.LEGACY_SET_SYSTEM_STATE2_RPC_METHOD_NAME));
}
+ @Test
+ public void activateClusterStateVersion_sends_version_activation_rpc() {
+ var f = new Fixture<ActivateClusterStateVersionRequest>();
+ var cf = ClusterFixture.forFlatCluster(3).bringEntireClusterUp().assignDummyRpcAddresses();
+ f.communicator.activateClusterStateVersion(12345, cf.cluster().getNodeInfo(Node.ofDistributor(1)), f.mockWaiter);
+
+ Request req = f.receivedRequest.get();
+ assertThat(req, notNullValue());
+ assertThat(req.methodName(), equalTo(RPCCommunicator.ACTIVATE_CLUSTER_STATE_VERSION_RPC_METHOD_NAME));
+ assertTrue(req.parameters().satisfies("i")); // <cluster state version>
+ assertThat(req.parameters().get(0).asInt32(), equalTo(12345));
+ }
+
}
diff --git a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/rpc/SlimeClusterStateBundleCodecTest.java b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/rpc/SlimeClusterStateBundleCodecTest.java
index b19b1d780bf..3dce1153685 100644
--- a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/rpc/SlimeClusterStateBundleCodecTest.java
+++ b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/rpc/SlimeClusterStateBundleCodecTest.java
@@ -72,4 +72,20 @@ public class SlimeClusterStateBundleCodecTest {
assertThat(roundtripEncodeWithEnvelope(stateBundle), equalTo(stateBundle));
}
+ @Test
+ public void can_roundtrip_encode_bundle_with_deferred_activation_enabled() {
+ var stateBundle = ClusterStateBundleUtil.makeBundleBuilder("distributor:2 storage:2")
+ .deferredActivation(true)
+ .deriveAndBuild();
+ assertThat(roundtripEncode(stateBundle), equalTo(stateBundle));
+ }
+
+ @Test
+ public void can_roundtrip_encode_bundle_with_deferred_activation_disabled() {
+ var stateBundle = ClusterStateBundleUtil.makeBundleBuilder("distributor:2 storage:2")
+ .deferredActivation(false)
+ .deriveAndBuild();
+ assertThat(roundtripEncode(stateBundle), equalTo(stateBundle));
+ }
+
}
diff --git a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/testutils/WaitCondition.java b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/testutils/WaitCondition.java
index d140ef998b6..9734156b13f 100644
--- a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/testutils/WaitCondition.java
+++ b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/testutils/WaitCondition.java
@@ -29,6 +29,7 @@ public interface WaitCondition {
abstract class StateWait implements WaitCondition {
private final Object monitor;
protected ClusterState currentState;
+ protected ClusterState convergedState;
private final SystemStateListener listener = new SystemStateListener() {
@Override
public void handleNewPublishedState(ClusterStateBundle state) {
@@ -37,6 +38,14 @@ public interface WaitCondition {
monitor.notifyAll();
}
}
+
+ @Override
+ public void handleStateConvergedInCluster(ClusterStateBundle states) {
+ synchronized (monitor) {
+ currentState = convergedState = states.getBaselineClusterState();
+ monitor.notifyAll();
+ }
+ }
};
public StateWait(FleetController fc, Object monitor) {
@@ -90,8 +99,8 @@ public interface WaitCondition {
@Override
public String isConditionMet() {
- if (currentState != null) {
- lastCheckedState = currentState;
+ if (convergedState != null) {
+ lastCheckedState = convergedState;
Matcher m = pattern.matcher(lastCheckedState.toString());
if (m.matches() || !checkSpaceSubset.isEmpty()) {
if (nodesToCheck != null) {
diff --git a/config-model/src/main/java/com/yahoo/searchdefinition/MapEvaluationTypeContext.java b/config-model/src/main/java/com/yahoo/searchdefinition/MapEvaluationTypeContext.java
index 0d9ea00bf73..a0f35dbefe6 100644
--- a/config-model/src/main/java/com/yahoo/searchdefinition/MapEvaluationTypeContext.java
+++ b/config-model/src/main/java/com/yahoo/searchdefinition/MapEvaluationTypeContext.java
@@ -37,6 +37,8 @@ public class MapEvaluationTypeContext extends FunctionReferenceContext implement
private final Map<Reference, TensorType> featureTypes = new HashMap<>();
+ private final Map<Reference, TensorType> resolvedTypes = new HashMap<>();
+
/** For invocation loop detection */
private final Deque<Reference> currentResolutionCallStack;
@@ -63,8 +65,24 @@ public class MapEvaluationTypeContext extends FunctionReferenceContext implement
throw new UnsupportedOperationException("Not able to parse gereral references from string form");
}
+ public void forgetResolvedTypes() {
+ resolvedTypes.clear();
+ }
+
@Override
public TensorType getType(Reference reference) {
+ // Manual equivalent of computeIfAbsent: resolveType may add more resolved entries while it runs, which computeIfAbsent would treat as a concurrent modification:
+ TensorType resolvedType = resolvedTypes.get(reference);
+ if (resolvedType != null) return resolvedType;
+
+ resolvedType = resolveType(reference);
+ if (resolvedType == null)
+ return defaultTypeOf(reference); // Don't store fallback to default as we may know more later
+ resolvedTypes.put(reference, resolvedType);
+ return resolvedType;
+ }
+
+ private TensorType resolveType(Reference reference) {
if (currentResolutionCallStack.contains(reference))
throw new IllegalArgumentException("Invocation loop: " +
currentResolutionCallStack.stream().map(Reference::toString).collect(Collectors.joining(" -> ")) +
@@ -90,7 +108,7 @@ public class MapEvaluationTypeContext extends FunctionReferenceContext implement
// The argument may be a local identifier bound to the actual value
String argument = reference.simpleArgument().get();
reference = Reference.simple(reference.name(), bindings.getOrDefault(argument, argument));
- return featureTypes.getOrDefault(reference, defaultTypeOf(reference));
+ return featureTypes.get(reference);
}
// A reference to a function?
diff --git a/config-model/src/main/java/com/yahoo/searchdefinition/RankProfile.java b/config-model/src/main/java/com/yahoo/searchdefinition/RankProfile.java
index bc49c40e4e1..b3853b36aa5 100644
--- a/config-model/src/main/java/com/yahoo/searchdefinition/RankProfile.java
+++ b/config-model/src/main/java/com/yahoo/searchdefinition/RankProfile.java
@@ -738,7 +738,7 @@ public class RankProfile implements Serializable, Cloneable {
* Creates a context containing the type information of all constants, attributes and query profiles
* referable from this rank profile.
*/
- public TypeContext<Reference> typeContext(QueryProfileRegistry queryProfiles) {
+ public MapEvaluationTypeContext typeContext(QueryProfileRegistry queryProfiles) {
MapEvaluationTypeContext context = new MapEvaluationTypeContext(getFunctions().values().stream()
.map(RankingExpressionFunction::function)
.collect(Collectors.toList()));
diff --git a/config-model/src/main/java/com/yahoo/vespa/model/ml/ConvertedModel.java b/config-model/src/main/java/com/yahoo/vespa/model/ml/ConvertedModel.java
index 93848c067e0..f197e2dfe6d 100644
--- a/config-model/src/main/java/com/yahoo/vespa/model/ml/ConvertedModel.java
+++ b/config-model/src/main/java/com/yahoo/vespa/model/ml/ConvertedModel.java
@@ -12,6 +12,7 @@ import com.yahoo.io.IOUtils;
import com.yahoo.path.Path;
import com.yahoo.search.query.profile.QueryProfileRegistry;
import com.yahoo.searchdefinition.FeatureNames;
+import com.yahoo.searchdefinition.MapEvaluationTypeContext;
import com.yahoo.searchdefinition.RankProfile;
import com.yahoo.searchdefinition.RankingConstant;
import com.yahoo.searchdefinition.expressiontransforms.RankProfileTransformContext;
@@ -371,7 +372,7 @@ public class ConvertedModel {
*/
private static void reduceBatchDimensions(RankingExpression expression, ImportedMlModel model,
RankProfile profile, QueryProfileRegistry queryProfiles) {
- TypeContext<Reference> typeContext = profile.typeContext(queryProfiles);
+ MapEvaluationTypeContext typeContext = profile.typeContext(queryProfiles);
TensorType typeBeforeReducing = expression.getRoot().type(typeContext);
// Check generated functions for inputs to reduce
@@ -398,7 +399,7 @@ public class ConvertedModel {
}
private static ExpressionNode reduceBatchDimensionsAtInput(ExpressionNode node, ImportedMlModel model,
- TypeContext<Reference> typeContext) {
+ MapEvaluationTypeContext typeContext) {
if (node instanceof TensorFunctionNode) {
TensorFunction tensorFunction = ((TensorFunctionNode) node).function();
if (tensorFunction instanceof Rename) {
@@ -428,7 +429,7 @@ public class ConvertedModel {
return node;
}
- private static ExpressionNode reduceBatchDimensionExpression(TensorFunction function, TypeContext<Reference> context) {
+ private static ExpressionNode reduceBatchDimensionExpression(TensorFunction function, MapEvaluationTypeContext context) {
TensorFunction result = function;
TensorType type = function.type(context);
if (type.dimensions().size() > 1) {
@@ -440,6 +441,7 @@ public class ConvertedModel {
}
if (reduceDimensions.size() > 0) {
result = new Reduce(function, Reduce.Aggregator.sum, reduceDimensions);
+ context.forgetResolvedTypes(); // We changed types
}
}
return new TensorFunctionNode(result);
diff --git a/config-model/src/test/java/com/yahoo/searchdefinition/derived/GeminiTestCase.java b/config-model/src/test/java/com/yahoo/searchdefinition/derived/GeminiTestCase.java
index 4bc61f20d95..992e52a9e5b 100644
--- a/config-model/src/test/java/com/yahoo/searchdefinition/derived/GeminiTestCase.java
+++ b/config-model/src/test/java/com/yahoo/searchdefinition/derived/GeminiTestCase.java
@@ -23,7 +23,7 @@ public class GeminiTestCase extends AbstractExportingTestCase {
Map<String, String> ranking = removePartKeySuffixes(asMap(p.configProperties()));
assertEquals("attribute(right)", resolve(lookup("toplevel", ranking), ranking));
}
-
+
private Map<String, String> asMap(List<Pair<String, String>> properties) {
Map<String, String> map = new HashMap<>();
for (Pair<String, String> property : properties)
@@ -45,7 +45,7 @@ public class GeminiTestCase extends AbstractExportingTestCase {
}
/**
- * Recurively resolves references to other ranking expressions - rankingExpression(name) -
+ * Recursively resolves references to other ranking expressions - rankingExpression(name) -
* and replaces the reference by the expression
*/
private String resolve(String expression, Map<String, String> ranking) {
diff --git a/configdefinitions/src/vespa/fleetcontroller.def b/configdefinitions/src/vespa/fleetcontroller.def
index 04c9e3b7c73..62f3b6759c3 100644
--- a/configdefinitions/src/vespa/fleetcontroller.def
+++ b/configdefinitions/src/vespa/fleetcontroller.def
@@ -166,3 +166,15 @@ cluster_has_global_document_types bool default=false
## Bucket merges are considered complete when:
## ((buckets_total - buckets_pending) / buckets_total)) >= min_merge_completion_ratio
min_merge_completion_ratio double default=1.0
+
+## If enabled, cluster state transitions are performed as two distinct phases:
+##
+## 1) state bundle propagation and bucket info gathering phase
+## 2) state activation phase, which is not performed until all nodes have completed phase 1
+##
+## This is to enable read-only operations to pass through the system during phase 1
+## while nodes await phase 2. If this feature is disabled, nodes will implicitly do
+## phase 2 as part of phase 1 at their own leisure, which means that actual state
+## activation may happen at wildly different times throughout the cluster. The two-phase
+## transition logic aims to minimize the window of time where active states diverge.
+enable_two_phase_cluster_state_transitions bool default=false
diff --git a/container-core/src/main/java/com/yahoo/restapi/Path.java b/container-core/src/main/java/com/yahoo/restapi/Path.java
index 3aa23fbc916..79f70168696 100644
--- a/container-core/src/main/java/com/yahoo/restapi/Path.java
+++ b/container-core/src/main/java/com/yahoo/restapi/Path.java
@@ -54,7 +54,7 @@ public class Path {
values.clear();
String[] specElements = pathSpec.split("/");
boolean matchPrefix = false;
- if (specElements[specElements.length-1].equals("{*}")) {
+ if (specElements.length > 1 && specElements[specElements.length-1].equals("{*}")) {
matchPrefix = true;
specElements = Arrays.copyOf(specElements, specElements.length-1);
}
diff --git a/controller-api/src/main/java/com/yahoo/vespa/hosted/controller/api/integration/zone/ZoneRegistry.java b/controller-api/src/main/java/com/yahoo/vespa/hosted/controller/api/integration/zone/ZoneRegistry.java
index b7303480701..d085d00baaa 100644
--- a/controller-api/src/main/java/com/yahoo/vespa/hosted/controller/api/integration/zone/ZoneRegistry.java
+++ b/controller-api/src/main/java/com/yahoo/vespa/hosted/controller/api/integration/zone/ZoneRegistry.java
@@ -40,9 +40,6 @@ public interface ZoneRegistry {
/** Returns all possible API endpoints of all known config servers and config server VIPs in the given zone */
List<URI> getConfigServerApiUris(ZoneId zoneId);
- /** Returns a URL with the logs for the given deployment, if logging is configured for its zone */
- default Optional<URI> getLogServerUri(DeploymentId deploymentId) { return Optional.empty(); };
-
/** Returns the time to live for deployments in the given zone, or empty if this is infinite */
Optional<Duration> getDeploymentTimeToLive(ZoneId zoneId);
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/Upgrader.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/Upgrader.java
index aacc61655fc..de7f12efcaf 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/Upgrader.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/Upgrader.java
@@ -70,9 +70,10 @@ public class Upgrader extends Maintainer {
cancelUpgradesOf(applications().with(UpgradePolicy.conservative).upgrading().failing().notUpgradingTo(conservativeTargets), reason);
// Schedule the right upgrades
- canaryTarget.ifPresent(target -> upgrade(applications().with(UpgradePolicy.canary), target));
- defaultTargets.forEach(target -> upgrade(applications().with(UpgradePolicy.defaultPolicy), target));
- conservativeTargets.forEach(target -> upgrade(applications().with(UpgradePolicy.conservative), target));
+ ApplicationList applications = applications();
+ canaryTarget.ifPresent(target -> upgrade(applications.with(UpgradePolicy.canary), target));
+ defaultTargets.forEach(target -> upgrade(applications.with(UpgradePolicy.defaultPolicy), target));
+ conservativeTargets.forEach(target -> upgrade(applications.with(UpgradePolicy.conservative), target));
}
/** Returns the target versions for given confidence, one per major version in the system */
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/proxy/ConfigServerRestExecutorImpl.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/proxy/ConfigServerRestExecutorImpl.java
index a75d0afbad0..50c70f3e55b 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/proxy/ConfigServerRestExecutorImpl.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/proxy/ConfigServerRestExecutorImpl.java
@@ -81,7 +81,7 @@ public class ConfigServerRestExecutorImpl implements ConfigServerRestExecutor {
// Make a local copy of the list as we want to manipulate it in case of ping problems.
List<URI> allServers = zoneRegistry.getConfigServerVipUri(zoneId)
// TODO: Use config server VIP for all zones that have one
- .filter(zone -> zoneId.region().value().startsWith("aws-") || zoneId.region().value().startsWith("cd-aws-"))
+ .filter(zone -> zoneId.region().value().startsWith("aws-") || zoneId.region().value().contains("-aws-"))
.map(Collections::singletonList)
.orElseGet(() -> new ArrayList<>(zoneRegistry.getConfigServerUris(zoneId)));
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/restapi/application/ApplicationApiHandler.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/restapi/application/ApplicationApiHandler.java
index a0d807ac333..664a310c966 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/restapi/application/ApplicationApiHandler.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/restapi/application/ApplicationApiHandler.java
@@ -553,10 +553,6 @@ public class ApplicationApiHandler extends LoggingRequestHandler {
.ifPresent(endpoints -> endpoints.forEach(endpoint -> serviceUrlArray.addString(endpoint.toString())));
response.setString("nodes", withPath("/zone/v2/" + deploymentId.zoneId().environment() + "/" + deploymentId.zoneId().region() + "/nodes/v2/node/?&recursive=true&application=" + deploymentId.applicationId().tenant() + "." + deploymentId.applicationId().application() + "." + deploymentId.applicationId().instance(), request.getUri()).toString());
-
- controller.zoneRegistry().getLogServerUri(deploymentId)
- .ifPresent(elkUrl -> response.setString("elkUrl", elkUrl.toString()));
-
response.setString("yamasUrl", monitoringSystemUri(deploymentId).toString());
response.setString("version", deployment.version().toFullString());
response.setString("revision", deployment.applicationVersion().id());
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/role/PathGroup.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/role/PathGroup.java
index 653c1d40684..ea54c23702d 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/role/PathGroup.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/role/PathGroup.java
@@ -46,7 +46,7 @@ public enum PathGroup {
"/application/v4/tenant/{tenant}/application/",
"/application/v4/tenant/{tenant}/application/{application}",
"/application/v4/tenant/{tenant}/application/{application}/deploying/{*}",
- "/application/v4/tenant/{tenant}/application/{application}/instance/{instance}/job/{job}/{*}",
+ "/application/v4/tenant/{tenant}/application/{application}/instance/{*}",
"/application/v4/tenant/{tenant}/application/{application}/environment/dev/{*}",
"/application/v4/tenant/{tenant}/application/{application}/environment/perf/{*}",
"/application/v4/tenant/{tenant}/application/{application}/environment/prod/region/{region}/instance/{instance}/global-rotation/override"),
@@ -62,7 +62,12 @@ public enum PathGroup {
/** Read-only paths providing information related to deployments */
deploymentStatus("/badge/v1/{*}",
"/deployment/v1/{*}",
- "/zone/v1/{*}");
+ "/zone/v1/{*}"),
+
+ /** Paths used by some dashboard */
+ dashboard("/",
+ "/d/{*}",
+ "/statuspage/v1/{*}");
final Set<String> pathSpecs;
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/versions/OsVersion.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/versions/OsVersion.java
index df3899b9b23..2671f30255e 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/versions/OsVersion.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/versions/OsVersion.java
@@ -5,6 +5,7 @@ import com.yahoo.component.Version;
import com.yahoo.config.provision.CloudName;
import org.jetbrains.annotations.NotNull;
+import java.util.Comparator;
import java.util.Objects;
/**
@@ -14,6 +15,9 @@ import java.util.Objects;
*/
public class OsVersion implements Comparable<OsVersion> {
+ private static final Comparator<OsVersion> comparator = Comparator.comparing(OsVersion::cloud)
+ .thenComparing(OsVersion::version);
+
private final Version version;
private final CloudName cloud;
@@ -52,12 +56,8 @@ public class OsVersion implements Comparable<OsVersion> {
}
@Override
- public int compareTo(@NotNull OsVersion o) {
- int cloudCmp = cloud.compareTo(o.cloud());
- if (cloudCmp == 0) { // Same cloud, sort by version
- return version.compareTo(o.version());
- }
- return cloudCmp;
+ public int compareTo(@NotNull OsVersion that) {
+ return comparator.compare(this, that);
}
}
diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/integration/ZoneRegistryMock.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/integration/ZoneRegistryMock.java
index 02dadc300b3..37261b0fdc4 100644
--- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/integration/ZoneRegistryMock.java
+++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/integration/ZoneRegistryMock.java
@@ -160,25 +160,6 @@ public class ZoneRegistryMock extends AbstractComponent implements ZoneRegistry
}
@Override
- public Optional<URI> getLogServerUri(DeploymentId deploymentId) {
- if ( ! hasZone(deploymentId.zoneId()))
- return Optional.empty();
-
- String kibanaQuery = "/#/discover?_g=()&_a=(columns:!(_source)," +
- "index:'logstash-*',interval:auto," +
- "query:(query_string:(analyze_wildcard:!t,query:'" +
- "HV-tenant:%22" + deploymentId.applicationId().tenant().value() + "%22%20" +
- "AND%20HV-application:%22" + deploymentId.applicationId().application().value() + "%22%20" +
- "AND%20HV-region:%22" + deploymentId.zoneId().region().value() + "%22%20" +
- "AND%20HV-instance:%22" + deploymentId.applicationId().instance().value() + "%22%20" +
- "AND%20HV-environment:%22" + deploymentId.zoneId().environment().value() + "%22'))," +
- "sort:!('@timestamp',desc))";
-
- URI kibanaPath = URI.create(kibanaQuery);
- return Optional.of(URI.create(String.format("http://log.%s.test", deploymentId.zoneId().value())).resolve(kibanaPath));
- }
-
- @Override
public Optional<Duration> getDeploymentTimeToLive(ZoneId zoneId) {
return Optional.ofNullable(deploymentTimeToLive.get(zoneId));
}
diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/application/responses/deployment.json b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/application/responses/deployment.json
index 00177bf1afc..ac1797986fc 100644
--- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/application/responses/deployment.json
+++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/application/responses/deployment.json
@@ -7,7 +7,6 @@
"http://alias-endpoint.vespa.yahooapis.com:4080"
],
"nodes": "http://localhost:8080/zone/v2/prod/us-central-1/nodes/v2/node/%3F&recursive=true&application=tenant1.application1.default",
- "elkUrl": "http://log.prod.us-central-1.test/#/discover?_g=()&_a=(columns:!(_source),index:'logstash-*',interval:auto,query:(query_string:(analyze_wildcard:!t,query:'HV-tenant:%22tenant1%22%20AND%20HV-application:%22application1%22%20AND%20HV-region:%22us-central-1%22%20AND%20HV-instance:%22default%22%20AND%20HV-environment:%22prod%22')),sort:!('@timestamp',desc))",
"yamasUrl": "http://monitoring-system.test/?environment=prod&region=us-central-1&application=tenant1.application1",
"version": "(ignore)",
"revision": "(ignore)",
diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/application/responses/prod-us-central-1.json b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/application/responses/prod-us-central-1.json
index c7693c3d0d4..a3380d823f3 100644
--- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/application/responses/prod-us-central-1.json
+++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/application/responses/prod-us-central-1.json
@@ -13,7 +13,6 @@
"http://alias-endpoint.vespa.yahooapis.com:4080"
],
"nodes": "http://localhost:8080/zone/v2/prod/us-central-1/nodes/v2/node/%3F&recursive=true&application=tenant1.application1.default",
- "elkUrl": "http://log.prod.us-central-1.test/#/discover?_g=()&_a=(columns:!(_source),index:'logstash-*',interval:auto,query:(query_string:(analyze_wildcard:!t,query:'HV-tenant:%22tenant1%22%20AND%20HV-application:%22application1%22%20AND%20HV-region:%22us-central-1%22%20AND%20HV-instance:%22default%22%20AND%20HV-environment:%22prod%22')),sort:!('@timestamp',desc))",
"yamasUrl": "http://monitoring-system.test/?environment=prod&region=us-central-1&application=tenant1.application1",
"version": "(ignore)",
"revision": "(ignore)",
diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/user/UserApiTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/user/UserApiTest.java
index d7fd38b5f41..bb488d0af22 100644
--- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/user/UserApiTest.java
+++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/user/UserApiTest.java
@@ -16,7 +16,7 @@ public class UserApiTest extends ControllerContainerTest {
ContainerControllerTester tester = new ContainerControllerTester(container, responseFiles);
tester.assertResponse(authenticatedRequest("http://localhost:8080/user/v1/"),
- "{\"error-code\":\"NOT_FOUND\",\"message\":\"No 'GET' handler at '/user/v1/'\"}", 404);
+ "{\n \"code\" : 403,\n \"message\" : \"Access denied\"\n" + "}", 403);
}
}
diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/role/PathGroupTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/role/PathGroupTest.java
index b7c751638c8..d110ff4c2fe 100644
--- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/role/PathGroupTest.java
+++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/role/PathGroupTest.java
@@ -39,8 +39,11 @@ public class PathGroupTest {
String[] parts2 = path2.split("/");
int end = Math.min(parts1.length, parts2.length);
- if (end < parts1.length && ! parts2[end - 1].equals("{*}") && ! parts1[end].equals("{*}")) continue;
- if (end < parts2.length && ! parts1[end - 1].equals("{*}") && ! parts2[end].equals("{*}")) continue;
+ // If one path has more parts than the other ...
+ // and the other doesn't end with a wildcard matcher ...
+ // and the longest one isn't just one part longer, which is a wildcard ...
+ if (end < parts1.length && (end == 0 || ! parts2[end - 1].equals("{*}")) && ! parts1[end].equals("{*}")) continue;
+ if (end < parts2.length && (end == 0 || ! parts1[end - 1].equals("{*}")) && ! parts2[end].equals("{*}")) continue;
int i;
for (i = 0; i < end; i++)
diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/role/RoleMembershipTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/role/RoleMembershipTest.java
index bc810fdb5c5..d4e673a02ae 100644
--- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/role/RoleMembershipTest.java
+++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/role/RoleMembershipTest.java
@@ -82,6 +82,9 @@ public class RoleMembershipTest {
assertTrue(roles.allows(Action.create, "/application/v4/tenant/t1/application/a1/jobreport"));
assertTrue(roles.allows(Action.update, "/application/v4/tenant/t1/application/a1"));
assertTrue("Global read access", roles.allows(Action.read, "/controller/v1/foo"));
+ assertTrue("Dashboard read access", roles.allows(Action.read, "/"));
+ assertTrue("Dashboard read access", roles.allows(Action.read, "/d/nodes"));
+ assertTrue("Dashboard read access", roles.allows(Action.read, "/statuspage/v1/incidents"));
}
}
diff --git a/defaults/abi-spec.json b/defaults/abi-spec.json
index 95dc2e40353..ee76627a61c 100644
--- a/defaults/abi-spec.json
+++ b/defaults/abi-spec.json
@@ -8,6 +8,7 @@
"methods": [
"public java.lang.String vespaUser()",
"public java.lang.String vespaHostname()",
+ "public java.lang.String temporaryApplicationStorage()",
"public java.lang.String vespaHome()",
"public java.lang.String underVespaHome(java.lang.String)",
"public int vespaWebServicePort()",
diff --git a/defaults/src/main/java/com/yahoo/vespa/defaults/Defaults.java b/defaults/src/main/java/com/yahoo/vespa/defaults/Defaults.java
index 0fce5d654fb..f1b7e38986f 100644
--- a/defaults/src/main/java/com/yahoo/vespa/defaults/Defaults.java
+++ b/defaults/src/main/java/com/yahoo/vespa/defaults/Defaults.java
@@ -1,12 +1,8 @@
// Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.vespa.defaults;
-import java.io.BufferedReader;
-import java.io.InputStreamReader;
-import java.net.InetAddress;
-import java.nio.charset.StandardCharsets;
-import java.util.logging.Logger;
import java.util.Optional;
+import java.util.logging.Logger;
@@ -25,6 +21,7 @@ public class Defaults {
private final String vespaHome;
private final String vespaUser;
private final String vespaHost;
+ private final String temporaryApplicationStorage;
private final int vespaWebServicePort;
private final int vespaPortBase;
private final int vespaPortConfigServerRpc;
@@ -35,6 +32,7 @@ public class Defaults {
vespaHome = findVespaHome("/opt/vespa");
vespaUser = findVespaUser("vespa");
vespaHost = findVespaHostname("localhost");
+ temporaryApplicationStorage = underVespaHome("var/vespa/application");
vespaWebServicePort = findWebServicePort(8080);
vespaPortBase = findVespaPortBase(19000);
vespaPortConfigServerRpc = findConfigServerPort(vespaPortBase + 70);
@@ -116,6 +114,15 @@ public class Defaults {
public String vespaHostname() { return vespaHost; }
/**
+ * Returns the path where a Vespa application can store arbitrary files. This should only be used for temporary
+ * files as there are no availability guarantees for files stored here. The application must be able to recreate
+ * required files on its own (e.g. by downloading them from a remote source) if missing.
+ *
+ * @return the temporary storage path
+ */
+ public String temporaryApplicationStorage() { return temporaryApplicationStorage; }
+
+ /**
* Returns the path to the root under which Vespa should read and write files.
* Will not end with a "/".
* @return the vespa home directory
diff --git a/defaults/src/test/java/com/yahoo/vespa/defaults/DefaultsTestCase.java b/defaults/src/test/java/com/yahoo/vespa/defaults/DefaultsTestCase.java
index 07d3c39fc9c..88f4ad6f2fd 100644
--- a/defaults/src/test/java/com/yahoo/vespa/defaults/DefaultsTestCase.java
+++ b/defaults/src/test/java/com/yahoo/vespa/defaults/DefaultsTestCase.java
@@ -1,8 +1,10 @@
// Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.vespa.defaults;
+import org.junit.Ignore;
import org.junit.Test;
import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
/**
* @author arnej27959
@@ -25,14 +27,20 @@ public class DefaultsTestCase {
@Test
public void testPortsArePositive() {
Defaults d = Defaults.getDefaults();
- assertEquals(true, d.vespaPortBase() > 0);
- assertEquals(true, d.vespaWebServicePort() > 0);
- assertEquals(true, d.vespaConfigServerRpcPort() > 0);
- assertEquals(true, d.vespaConfigServerHttpPort() > 0);
- assertEquals(true, d.vespaConfigProxyRpcPort() > 0);
+ assertTrue(d.vespaPortBase() > 0);
+ assertTrue(d.vespaWebServicePort() > 0);
+ assertTrue(d.vespaConfigServerRpcPort() > 0);
+ assertTrue(d.vespaConfigServerHttpPort() > 0);
+ assertTrue(d.vespaConfigProxyRpcPort() > 0);
}
@Test
+ public void testTemporaryApplicationStorage() {
+ assertEquals("/opt/vespa/var/vespa/application", Defaults.getDefaults().temporaryApplicationStorage());
+ }
+
+ @Test
+ @Ignore // This is run manually for human inspection. Contains no assertions
public void dumpAllVars() {
Defaults d = Defaults.getDefaults();
System.out.println("vespa user = '" + d.vespaUser() + "'");
diff --git a/docker-api/src/main/java/com/yahoo/vespa/hosted/dockerapi/DockerImpl.java b/docker-api/src/main/java/com/yahoo/vespa/hosted/dockerapi/DockerImpl.java
index 801b9b03bb4..e57f61ce5f4 100644
--- a/docker-api/src/main/java/com/yahoo/vespa/hosted/dockerapi/DockerImpl.java
+++ b/docker-api/src/main/java/com/yahoo/vespa/hosted/dockerapi/DockerImpl.java
@@ -276,17 +276,15 @@ public class DockerImpl implements Docker {
private Stream<Container> asContainer(String container) {
return inspectContainerCmd(container)
- .map(response ->
- new Container(
- response.getConfig().getHostName(),
- DockerImage.fromString(response.getConfig().getImage()),
- containerResourcesFromHostConfig(response.getHostConfig()),
- new ContainerName(decode(response.getName())),
- Container.State.valueOf(response.getState().getStatus().toUpperCase()),
- response.getState().getPid()
- ))
- .map(Stream::of)
- .orElse(Stream.empty());
+ .map(response -> new Container(
+ response.getConfig().getHostName(),
+ DockerImage.fromString(response.getConfig().getImage()),
+ containerResourcesFromHostConfig(response.getHostConfig()),
+ new ContainerName(decode(response.getName())),
+ Container.State.valueOf(response.getState().getStatus().toUpperCase()),
+ response.getState().getPid()
+ ))
+ .stream();
}
private static ContainerResources containerResourcesFromHostConfig(HostConfig hostConfig) {
diff --git a/document/src/vespa/document/select/operator.cpp b/document/src/vespa/document/select/operator.cpp
index 1b97a375a5f..36113844d88 100644
--- a/document/src/vespa/document/select/operator.cpp
+++ b/document/src/vespa/document/select/operator.cpp
@@ -1,7 +1,7 @@
// Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#include "operator.h"
-#include <vespa/vespalib/util/regexp.h>
+#include <regex>
#include <vespa/vespalib/stllike/asciistream.h>
#include <vespa/vespalib/stllike/hash_map.hpp>
#include <cassert>
@@ -127,8 +127,12 @@ RegexOperator::match(const vespalib::string& val, vespalib::stringref expr) cons
{
// Should we catch this in parsing?
if (expr.size() == 0) return ResultList(Result::True);
- vespalib::Regexp expression(expr);
- return ResultList(Result::get(expression.match(val)));
+ try {
+ std::basic_regex<char> expression(expr.data(), expr.size());
+ return ResultList(Result::get(std::regex_search(val.c_str(), val.c_str() + val.size(), expression)));
+ } catch (std::regex_error &) {
+ return ResultList(Result::False);
+ }
}
const RegexOperator RegexOperator::REGEX("=~");
diff --git a/flags/src/main/java/com/yahoo/vespa/flags/Flags.java b/flags/src/main/java/com/yahoo/vespa/flags/Flags.java
index bb6b346abd4..5fa8e39ac81 100644
--- a/flags/src/main/java/com/yahoo/vespa/flags/Flags.java
+++ b/flags/src/main/java/com/yahoo/vespa/flags/Flags.java
@@ -64,11 +64,6 @@ public class Flags {
"Whether to use a dedicated node for the logserver.", "Takes effect at redeployment",
APPLICATION_ID);
- public static final UnboundBooleanFlag USE_DOCKER_91 = defineFeatureFlag(
- "use-docker-91", false,
- "Whether to upgrade to Docker version 1.13.1-91.git07f3374", "Takes effect after restart of host admin",
- HOSTNAME);
-
public static final UnboundDoubleFlag CONTAINER_CPU_CAP = defineDoubleFlag(
"container-cpu-cap", 0,
"Hard limit on how many CPUs a container may use. This value is multiplied by CPU allocated to node, so " +
diff --git a/logd/CMakeLists.txt b/logd/CMakeLists.txt
index 85aebac365b..9a5fdf32841 100644
--- a/logd/CMakeLists.txt
+++ b/logd/CMakeLists.txt
@@ -15,6 +15,7 @@ vespa_define_module(
TESTS
src/tests/legacy_forwarder
+ src/tests/proto_converter
src/tests/rotate
)
diff --git a/logd/src/logd/CMakeLists.txt b/logd/src/logd/CMakeLists.txt
index a3ff813ad96..baf52f1d5d8 100644
--- a/logd/src/logd/CMakeLists.txt
+++ b/logd/src/logd/CMakeLists.txt
@@ -15,11 +15,15 @@ vespa_add_library(logd STATIC
conn.cpp
legacy_forwarder.cpp
metrics.cpp
+ proto_converter.cpp
state_reporter.cpp
watcher.cpp
${logd_PROTOBUF_SRCS}
DEPENDS
)
+
vespa_generate_config(logd ../main/resources/configdefinitions/logd.def)
install_config_definition(../main/resources/configdefinitions/logd.def cloud.config.log.logd.def)
+
+vespa_add_target_package_dependency(logd Protobuf)
diff --git a/logd/src/logd/legacy_forwarder.cpp b/logd/src/logd/legacy_forwarder.cpp
index b512bab7fb6..b8b93a03530 100644
--- a/logd/src/logd/legacy_forwarder.cpp
+++ b/logd/src/logd/legacy_forwarder.cpp
@@ -3,6 +3,8 @@
#include "exceptions.h"
#include "legacy_forwarder.h"
#include "metrics.h"
+#include <vespa/log/log_message.h>
+#include <vespa/log/exceptions.h>
#include <vespa/vespalib/component/vtag.h>
#include <vespa/vespalib/locale/c.h>
#include <unistd.h>
@@ -11,6 +13,10 @@
LOG_SETUP("");
using LogLevel = ns_log::Logger::LogLevel;
+using ns_log::BadLogLineException;
+using ns_log::LogMessage;
+using ns_log::Logger;
+using LogLevel = Logger::LogLevel;
namespace logdemon {
@@ -18,7 +24,6 @@ LegacyForwarder::LegacyForwarder(Metrics &metrics)
: _logserverfd(-1),
_metrics(metrics),
_forwardMap(),
- _levelparser(),
_badLines(0)
{}
LegacyForwarder::~LegacyForwarder() = default;
@@ -70,120 +75,30 @@ LegacyForwarder::forwardLine(const char *line, const char *eol)
bool
LegacyForwarder::parseline(const char *linestart, const char *lineend)
{
- int llength = lineend - linestart;
-
- const char *fieldstart = linestart;
- // time
- const char *tab = strchr(fieldstart, '\t');
- if (tab == nullptr || tab == fieldstart) {
- LOG(spam, "bad logline no 1. tab: %.*s", llength, linestart);
- ++_badLines;
- return false;
- }
- char *eod;
- double logtime = vespalib::locale::c::strtod(fieldstart, &eod);
- if (eod != tab) {
- int fflen = tab - linestart;
- LOG(spam, "bad logline first field not strtod parsable: %.*s", fflen, linestart);
- ++_badLines;
- return false;
- }
- time_t now = time(nullptr);
- if (logtime - 864000 > now) {
- int fflen = tab - linestart;
- LOG(warning, "bad logline, time %.*s > 10 days in the future", fflen, linestart);
- ++_badLines;
- return false;
- }
- if (logtime + 8640000 < now) {
- int fflen = tab - linestart;
- LOG(warning, "bad logline, time %.*s > 100 days in the past", fflen, linestart);
+ LogMessage message;
+ try {
+ message.parse_log_line(std::string_view(linestart, lineend - linestart));
+ } catch (BadLogLineException &e) {
+ LOG(spam, "bad logline: %s", e.what());
++_badLines;
return false;
}
- // hostname
- fieldstart = tab + 1;
- tab = strchr(fieldstart, '\t');
- if (tab == nullptr) {
- LOG(spam, "bad logline no 2. tab: %.*s", llength, linestart);
- ++_badLines;
- return false;
- }
-
- // pid
- fieldstart = tab + 1;
- tab = strchr(fieldstart, '\t');
- if (tab == nullptr || tab == fieldstart) {
- LOG(spam, "bad logline no 3. tab: %.*s", llength, linestart);
- return false;
- }
-
- // service
- fieldstart = tab + 1;
- tab = strchr(fieldstart, '\t');
- if (tab == nullptr) {
- LOG(spam, "bad logline no 4. tab: %.*s", llength, linestart);
- ++_badLines;
- return false;
- }
- if (tab == fieldstart) {
- LOG(spam, "empty service in logline: %.*s", llength, linestart);
- }
- std::string service(fieldstart, tab-fieldstart);
-
- // component
- fieldstart = tab + 1;
- tab = strchr(fieldstart, '\t');
- if (tab == nullptr || tab == fieldstart) {
- LOG(spam, "bad logline no 5. tab: %.*s", llength, linestart);
- ++_badLines;
- return false;
- }
- std::string component(fieldstart, tab-fieldstart);
-
- // level
- fieldstart = tab + 1;
- tab = strchr(fieldstart, '\t');
- if (tab == nullptr || tab == fieldstart) {
- LOG(spam, "bad logline no 6. tab: %.*s", llength, linestart);
- ++_badLines;
- return false;
- }
- std::string level(fieldstart, tab-fieldstart);
- LogLevel l = _levelparser.parseLevel(level.c_str());
-
- // rest is freeform message, must be on this line:
- if (tab > lineend) {
- LOG(spam, "bad logline last tab after end: %.*s", llength, linestart);
- ++_badLines;
- return false;
+ std::string logLevelName;
+ if (message.level() >= LogLevel::NUM_LOGLEVELS) {
+ logLevelName = "unknown";
+ } else {
+ logLevelName = Logger::logLevelNames[message.level()];
}
-
- _metrics.countLine(level, service);
+ _metrics.countLine(logLevelName, message.service());
// Check overrides
- ForwardMap::iterator found = _forwardMap.find(l);
+ ForwardMap::iterator found = _forwardMap.find(message.level());
if (found != _forwardMap.end()) {
return found->second;
}
return false; // Unknown log level
}
-LogLevel
-LevelParser::parseLevel(const char *level)
-{
- using ns_log::Logger;
-
- LogLevel l = Logger::parseLevel(level);
- if (l >= 0 && l <= Logger::NUM_LOGLEVELS) {
- return l;
- }
- if (_seenLevelMap.find(level) == _seenLevelMap.end()) {
- LOG(warning, "unknown level '%s'", level);
- _seenLevelMap.insert(level);
- }
- return Logger::fatal;
-}
} // namespace
diff --git a/logd/src/logd/legacy_forwarder.h b/logd/src/logd/legacy_forwarder.h
index da8dbcc82ab..81a93ce1d50 100644
--- a/logd/src/logd/legacy_forwarder.h
+++ b/logd/src/logd/legacy_forwarder.h
@@ -8,21 +8,11 @@
namespace logdemon {
-using SeenMap = std::unordered_set<std::string>;
// Mapping saying if a level should be forwarded or not
using ForwardMap = std::map<ns_log::Logger::LogLevel, bool>;
struct Metrics;
-class LevelParser
-{
-private:
- SeenMap _seenLevelMap;
-public:
- ns_log::Logger::LogLevel parseLevel(const char *level);
- LevelParser() : _seenLevelMap() {}
-};
-
/**
* Class used to forward log lines to the logserver via a one-way text protocol.
*/
@@ -31,7 +21,6 @@ private:
int _logserverfd;
Metrics &_metrics;
ForwardMap _forwardMap;
- LevelParser _levelparser;
int _badLines;
const char *copystr(const char *b, const char *e) {
int len = e - b;
diff --git a/logd/src/logd/log_protocol_proto.h b/logd/src/logd/log_protocol_proto.h
new file mode 100644
index 00000000000..a8d5e4aa208
--- /dev/null
+++ b/logd/src/logd/log_protocol_proto.h
@@ -0,0 +1,11 @@
+// Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wsuggest-override"
+
+#include "log_protocol.pb.h"
+
+#pragma GCC diagnostic pop
+
diff --git a/logd/src/logd/proto_converter.cpp b/logd/src/logd/proto_converter.cpp
new file mode 100644
index 00000000000..b3facd4ef4a
--- /dev/null
+++ b/logd/src/logd/proto_converter.cpp
@@ -0,0 +1,65 @@
+// Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include "proto_converter.h"
+
+using ns_log::LogMessage;
+using ns_log::Logger;
+
+namespace logdemon {
+
+void
+ProtoConverter::log_messages_to_proto(const std::vector<LogMessage>& messages, ProtoLogRequest& proto)
+{
+    // Append one new protobuf entry to the request per input message, in input order.
+    for (const LogMessage& entry : messages) {
+        log_message_to_proto(entry, *proto.add_log_messages());
+    }
+}
+
+namespace {
+
+using ProtoLogLevel = ::logserver::protocol::protobuf::LogMessage_Level;
+
+/**
+ * Translates an ns_log level into its protobuf counterpart.
+ *
+ * The eight named levels map one-to-one; NUM_LOGLEVELS and any value without
+ * an explicit case are reported as LogMessage_Level_UNKNOWN.
+ */
+ProtoLogLevel
+convert_level(const Logger::LogLevel& level)
+{
+    // Start from UNKNOWN so only the recognised levels need an assignment.
+    ProtoLogLevel converted = ProtoLogLevel::LogMessage_Level_UNKNOWN;
+    switch (level) {
+    case Logger::fatal:   converted = ProtoLogLevel::LogMessage_Level_FATAL;   break;
+    case Logger::error:   converted = ProtoLogLevel::LogMessage_Level_ERROR;   break;
+    case Logger::warning: converted = ProtoLogLevel::LogMessage_Level_WARNING; break;
+    case Logger::config:  converted = ProtoLogLevel::LogMessage_Level_CONFIG;  break;
+    case Logger::info:    converted = ProtoLogLevel::LogMessage_Level_INFO;    break;
+    case Logger::event:   converted = ProtoLogLevel::LogMessage_Level_EVENT;   break;
+    case Logger::debug:   converted = ProtoLogLevel::LogMessage_Level_DEBUG;   break;
+    case Logger::spam:    converted = ProtoLogLevel::LogMessage_Level_SPAM;    break;
+    case Logger::NUM_LOGLEVELS:
+    default:
+        break;
+    }
+    return converted;
+}
+
+}
+
+void
+ProtoConverter::log_message_to_proto(const LogMessage& message, ProtoLogMessage& proto)
+{
+    proto.set_level(convert_level(message.level()));  // the only field that needs translation
+    proto.set_time_nanos(message.time_nanos());       // every other field is passed straight through
+    proto.set_hostname(message.hostname());
+    proto.set_process_id(message.process_id());
+    proto.set_thread_id(message.thread_id());
+    proto.set_service(message.service());
+    proto.set_component(message.component());
+    proto.set_payload(message.payload());
+}
+
+}
diff --git a/logd/src/logd/proto_converter.h b/logd/src/logd/proto_converter.h
new file mode 100644
index 00000000000..688648b99de
--- /dev/null
+++ b/logd/src/logd/proto_converter.h
@@ -0,0 +1,29 @@
+// Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+#include "log_protocol_proto.h"
+#include <vespa/log/log_message.h>
+#include <vector>
+
+namespace logdemon {
+
+/**
+ * Contains functions to convert log messages to protobuf objects.
+ */
+struct ProtoConverter {
+    using ProtoLogRequest = logserver::protocol::protobuf::LogRequest;
+    using ProtoLogMessage = logserver::protocol::protobuf::LogMessage;
+
+    /**
+     * Adds one protobuf log message to the given request for each message in the vector.
+     */
+    static void log_messages_to_proto(const std::vector<ns_log::LogMessage>& messages, ProtoLogRequest& proto);
+
+    /**
+     * Fills the given protobuf message from the fields of the given log message.
+     */
+    static void log_message_to_proto(const ns_log::LogMessage& message, ProtoLogMessage& proto);
+};
+
+}
diff --git a/logd/src/tests/proto_converter/CMakeLists.txt b/logd/src/tests/proto_converter/CMakeLists.txt
new file mode 100644
index 00000000000..5ca048ecd4e
--- /dev/null
+++ b/logd/src/tests/proto_converter/CMakeLists.txt
@@ -0,0 +1,9 @@
+# Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+vespa_add_executable(logd_proto_converter_test_app TEST
+ SOURCES
+ proto_converter_test.cpp
+ DEPENDS
+ logd
+ gtest
+)
+vespa_add_test(NAME logd_proto_converter_test_app COMMAND logd_proto_converter_test_app)
diff --git a/logd/src/tests/proto_converter/proto_converter_test.cpp b/logd/src/tests/proto_converter/proto_converter_test.cpp
new file mode 100644
index 00000000000..aa0b00e34d6
--- /dev/null
+++ b/logd/src/tests/proto_converter/proto_converter_test.cpp
@@ -0,0 +1,88 @@
+// Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <logd/proto_converter.h>
+#include <vespa/vespalib/gtest/gtest.h>
+
+using ns_log::Logger;
+using ns_log::LogMessage;
+
+using Converter = logdemon::ProtoConverter;
+using ProtoLogLevel = logserver::protocol::protobuf::LogMessage_Level;
+
+// Fixture holding one log message and the protobuf object it is converted into.
+struct LogMessageTest : public ::testing::Test {
+    LogMessage message;
+    Converter::ProtoLogMessage proto;
+    void convert() { Converter::log_message_to_proto(message, proto); }
+    // Converts a placeholder message carrying message_level and checks the resulting protobuf level.
+    void expect_log_level_converted(ProtoLogLevel proto_level, Logger::LogLevel message_level) {
+        message = LogMessage(1, "", 1, 1, "", "", message_level, "");
+        convert();
+        EXPECT_EQ(proto_level, proto.level());
+    }
+};
+
+void
+expect_proto_log_message_equal(int64_t exp_time_nanos,
+ const std::string& exp_hostname,
+ int32_t exp_process_id,
+ int32_t exp_thread_id,
+ const std::string& exp_service,
+ const std::string& exp_component,
+ ProtoLogLevel exp_level,
+ const std::string& exp_payload,
+ const Converter::ProtoLogMessage& proto)
+{
+ EXPECT_EQ(exp_time_nanos, proto.time_nanos());
+ EXPECT_EQ(exp_hostname, proto.hostname());
+ EXPECT_EQ(exp_process_id, proto.process_id());
+ EXPECT_EQ(exp_thread_id, proto.thread_id());
+ EXPECT_EQ(exp_service, proto.service());
+ EXPECT_EQ(exp_component, proto.component());
+ EXPECT_EQ(exp_level, proto.level());
+ EXPECT_EQ(exp_payload, proto.payload());
+}
+
+TEST_F(LogMessageTest, log_message_is_converted)
+{
+ message = LogMessage(12345, "foo_host", 3, 5, "foo_service", "foo_component", Logger::info, "foo_payload");
+ convert();
+ expect_proto_log_message_equal(12345, "foo_host", 3, 5, "foo_service", "foo_component",
+ ProtoLogLevel::LogMessage_Level_INFO, "foo_payload", proto);
+}
+
+TEST_F(LogMessageTest, log_levels_are_converted)
+{
+ expect_log_level_converted(ProtoLogLevel::LogMessage_Level_FATAL, Logger::fatal);
+ expect_log_level_converted(ProtoLogLevel::LogMessage_Level_ERROR, Logger::error);
+ expect_log_level_converted(ProtoLogLevel::LogMessage_Level_WARNING, Logger::warning);
+ expect_log_level_converted(ProtoLogLevel::LogMessage_Level_CONFIG, Logger::config);
+ expect_log_level_converted(ProtoLogLevel::LogMessage_Level_INFO, Logger::info);
+ expect_log_level_converted(ProtoLogLevel::LogMessage_Level_EVENT, Logger::event);
+ expect_log_level_converted(ProtoLogLevel::LogMessage_Level_DEBUG, Logger::debug);
+ expect_log_level_converted(ProtoLogLevel::LogMessage_Level_SPAM, Logger::spam);
+ expect_log_level_converted(ProtoLogLevel::LogMessage_Level_UNKNOWN, Logger::NUM_LOGLEVELS);
+}
+
+struct LogRequestTest : public ::testing::Test {
+ std::vector<LogMessage> messages;
+ Converter::ProtoLogRequest proto;
+ void convert() {
+ Converter::log_messages_to_proto(messages, proto);
+ }
+};
+
+TEST_F(LogRequestTest, log_messages_are_converted_to_request)
+{
+    messages.emplace_back(12345, "foo_host", 3, 5, "foo_service", "foo_component", Logger::info, "foo_payload");
+    messages.emplace_back(54321, "bar_host", 7, 9, "bar_service", "bar_component", Logger::event, "bar_payload");
+    convert();
+    ASSERT_EQ(2, proto.log_messages_size()); // fatal on mismatch: the indexed accesses below need both entries
+    expect_proto_log_message_equal(12345, "foo_host", 3, 5, "foo_service", "foo_component",
+                                   ProtoLogLevel::LogMessage_Level_INFO, "foo_payload", proto.log_messages(0));
+    expect_proto_log_message_equal(54321, "bar_host", 7, 9, "bar_service", "bar_component",
+                                   ProtoLogLevel::LogMessage_Level_EVENT, "bar_payload", proto.log_messages(1));
+}
+
+GTEST_MAIN_RUN_ALL_TESTS()
+
diff --git a/metrics/src/main/java/com/yahoo/metrics/Metric.java b/metrics/src/main/java/com/yahoo/metrics/Metric.java
index f1f389a4dfe..ad7ffc971f6 100644
--- a/metrics/src/main/java/com/yahoo/metrics/Metric.java
+++ b/metrics/src/main/java/com/yahoo/metrics/Metric.java
@@ -65,7 +65,7 @@ public abstract class Metric {
}
public List<String> getPathVector() {
- List<String> result = new ArrayList<String>();
+ List<String> result = new ArrayList<>();
result.add(getName());
MetricSet owner = this.owner;
while (owner != null) {
diff --git a/metrics/src/main/java/com/yahoo/metrics/MetricSet.java b/metrics/src/main/java/com/yahoo/metrics/MetricSet.java
index 5305002782d..a9fea1c65a4 100644
--- a/metrics/src/main/java/com/yahoo/metrics/MetricSet.java
+++ b/metrics/src/main/java/com/yahoo/metrics/MetricSet.java
@@ -6,11 +6,11 @@ import com.yahoo.text.XMLWriter;
import java.util.*;
import java.util.logging.Logger;
-public abstract class MetricSet extends Metric
-{
+public abstract class MetricSet extends Metric {
+
private static Logger log = Logger.getLogger(MetricSet.class.getName());
- List<Metric> metricOrder = new ArrayList<Metric>(); // Keep added order for reporting
+ List<Metric> metricOrder = new ArrayList<>(); // Keep added order for reporting
boolean registrationAltered; // Set to true if metrics have been
// registered/unregistered since last time
// it was reset
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/docker/DockerOperationsImpl.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/docker/DockerOperationsImpl.java
index 52d55dad087..e34e97dc2aa 100644
--- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/docker/DockerOperationsImpl.java
+++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/docker/DockerOperationsImpl.java
@@ -150,7 +150,7 @@ public class DockerOperationsImpl implements DockerOperations {
"ff02::1\tip6-allnodes\n" +
"ff02::2\tip6-allrouters\n" +
ipV6Local.getHostAddress() + '\t' + hostname + '\n');
- ipV4Local.ifPresent(ipv4 -> etcHosts.append(ipv4.getHostAddress() + '\t' + hostname + '\n'));
+ ipV4Local.ifPresent(ipv4 -> etcHosts.append(ipv4.getHostAddress()).append('\t').append(hostname).append('\n'));
containerData.addFile(Paths.get("/etc/hosts"), etcHosts.toString());
}
@@ -199,16 +199,17 @@ public class DockerOperationsImpl implements DockerOperations {
@Override
public ProcessResult executeCommandInNetworkNamespace(NodeAgentContext context, String... command) {
- final int containerPid = docker.getContainer(context.containerName())
+ int containerPid = docker.getContainer(context.containerName())
.filter(container -> container.state.isRunning())
.orElseThrow(() -> new RuntimeException(
"Found no running container named " + context.containerName().asString()))
.pid;
- final String[] wrappedCommand = Stream.concat(
- Stream.of("nsenter", String.format("--net=/proc/%d/ns/net", containerPid), "--"),
- Stream.of(command))
- .toArray(String[]::new);
+ String[] wrappedCommand = Stream.concat(Stream.of("nsenter",
+ String.format("--net=/proc/%d/ns/net", containerPid),
+ "--"),
+ Stream.of(command))
+ .toArray(String[]::new);
try {
Pair<Integer, String> result = processExecuter.exec(wrappedCommand);
@@ -267,7 +268,7 @@ public class DockerOperationsImpl implements DockerOperations {
}
private static void addMounts(NodeAgentContext context, Docker.CreateContainerCommand command) {
- final Path varLibSia = Paths.get("/var/lib/sia");
+ Path varLibSia = Paths.get("/var/lib/sia");
// Paths unique to each container
List<Path> paths = new ArrayList<>(Arrays.asList(
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/StorageMaintainer.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/StorageMaintainer.java
index 60fdb242cfd..0f150ad9065 100644
--- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/StorageMaintainer.java
+++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/StorageMaintainer.java
@@ -225,8 +225,6 @@ public class StorageMaintainer {
/** Deletes old log files for vespa, nginx, logstash, etc. */
public void removeOldFilesFromNode(NodeAgentContext context) {
Path[] logPaths = {
- context.pathInNodeUnderVespaHome("logs/elasticsearch2"),
- context.pathInNodeUnderVespaHome("logs/logstash2"),
context.pathInNodeUnderVespaHome("logs/daemontools_y"),
context.pathInNodeUnderVespaHome("logs/nginx"),
context.pathInNodeUnderVespaHome("logs/vespa")
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminImpl.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminImpl.java
index e9eacddb060..5b5d13ca346 100644
--- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminImpl.java
+++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminImpl.java
@@ -154,9 +154,9 @@ public class NodeAdminImpl implements NodeAdmin {
public void stopNodeAgentServices(List<String> hostnames) {
// Each container may spend 1-1:30 minutes stopping
hostnames.parallelStream()
- .filter(nodeAgentWithSchedulerByHostname::containsKey)
- .map(nodeAgentWithSchedulerByHostname::get)
- .forEach(NodeAgentWithScheduler::stopForHostSuspension);
+ .filter(nodeAgentWithSchedulerByHostname::containsKey)
+ .map(nodeAgentWithSchedulerByHostname::get)
+ .forEach(NodeAgentWithScheduler::stopForHostSuspension);
}
@Override
@@ -171,8 +171,8 @@ public class NodeAdminImpl implements NodeAdmin {
}
// Set-difference. Returns minuend minus subtrahend.
- private static <T> Set<T> diff(final Set<T> minuend, final Set<T> subtrahend) {
- final HashSet<T> result = new HashSet<>(minuend);
+ private static <T> Set<T> diff(Set<T> minuend, Set<T> subtrahend) {
+ var result = new HashSet<>(minuend);
result.removeAll(subtrahend);
return result;
}
diff --git a/pom.xml b/pom.xml
index 8c051381efe..7efcab645e1 100644
--- a/pom.xml
+++ b/pom.xml
@@ -127,9 +127,10 @@
<module>vespaclient-java</module>
<module>vespa-athenz</module>
<module>vespa-documentgen-plugin</module>
- <module>vespa_feed_perf</module>
<module>vespa-hadoop</module>
<module>vespa-http-client</module>
+ <module>vespa-testrunner-components</module>
+ <module>vespa_feed_perf</module>
<module>vespa_jersey2</module>
<module>vespajlib</module>
<module>vespalog</module>
diff --git a/searchcore/src/tests/proton/docsummary/docsummary.cpp b/searchcore/src/tests/proton/docsummary/docsummary.cpp
index fb8674b5255..a3acbfdbfe0 100644
--- a/searchcore/src/tests/proton/docsummary/docsummary.cpp
+++ b/searchcore/src/tests/proton/docsummary/docsummary.cpp
@@ -34,6 +34,7 @@
#include <vespa/vespalib/encoding/base64.h>
#include <vespa/config-bucketspaces.h>
#include <vespa/vespalib/testkit/testapp.h>
+#include <regex>
#include <vespa/log/log.h>
LOG_SETUP("docsummary_test");
@@ -655,7 +656,8 @@ Test::requireThatSummariesTimeout()
vespalib::SimpleBuffer buf;
vespalib::Slime summary = getSlime(*rep, 0, false);
JsonFormat::encode(summary, buf, false);
- EXPECT_TRUE(vespalib::Regexp("Timed out with -[0-9]+us left.").match(buf.get().make_stringref()));
+ auto bufstring = buf.get().make_stringref();
+ EXPECT_TRUE(std::regex_search(bufstring.data(), bufstring.data() + bufstring.size(), std::basic_regex<char>("Timed out with -[0-9]+us left.")));
}
void
diff --git a/searchlib/src/main/java/com/yahoo/searchlib/rankingexpression/Reference.java b/searchlib/src/main/java/com/yahoo/searchlib/rankingexpression/Reference.java
index cd5f42ac05c..829a796eee0 100644
--- a/searchlib/src/main/java/com/yahoo/searchlib/rankingexpression/Reference.java
+++ b/searchlib/src/main/java/com/yahoo/searchlib/rankingexpression/Reference.java
@@ -21,11 +21,11 @@ import java.util.stream.Collectors;
*/
public class Reference extends TypeContext.Name {
+ private final int hashCode;
+
private final Arguments arguments;
- /**
- * The output, or null if none
- */
+ /** The output, or null if none */
private final String output;
public Reference(String name, Arguments arguments, String output) {
@@ -34,6 +34,7 @@ public class Reference extends TypeContext.Name {
Objects.requireNonNull(arguments, "arguments cannot be null");
this.arguments = arguments;
this.output = output;
+ this.hashCode = Objects.hash(name(), arguments, output);
}
public Arguments arguments() { return arguments; }
@@ -115,7 +116,8 @@ public class Reference extends TypeContext.Name {
@Override
public boolean equals(Object o) {
if (o == this) return true;
- if (!(o instanceof Reference)) return false;
+ if ( ! (o instanceof Reference)) return false; // checked first: also rejects null before o is dereferenced
+ if (o.hashCode() != this.hashCode()) return false; // because this has a fast hashCode
Reference other = (Reference) o;
if (!Objects.equals(other.name(), this.name())) return false;
if (!Objects.equals(other.arguments, this.arguments)) return false;
@@ -125,7 +127,7 @@ public class Reference extends TypeContext.Name {
@Override
public int hashCode() {
- return Objects.hash(name(), arguments, output);
+ return hashCode;
}
@Override
diff --git a/simplemetrics/src/main/java/com/yahoo/metrics/simple/Bucket.java b/simplemetrics/src/main/java/com/yahoo/metrics/simple/Bucket.java
index 120d27d37ff..5e7b60411c9 100644
--- a/simplemetrics/src/main/java/com/yahoo/metrics/simple/Bucket.java
+++ b/simplemetrics/src/main/java/com/yahoo/metrics/simple/Bucket.java
@@ -21,7 +21,7 @@ import edu.umd.cs.findbugs.annotations.NonNull;
/**
* An aggregation of data which is only written to from a single thread.
*
- * @author <a href="mailto:steinar@yahoo-inc.com">Steinar Knutsen</a>
+ * @author Steinar Knutsen
*/
public class Bucket {
diff --git a/simplemetrics/src/main/java/com/yahoo/metrics/simple/DimensionCache.java b/simplemetrics/src/main/java/com/yahoo/metrics/simple/DimensionCache.java
index 0318368a31c..8893a88d94c 100644
--- a/simplemetrics/src/main/java/com/yahoo/metrics/simple/DimensionCache.java
+++ b/simplemetrics/src/main/java/com/yahoo/metrics/simple/DimensionCache.java
@@ -13,7 +13,7 @@ import java.util.Set;
* The persistence layer for metrics. Both CPU and memory hungry, but
* it runs in its own little world.
*
- * @author <a href="mailto:steinar@yahoo-inc.com">Steinar Knutsen</a>
+ * @author Steinar Knutsen
*/
class DimensionCache {
@@ -74,10 +74,8 @@ class DimensionCache {
private static final long MAX_AGE_MILLIS = 4 * 3600 * 1000;
- private void padMetric(String metric,
- Bucket toPresent,
- int currentDataPoints) {
- final LinkedHashMap<Point, TimeStampedMetric> cachedPoints = getCachedMetric(metric);
+ private void padMetric(String metric, Bucket toPresent, int currentDataPoints) {
+ LinkedHashMap<Point, TimeStampedMetric> cachedPoints = getCachedMetric(metric);
int toAdd = pointsToKeep - currentDataPoints;
@SuppressWarnings({"unchecked","rawtypes"})
Entry<Point, TimeStampedMetric>[] cachedEntries = cachedPoints.entrySet().toArray(new Entry[0]);
@@ -87,8 +85,8 @@ class DimensionCache {
if (leastOld.getValue().millis + MAX_AGE_MILLIS < nowMillis) {
continue;
}
- final Identifier id = new Identifier(metric, leastOld.getKey());
- if (!toPresent.hasIdentifier(id)) {
+ Identifier id = new Identifier(metric, leastOld.getKey());
+ if ( ! toPresent.hasIdentifier(id)) {
toPresent.put(id, leastOld.getValue().metric.pruneData());
--toAdd;
}
@@ -99,7 +97,7 @@ class DimensionCache {
private LinkedHashMap<Point, TimeStampedMetric> getCachedMetric(String metricName) {
LinkedHashMap<Point, TimeStampedMetric> points = persistentData.get(metricName);
if (points == null) {
- points = new LinkedHashMap<Point, TimeStampedMetric>(16, 0.75f, false) {
+ points = new LinkedHashMap<>(16, 0.75f, false) {
protected @Override boolean removeEldestEntry(Map.Entry<Point, TimeStampedMetric> eldest) {
return size() > pointsToKeep;
}
diff --git a/simplemetrics/src/main/java/com/yahoo/metrics/simple/Measurement.java b/simplemetrics/src/main/java/com/yahoo/metrics/simple/Measurement.java
index dee98024dda..cc7a4b0f717 100644
--- a/simplemetrics/src/main/java/com/yahoo/metrics/simple/Measurement.java
+++ b/simplemetrics/src/main/java/com/yahoo/metrics/simple/Measurement.java
@@ -5,7 +5,7 @@ package com.yahoo.metrics.simple;
* Wrapper class for the actually measured value. Candidate for removal, but I
* wanted a type instead of some opaque instance of Number.
*
- * @author <a href="mailto:steinar@yahoo-inc.com">Steinar Knutsen</a>
+ * @author Steinar Knutsen
*/
public class Measurement {
private final Number magnitude;
diff --git a/simplemetrics/src/main/java/com/yahoo/metrics/simple/MetricAggregator.java b/simplemetrics/src/main/java/com/yahoo/metrics/simple/MetricAggregator.java
index 42af115bae9..7d142a8e4d8 100644
--- a/simplemetrics/src/main/java/com/yahoo/metrics/simple/MetricAggregator.java
+++ b/simplemetrics/src/main/java/com/yahoo/metrics/simple/MetricAggregator.java
@@ -12,7 +12,7 @@ import com.yahoo.metrics.ManagerConfig;
* snapshots for external consumption. Using the correct executor gives the
* necessary guarantuees for this being invoked from only a single thread.
*
- * @author <a href="mailto:steinar@yahoo-inc.com">Steinar Knutsen</a>
+ * @author Steinar Knutsen
*/
class MetricAggregator implements Runnable {
@@ -44,7 +44,7 @@ class MetricAggregator implements Runnable {
}
private void createSnapshot(Bucket toDelete) {
- final Bucket toPresent = new Bucket();
+ Bucket toPresent = new Bucket();
for (Bucket b : buffer) {
if (b == null) {
continue;
@@ -57,8 +57,8 @@ class MetricAggregator implements Runnable {
private Bucket updateBuffer() {
List<Bucket> buckets = metricsCollection.fetch();
- final long toMillis = System.currentTimeMillis();
- final int bucketIndex = generation++ % buffer.length;
+ long toMillis = System.currentTimeMillis();
+ int bucketIndex = generation++ % buffer.length;
Bucket bucketToDelete = buffer[bucketIndex];
Bucket latest = new Bucket(fromMillis, toMillis);
for (Bucket b : buckets) {
diff --git a/simplemetrics/src/main/java/com/yahoo/metrics/simple/MetricReceiver.java b/simplemetrics/src/main/java/com/yahoo/metrics/simple/MetricReceiver.java
index e6e41ace04a..a2b82978a26 100644
--- a/simplemetrics/src/main/java/com/yahoo/metrics/simple/MetricReceiver.java
+++ b/simplemetrics/src/main/java/com/yahoo/metrics/simple/MetricReceiver.java
@@ -15,7 +15,7 @@ import com.yahoo.concurrent.ThreadLocalDirectory;
* in constructors for declaring instances of {@link Counter} and {@link Gauge}
* for the actual measurement of metrics.
*
- * @author <a href="mailto:steinar@yahoo-inc.com">Steinar Knutsen</a>
+ * @author Steinar Knutsen
*/
@Beta
public class MetricReceiver {
diff --git a/simplemetrics/src/main/java/com/yahoo/metrics/simple/Point.java b/simplemetrics/src/main/java/com/yahoo/metrics/simple/Point.java
index 5dc54c28ba0..672d05c1874 100644
--- a/simplemetrics/src/main/java/com/yahoo/metrics/simple/Point.java
+++ b/simplemetrics/src/main/java/com/yahoo/metrics/simple/Point.java
@@ -129,4 +129,5 @@ public final class Point implements Context {
Value[] getLocation() {
return location;
}
+
}
diff --git a/simplemetrics/src/main/java/com/yahoo/metrics/simple/Sample.java b/simplemetrics/src/main/java/com/yahoo/metrics/simple/Sample.java
index d55dce7bd79..837e93de09a 100644
--- a/simplemetrics/src/main/java/com/yahoo/metrics/simple/Sample.java
+++ b/simplemetrics/src/main/java/com/yahoo/metrics/simple/Sample.java
@@ -7,9 +7,10 @@ import com.yahoo.metrics.simple.UntypedMetric.AssumedType;
* A single metric measurement and all the meta data needed to route it
* correctly.
*
- * @author <a href="mailto:steinar@yahoo-inc.com">Steinar Knutsen</a>
+ * @author Steinar Knutsen
*/
public class Sample {
+
private final Identifier identifier;
private final Measurement measurement;
private final AssumedType metricType;
diff --git a/simplemetrics/src/main/java/com/yahoo/metrics/simple/jdisc/SimpleMetricConsumer.java b/simplemetrics/src/main/java/com/yahoo/metrics/simple/jdisc/SimpleMetricConsumer.java
index 66d7e0e7c2b..ee5f18e78d3 100644
--- a/simplemetrics/src/main/java/com/yahoo/metrics/simple/jdisc/SimpleMetricConsumer.java
+++ b/simplemetrics/src/main/java/com/yahoo/metrics/simple/jdisc/SimpleMetricConsumer.java
@@ -16,7 +16,7 @@ import com.yahoo.metrics.simple.UntypedMetric.AssumedType;
/**
* The single user facing part of the JDisc interfaces of simple metrics.
*
- * @author <a href="mailto:steinar@yahoo-inc.com">Steinar Knutsen</a>
+ * @author Steinar Knutsen
*/
public class SimpleMetricConsumer implements MetricConsumer {
diff --git a/storage/src/tests/distributor/bucketdbupdatertest.cpp b/storage/src/tests/distributor/bucketdbupdatertest.cpp
index b2d554c1e42..9795f5db5dc 100644
--- a/storage/src/tests/distributor/bucketdbupdatertest.cpp
+++ b/storage/src/tests/distributor/bucketdbupdatertest.cpp
@@ -26,6 +26,8 @@ using document::test::makeDocumentBucket;
using document::test::makeBucketSpace;
using document::BucketSpace;
using document::FixedBucketSpaces;
+using document::BucketId;
+using document::Bucket;
namespace storage::distributor {
@@ -112,6 +114,14 @@ class BucketDBUpdaterTest : public CppUnit::TestFixture,
CPPUNIT_TEST(adding_diverging_replica_to_existing_trusted_does_not_remove_trusted);
CPPUNIT_TEST(batch_update_from_distributor_change_does_not_mark_diverging_replicas_as_trusted);
CPPUNIT_TEST(global_distribution_hash_falls_back_to_legacy_format_upon_request_rejection);
+ CPPUNIT_TEST(non_owned_buckets_moved_to_read_only_db_on_ownership_change);
+ CPPUNIT_TEST(buckets_no_longer_available_are_not_moved_to_read_only_database);
+ CPPUNIT_TEST(non_owned_buckets_purged_when_read_only_support_is_config_disabled);
+ CPPUNIT_TEST(deferred_activated_state_does_not_enable_state_until_activation_received);
+ CPPUNIT_TEST(read_only_db_cleared_once_pending_state_is_activated);
+ CPPUNIT_TEST(read_only_db_is_populated_even_when_self_is_marked_down);
+ CPPUNIT_TEST(activate_cluster_state_request_with_mismatching_version_returns_actual_version);
+ CPPUNIT_TEST(activate_cluster_state_request_without_pending_transition_passes_message_through);
CPPUNIT_TEST_SUITE_END();
public:
@@ -123,10 +133,7 @@ protected:
void testDistributorChangeWithGrouping();
void testNormalUsageInitializing();
void testFailedRequestBucketInfo();
- void testNoResponses();
void testBitChange();
- void testInconsistentChecksum();
- void testAddEmptyNode();
void testNodeDown();
void testStorageNodeInMaintenanceClearsBucketsForNode();
void testNodeDownCopiesGetInSync();
@@ -177,6 +184,14 @@ protected:
void adding_diverging_replica_to_existing_trusted_does_not_remove_trusted();
void batch_update_from_distributor_change_does_not_mark_diverging_replicas_as_trusted();
void global_distribution_hash_falls_back_to_legacy_format_upon_request_rejection();
+ void non_owned_buckets_moved_to_read_only_db_on_ownership_change();
+ void buckets_no_longer_available_are_not_moved_to_read_only_database();
+ void non_owned_buckets_purged_when_read_only_support_is_config_disabled();
+ void deferred_activated_state_does_not_enable_state_until_activation_received();
+ void read_only_db_cleared_once_pending_state_is_activated();
+ void read_only_db_is_populated_even_when_self_is_marked_down();
+ void activate_cluster_state_request_with_mismatching_version_returns_actual_version();
+ void activate_cluster_state_request_without_pending_transition_passes_message_through();
auto &defaultDistributorBucketSpace() { return getBucketSpaceRepo().get(makeBucketSpace()); }
@@ -190,13 +205,32 @@ protected:
getBucketDBUpdater().getDistributorComponent().getIndex(),
clusterStateBundle,
"ui"));
- auto &repo = getBucketSpaceRepo();
- for (auto &elem : repo) {
- elem.second->setClusterState(clusterStateBundle.getDerivedClusterState(elem.first));
+ for (auto* repo : {&mutable_repo(), &read_only_repo()}) {
+ for (auto& space : *repo) {
+ space.second->setClusterState(clusterStateBundle.getDerivedClusterState(space.first));
+ }
}
return clusterInfo;
}
+ DistributorBucketSpaceRepo& mutable_repo() noexcept { return getBucketSpaceRepo(); }
+ // Note: not calling this "immutable_repo" since it may actually be modified by the pending
+ // cluster state component (just not by operations), so it would not have the expected semantics.
+ DistributorBucketSpaceRepo& read_only_repo() noexcept { return getReadOnlyBucketSpaceRepo(); }
+
+ BucketDatabase& mutable_default_db() noexcept {
+ return mutable_repo().get(FixedBucketSpaces::default_space()).getBucketDatabase();
+ }
+ BucketDatabase& mutable_global_db() noexcept {
+ return mutable_repo().get(FixedBucketSpaces::global_space()).getBucketDatabase();
+ }
+ BucketDatabase& read_only_default_db() noexcept {
+ return read_only_repo().get(FixedBucketSpaces::default_space()).getBucketDatabase();
+ }
+ BucketDatabase& read_only_global_db() noexcept {
+ return read_only_repo().get(FixedBucketSpaces::global_space()).getBucketDatabase();
+ }
+
static std::string getNodeList(std::vector<uint16_t> nodes, size_t count);
std::string getNodeList(std::vector<uint16_t> nodes);
@@ -210,11 +244,17 @@ protected:
return messagesPerBucketSpace * _bucketSpaces.size();
}
+ void trigger_completed_but_not_yet_activated_transition(
+ vespalib::stringref initial_state, uint32_t initial_buckets, uint32_t initial_expected_msgs,
+ vespalib::stringref pending_state, uint32_t pending_buckets, uint32_t pending_expected_msgs);
+
public:
using OutdatedNodesMap = dbtransition::OutdatedNodesMap;
void setUp() override {
createLinks();
_bucketSpaces = getBucketSpaces();
+ // Disable deferred activation by default (at least for now) to avoid breaking the entire world.
+ getConfig().setAllowStaleReadsDuringClusterStateTransitions(false);
};
void tearDown() override {
@@ -228,7 +268,7 @@ public:
uint32_t bucketCount,
uint32_t invalidBucketCount = 0)
{
- RequestBucketInfoReply* sreply = new RequestBucketInfoReply(cmd);
+ auto sreply = std::make_shared<RequestBucketInfoReply>(cmd);
sreply->setAddress(storageAddress(storageIndex));
api::RequestBucketInfoReply::EntryVector &vec = sreply->getBucketInfo();
@@ -261,7 +301,7 @@ public:
}
}
- return std::shared_ptr<api::RequestBucketInfoReply>(sreply);
+ return sreply;
}
void fakeBucketReply(const lib::ClusterState &state,
@@ -371,8 +411,7 @@ public:
void setSystemState(const lib::ClusterState& state) {
const size_t sizeBeforeState = _sender.commands.size();
getBucketDBUpdater().onSetSystemState(
- std::shared_ptr<api::SetSystemStateCommand>(
- new api::SetSystemStateCommand(state)));
+ std::make_shared<api::SetSystemStateCommand>(state));
// A lot of test logic has the assumption that all messages sent as a
// result of cluster state changes will be in increasing index order
// (for simplicity, not because this is required for correctness).
@@ -381,6 +420,26 @@ public:
sortSentMessagesByIndex(_sender, sizeBeforeState);
}
+ void set_cluster_state_bundle(const lib::ClusterStateBundle& state) {
+ const size_t sizeBeforeState = _sender.commands.size();
+ getBucketDBUpdater().onSetSystemState(
+ std::make_shared<api::SetSystemStateCommand>(state));
+ sortSentMessagesByIndex(_sender, sizeBeforeState);
+ }
+
+ bool activate_cluster_state_version(uint32_t version) {
+ return getBucketDBUpdater().onActivateClusterStateVersion(
+ std::make_shared<api::ActivateClusterStateVersionCommand>(version));
+ }
+
+ void assert_has_activate_cluster_state_reply_with_actual_version(uint32_t version) {
+ CPPUNIT_ASSERT_EQUAL(size_t(1), _sender.replies.size());
+ auto* response = dynamic_cast<api::ActivateClusterStateVersionReply*>(_sender.replies.back().get());
+ CPPUNIT_ASSERT(response != nullptr);
+ CPPUNIT_ASSERT_EQUAL(version, response->actualVersion());
+ _sender.clear();
+ }
+
void completeBucketInfoGathering(const lib::ClusterState& state,
size_t expectedMsgs,
uint32_t bucketCount = 1,
@@ -586,8 +645,9 @@ public:
OutdatedNodesMap outdatedNodesMap;
state = PendingClusterState::createForClusterStateChange(
- clock, clusterInfo, sender, owner.getBucketSpaceRepo(), cmd, outdatedNodesMap,
- api::Timestamp(1));
+ clock, clusterInfo, sender,
+ owner.getBucketSpaceRepo(), owner.getReadOnlyBucketSpaceRepo(),
+ cmd, outdatedNodesMap, api::Timestamp(1));
}
PendingClusterStateFixture(
@@ -598,23 +658,22 @@ public:
owner.createClusterInfo(oldClusterState));
state = PendingClusterState::createForDistributionChange(
- clock, clusterInfo, sender, owner.getBucketSpaceRepo(), api::Timestamp(1));
+ clock, clusterInfo, sender, owner.getBucketSpaceRepo(),
+ owner.getReadOnlyBucketSpaceRepo(), api::Timestamp(1));
}
};
- auto createPendingStateFixtureForStateChange(
+ std::unique_ptr<PendingClusterStateFixture> createPendingStateFixtureForStateChange(
const std::string& oldClusterState,
const std::string& newClusterState)
{
- return std::make_unique<PendingClusterStateFixture>(
- *this, oldClusterState, newClusterState);
+ return std::make_unique<PendingClusterStateFixture>(*this, oldClusterState, newClusterState);
}
- auto createPendingStateFixtureForDistributionChange(
+ std::unique_ptr<PendingClusterStateFixture> createPendingStateFixtureForDistributionChange(
const std::string& oldClusterState)
{
- return std::make_unique<PendingClusterStateFixture>(
- *this, oldClusterState);
+ return std::make_unique<PendingClusterStateFixture>(*this, oldClusterState);
}
};
@@ -622,8 +681,8 @@ CPPUNIT_TEST_SUITE_REGISTRATION(BucketDBUpdaterTest);
BucketDBUpdaterTest::BucketDBUpdaterTest()
: CppUnit::TestFixture(),
- DistributorTestUtil(),
- _bucketSpaces()
+ DistributorTestUtil(),
+ _bucketSpaces()
{
}
@@ -1533,7 +1592,8 @@ BucketDBUpdaterTest::getSentNodesDistributionChanged(
ClusterInformation::CSP clusterInfo(createClusterInfo(oldClusterState));
std::unique_ptr<PendingClusterState> state(
PendingClusterState::createForDistributionChange(
- clock, clusterInfo, sender, getBucketSpaceRepo(), api::Timestamp(1)));
+ clock, clusterInfo, sender, getBucketSpaceRepo(),
+ getReadOnlyBucketSpaceRepo(), api::Timestamp(1)));
sortSentMessagesByIndex(sender);
@@ -1698,8 +1758,8 @@ BucketDBUpdaterTest::testPendingClusterStateReceive()
OutdatedNodesMap outdatedNodesMap;
std::unique_ptr<PendingClusterState> state(
PendingClusterState::createForClusterStateChange(
- clock, clusterInfo, sender, getBucketSpaceRepo(), cmd, outdatedNodesMap,
- api::Timestamp(1)));
+ clock, clusterInfo, sender, getBucketSpaceRepo(), getReadOnlyBucketSpaceRepo(),
+ cmd, outdatedNodesMap, api::Timestamp(1)));
CPPUNIT_ASSERT_EQUAL(messageCount(3), sender.commands.size());
@@ -1863,8 +1923,8 @@ BucketDBUpdaterTest::mergeBucketLists(
ClusterInformation::CSP clusterInfo(createClusterInfo("cluster:d"));
std::unique_ptr<PendingClusterState> state(
PendingClusterState::createForClusterStateChange(
- clock, clusterInfo, sender, getBucketSpaceRepo(), cmd, outdatedNodesMap,
- beforeTime));
+ clock, clusterInfo, sender, getBucketSpaceRepo(), getReadOnlyBucketSpaceRepo(),
+ cmd, outdatedNodesMap, beforeTime));
parseInputData(existingData, beforeTime, *state, includeBucketInfo);
state->mergeIntoBucketDatabases();
@@ -1882,8 +1942,8 @@ BucketDBUpdaterTest::mergeBucketLists(
ClusterInformation::CSP clusterInfo(createClusterInfo(oldState.toString()));
std::unique_ptr<PendingClusterState> state(
PendingClusterState::createForClusterStateChange(
- clock, clusterInfo, sender, getBucketSpaceRepo(), cmd, outdatedNodesMap,
- afterTime));
+ clock, clusterInfo, sender, getBucketSpaceRepo(), getReadOnlyBucketSpaceRepo(),
+ cmd, outdatedNodesMap, afterTime));
parseInputData(newData, afterTime, *state, includeBucketInfo);
state->mergeIntoBucketDatabases();
@@ -2599,4 +2659,192 @@ void BucketDBUpdaterTest::global_distribution_hash_falls_back_to_legacy_format_u
CPPUNIT_ASSERT_EQUAL(current_hash, new_current_req.getDistributionHash());
}
+namespace {
+
+template <typename Func>
+void for_each_bucket(const BucketDatabase& db, const document::BucketSpace& space, Func&& f) {
+ BucketId last(0);
+ auto e = db.getNext(last);
+ while (e.valid()) {
+ f(space, e);
+ e = db.getNext(e.getBucketId());
+ }
+}
+
+template <typename Func>
+void for_each_bucket(const DistributorBucketSpaceRepo& repo, Func&& f) {
+ for (const auto& space : repo) {
+ for_each_bucket(space.second->getBucketDatabase(), space.first, f);
+ }
+}
+
+}
+
+using ConfigBuilder = vespa::config::content::core::StorDistributormanagerConfigBuilder;
+
+void BucketDBUpdaterTest::non_owned_buckets_moved_to_read_only_db_on_ownership_change() {
+ getConfig().setAllowStaleReadsDuringClusterStateTransitions(true);
+
+ lib::ClusterState initial_state("distributor:1 storage:4"); // All buckets owned by us by definition
+ set_cluster_state_bundle(lib::ClusterStateBundle(initial_state, {}, false)); // Skip activation step for simplicity
+
+ CPPUNIT_ASSERT_EQUAL(messageCount(4), _sender.commands.size());
+ constexpr uint32_t n_buckets = 10;
+ completeBucketInfoGathering(initial_state, messageCount(4), n_buckets);
+ _sender.clear();
+
+ CPPUNIT_ASSERT_EQUAL(size_t(n_buckets), mutable_default_db().size());
+ CPPUNIT_ASSERT_EQUAL(size_t(n_buckets), mutable_global_db().size());
+ CPPUNIT_ASSERT_EQUAL(size_t(0), read_only_default_db().size());
+ CPPUNIT_ASSERT_EQUAL(size_t(0), read_only_global_db().size());
+
+ lib::ClusterState pending_state("distributor:2 storage:4");
+
+ std::unordered_set<Bucket, Bucket::hash> buckets_not_owned_in_pending_state;
+ for_each_bucket(mutable_repo(), [&](const auto& space, const auto& entry) {
+ if (!getBucketDBUpdater().getDistributorComponent()
+ .ownsBucketInState(pending_state, makeDocumentBucket(entry.getBucketId()))) {
+ buckets_not_owned_in_pending_state.insert(Bucket(space, entry.getBucketId()));
+ }
+ });
+ CPPUNIT_ASSERT(!buckets_not_owned_in_pending_state.empty());
+
+ set_cluster_state_bundle(lib::ClusterStateBundle(pending_state, {}, true)); // Now requires activation
+
+ const auto buckets_not_owned_per_space = (buckets_not_owned_in_pending_state.size() / 2); // 2 spaces
+ const auto expected_mutable_buckets = n_buckets - buckets_not_owned_per_space;
+ CPPUNIT_ASSERT_EQUAL(expected_mutable_buckets, mutable_default_db().size());
+ CPPUNIT_ASSERT_EQUAL(expected_mutable_buckets, mutable_global_db().size());
+ CPPUNIT_ASSERT_EQUAL(buckets_not_owned_per_space, read_only_default_db().size());
+ CPPUNIT_ASSERT_EQUAL(buckets_not_owned_per_space, read_only_global_db().size());
+
+ for_each_bucket(read_only_repo(), [&](const auto& space, const auto& entry) {
+ CPPUNIT_ASSERT(buckets_not_owned_in_pending_state.find(Bucket(space, entry.getBucketId()))
+ != buckets_not_owned_in_pending_state.end());
+ });
+}
+
+void BucketDBUpdaterTest::buckets_no_longer_available_are_not_moved_to_read_only_database() {
+ constexpr uint32_t n_buckets = 10;
+ // No ownership change, just node down. Test redundancy is 2, so removing 2 nodes will
+ // cause some buckets to be entirely unavailable.
+ trigger_completed_but_not_yet_activated_transition("version:1 distributor:1 storage:4", n_buckets, 4,
+ "version:2 distributor:1 storage:4 .0.s:d .1.s:m", n_buckets, 0);
+
+ CPPUNIT_ASSERT_EQUAL(size_t(0), read_only_default_db().size());
+ CPPUNIT_ASSERT_EQUAL(size_t(0), read_only_global_db().size());
+}
+
+void BucketDBUpdaterTest::non_owned_buckets_purged_when_read_only_support_is_config_disabled() {
+ getConfig().setAllowStaleReadsDuringClusterStateTransitions(false);
+
+ lib::ClusterState initial_state("distributor:1 storage:4"); // All buckets owned by us by definition
+ set_cluster_state_bundle(lib::ClusterStateBundle(initial_state, {}, false)); // Skip activation step for simplicity
+
+ CPPUNIT_ASSERT_EQUAL(messageCount(4), _sender.commands.size());
+ constexpr uint32_t n_buckets = 10;
+ completeBucketInfoGathering(initial_state, messageCount(4), n_buckets);
+ _sender.clear();
+
+ // Nothing in read-only DB after first bulk load of buckets.
+ CPPUNIT_ASSERT_EQUAL(size_t(0), read_only_default_db().size());
+ CPPUNIT_ASSERT_EQUAL(size_t(0), read_only_global_db().size());
+
+ lib::ClusterState pending_state("distributor:2 storage:4");
+ setSystemState(pending_state);
+ // No buckets should be moved into read only db after ownership changes.
+ CPPUNIT_ASSERT_EQUAL(size_t(0), read_only_default_db().size());
+ CPPUNIT_ASSERT_EQUAL(size_t(0), read_only_global_db().size());
+}
+
+void BucketDBUpdaterTest::trigger_completed_but_not_yet_activated_transition(
+ vespalib::stringref initial_state_str,
+ uint32_t initial_buckets,
+ uint32_t initial_expected_msgs,
+ vespalib::stringref pending_state_str,
+ uint32_t pending_buckets,
+ uint32_t pending_expected_msgs)
+{
+ getConfig().setAllowStaleReadsDuringClusterStateTransitions(true);
+ lib::ClusterState initial_state(initial_state_str);
+ setSystemState(initial_state);
+ CPPUNIT_ASSERT_EQUAL(messageCount(initial_expected_msgs), _sender.commands.size());
+ completeBucketInfoGathering(initial_state, messageCount(initial_expected_msgs), initial_buckets);
+ _sender.clear();
+
+ lib::ClusterState pending_state(pending_state_str); // Ownership change
+ set_cluster_state_bundle(lib::ClusterStateBundle(pending_state, {}, true));
+ CPPUNIT_ASSERT_EQUAL(messageCount(pending_expected_msgs), _sender.commands.size());
+ completeBucketInfoGathering(pending_state, messageCount(pending_expected_msgs), pending_buckets);
+ _sender.clear();
+}
+
+void BucketDBUpdaterTest::deferred_activated_state_does_not_enable_state_until_activation_received() {
+ constexpr uint32_t n_buckets = 10;
+ trigger_completed_but_not_yet_activated_transition("version:1 distributor:2 storage:4", 0, 4,
+ "version:2 distributor:1 storage:4", n_buckets, 4);
+
+ // Version should not be switched over yet
+ CPPUNIT_ASSERT_EQUAL(uint32_t(1), getDistributor().getClusterStateBundle().getVersion());
+
+ CPPUNIT_ASSERT_EQUAL(uint64_t(0), mutable_default_db().size());
+ CPPUNIT_ASSERT_EQUAL(uint64_t(0), mutable_global_db().size());
+
+ CPPUNIT_ASSERT(!activate_cluster_state_version(2));
+
+ CPPUNIT_ASSERT_EQUAL(uint32_t(2), getDistributor().getClusterStateBundle().getVersion());
+ CPPUNIT_ASSERT_EQUAL(uint64_t(n_buckets), mutable_default_db().size());
+ CPPUNIT_ASSERT_EQUAL(uint64_t(n_buckets), mutable_global_db().size());
+}
+
+void BucketDBUpdaterTest::read_only_db_cleared_once_pending_state_is_activated() {
+ constexpr uint32_t n_buckets = 10;
+ trigger_completed_but_not_yet_activated_transition("version:1 distributor:1 storage:4", n_buckets, 4,
+ "version:2 distributor:2 storage:4", n_buckets, 0);
+ CPPUNIT_ASSERT(!activate_cluster_state_version(2));
+
+ CPPUNIT_ASSERT_EQUAL(uint64_t(0), read_only_default_db().size());
+ CPPUNIT_ASSERT_EQUAL(uint64_t(0), read_only_global_db().size());
+}
+
+void BucketDBUpdaterTest::read_only_db_is_populated_even_when_self_is_marked_down() {
+ constexpr uint32_t n_buckets = 10;
+ trigger_completed_but_not_yet_activated_transition("version:1 distributor:1 storage:4", n_buckets, 4,
+ "version:2 distributor:1 .0.s:d storage:4", n_buckets, 0);
+
+ // State not yet activated, so read-only DBs have got all the buckets we used to have.
+ CPPUNIT_ASSERT_EQUAL(uint64_t(0), mutable_default_db().size());
+ CPPUNIT_ASSERT_EQUAL(uint64_t(0), mutable_global_db().size());
+ CPPUNIT_ASSERT_EQUAL(uint64_t(n_buckets), read_only_default_db().size());
+ CPPUNIT_ASSERT_EQUAL(uint64_t(n_buckets), read_only_global_db().size());
+}
+
+void BucketDBUpdaterTest::activate_cluster_state_request_with_mismatching_version_returns_actual_version() {
+ constexpr uint32_t n_buckets = 10;
+ trigger_completed_but_not_yet_activated_transition("version:4 distributor:1 storage:4", n_buckets, 4,
+ "version:5 distributor:2 storage:4", n_buckets, 0);
+
+ CPPUNIT_ASSERT(activate_cluster_state_version(4)); // Too old version
+ assert_has_activate_cluster_state_reply_with_actual_version(5);
+
+ CPPUNIT_ASSERT(activate_cluster_state_version(6)); // More recent version than what has been observed
+ assert_has_activate_cluster_state_reply_with_actual_version(5);
+}
+
+void BucketDBUpdaterTest::activate_cluster_state_request_without_pending_transition_passes_message_through() {
+ constexpr uint32_t n_buckets = 10;
+ trigger_completed_but_not_yet_activated_transition("version:1 distributor:2 storage:4", 0, 4,
+ "version:2 distributor:1 storage:4", n_buckets, 4);
+ // Activate version 2; no pending cluster state after this.
+ CPPUNIT_ASSERT(!activate_cluster_state_version(2));
+
+ // No pending cluster state for version 3, just passed through to be implicitly bounced by state manager.
+ // Note: state manager is not modelled in this test, so we just check that the message handler returns
+ // false (meaning "didn't take message ownership") and there's no auto-generated reply.
+ CPPUNIT_ASSERT(!activate_cluster_state_version(3));
+ CPPUNIT_ASSERT_EQUAL(size_t(0), _sender.replies.size());
+}
+
+// TODO rename distributor config to imply two-phase functionality explicitly?
+
}
diff --git a/storage/src/tests/distributor/distributortestutil.cpp b/storage/src/tests/distributor/distributortestutil.cpp
index d3496d0c9f6..3f7f2eac63a 100644
--- a/storage/src/tests/distributor/distributortestutil.cpp
+++ b/storage/src/tests/distributor/distributortestutil.cpp
@@ -388,6 +388,16 @@ DistributorTestUtil::getBucketSpaceRepo() const {
return _distributor->getBucketSpaceRepo();
}
+DistributorBucketSpaceRepo &
+DistributorTestUtil::getReadOnlyBucketSpaceRepo() {
+ return _distributor->getReadOnlyBucketSpaceRepo();
+}
+
+const DistributorBucketSpaceRepo &
+DistributorTestUtil::getReadOnlyBucketSpaceRepo() const {
+ return _distributor->getReadOnlyBucketSpaceRepo();
+}
+
const lib::Distribution&
DistributorTestUtil::getDistribution() const {
return getBucketSpaceRepo().get(makeBucketSpace()).getDistribution();
diff --git a/storage/src/tests/distributor/distributortestutil.h b/storage/src/tests/distributor/distributortestutil.h
index 10cc5eeaca1..420111437d2 100644
--- a/storage/src/tests/distributor/distributortestutil.h
+++ b/storage/src/tests/distributor/distributortestutil.h
@@ -132,6 +132,8 @@ public:
const BucketDatabase& getBucketDatabase(document::BucketSpace space) const;
DistributorBucketSpaceRepo &getBucketSpaceRepo();
const DistributorBucketSpaceRepo &getBucketSpaceRepo() const;
+ DistributorBucketSpaceRepo& getReadOnlyBucketSpaceRepo();
+ const DistributorBucketSpaceRepo& getReadOnlyBucketSpaceRepo() const;
const lib::Distribution& getDistribution() const;
// "End to end" distribution change trigger, which will invoke the bucket
diff --git a/storage/src/tests/distributor/externaloperationhandlertest.cpp b/storage/src/tests/distributor/externaloperationhandlertest.cpp
index ddf88f50c36..40fe885dcb1 100644
--- a/storage/src/tests/distributor/externaloperationhandlertest.cpp
+++ b/storage/src/tests/distributor/externaloperationhandlertest.cpp
@@ -4,6 +4,7 @@
#include <vespa/storage/distributor/externaloperationhandler.h>
#include <vespa/storage/distributor/distributor.h>
#include <vespa/storage/distributor/distributormetricsset.h>
+#include <vespa/storage/distributor/operations/external/getoperation.h>
#include <vespa/storageapi/message/persistence.h>
#include <vespa/document/repo/documenttyperepo.h>
#include <vespa/document/update/documentupdate.h>
@@ -20,8 +21,11 @@ class ExternalOperationHandlerTest : public CppUnit::TestFixture,
CPPUNIT_TEST_SUITE(ExternalOperationHandlerTest);
CPPUNIT_TEST(testBucketSplitMask);
- CPPUNIT_TEST(testOperationRejectedOnWrongDistribution);
- CPPUNIT_TEST(testOperationRejectedOnPendingWrongDistribution);
+ CPPUNIT_TEST(mutating_operation_wdr_bounced_on_wrong_current_distribution);
+ CPPUNIT_TEST(mutating_operation_busy_bounced_on_wrong_pending_distribution);
+ CPPUNIT_TEST(mutating_operation_busy_bounced_if_no_cluster_state_received_yet);
+ CPPUNIT_TEST(read_only_operation_wdr_bounced_on_wrong_current_distribution);
+ CPPUNIT_TEST(read_only_operation_busy_bounced_if_no_cluster_state_received_yet);
CPPUNIT_TEST(reject_put_if_not_past_safe_time_point);
CPPUNIT_TEST(reject_remove_if_not_past_safe_time_point);
CPPUNIT_TEST(reject_update_if_not_past_safe_time_point);
@@ -37,6 +41,9 @@ class ExternalOperationHandlerTest : public CppUnit::TestFixture,
CPPUNIT_TEST(concurrent_get_and_mutation_do_not_conflict);
CPPUNIT_TEST(sequencing_works_across_mutation_types);
CPPUNIT_TEST(sequencing_can_be_explicitly_config_disabled);
+ CPPUNIT_TEST(gets_are_started_with_mutable_db_outside_transition_period);
+ CPPUNIT_TEST(gets_are_started_with_read_only_db_during_transition_period);
+ CPPUNIT_TEST(gets_are_busy_bounced_during_transition_period_if_stale_reads_disabled);
CPPUNIT_TEST_SUITE_END();
document::BucketId findNonOwnedUserBucketInState(vespalib::stringref state);
@@ -49,10 +56,13 @@ class ExternalOperationHandlerTest : public CppUnit::TestFixture,
std::shared_ptr<api::UpdateCommand> makeUpdateCommand(const vespalib::string& doc_type,
const vespalib::string& id) const;
std::shared_ptr<api::UpdateCommand> makeUpdateCommand() const;
+ std::shared_ptr<api::UpdateCommand> makeUpdateCommandForUser(uint64_t id) const;
std::shared_ptr<api::PutCommand> makePutCommand(const vespalib::string& doc_type,
const vespalib::string& id) const;
std::shared_ptr<api::RemoveCommand> makeRemoveCommand(const vespalib::string& id) const;
+ void verify_busy_bounced_due_to_no_active_state(std::shared_ptr<api::StorageCommand> cmd);
+
Operation::SP start_operation_verify_not_rejected(std::shared_ptr<api::StorageCommand> cmd);
void start_operation_verify_rejected(std::shared_ptr<api::StorageCommand> cmd);
@@ -80,10 +90,16 @@ class ExternalOperationHandlerTest : public CppUnit::TestFixture,
const vespalib::string _dummy_id{"id:foo:testdoctype1::bar"};
+ // Returns an arbitrary bucket not owned in the pending state
+ document::BucketId set_up_pending_cluster_state_transition(bool read_only_enabled);
+
protected:
void testBucketSplitMask();
- void testOperationRejectedOnWrongDistribution();
- void testOperationRejectedOnPendingWrongDistribution();
+ void mutating_operation_wdr_bounced_on_wrong_current_distribution();
+ void mutating_operation_busy_bounced_on_wrong_pending_distribution();
+ void mutating_operation_busy_bounced_if_no_cluster_state_received_yet();
+ void read_only_operation_wdr_bounced_on_wrong_current_distribution();
+ void read_only_operation_busy_bounced_if_no_cluster_state_received_yet();
void reject_put_if_not_past_safe_time_point();
void reject_remove_if_not_past_safe_time_point();
void reject_update_if_not_past_safe_time_point();
@@ -99,6 +115,9 @@ protected:
void concurrent_get_and_mutation_do_not_conflict();
void sequencing_works_across_mutation_types();
void sequencing_can_be_explicitly_config_disabled();
+ void gets_are_started_with_mutable_db_outside_transition_period();
+ void gets_are_started_with_read_only_db_during_transition_period();
+ void gets_are_busy_bounced_during_transition_period_if_stale_reads_disabled();
void assert_rejection_due_to_unsafe_time(
std::shared_ptr<api::StorageCommand> cmd);
@@ -220,6 +239,11 @@ ExternalOperationHandlerTest::makeUpdateCommand() const {
return makeUpdateCommand("testdoctype1", "id:foo:testdoctype1::baz");
}
+std::shared_ptr<api::UpdateCommand>
+ExternalOperationHandlerTest::makeUpdateCommandForUser(uint64_t id) const {
+ return makeUpdateCommand("testdoctype1", vespalib::make_string("id::testdoctype1:n=%" PRIu64 ":bar", id));
+}
+
std::shared_ptr<api::PutCommand> ExternalOperationHandlerTest::makePutCommand(
const vespalib::string& doc_type,
const vespalib::string& id) const {
@@ -233,10 +257,30 @@ std::shared_ptr<api::RemoveCommand> ExternalOperationHandlerTest::makeRemoveComm
}
void
-ExternalOperationHandlerTest::testOperationRejectedOnWrongDistribution()
+ExternalOperationHandlerTest::mutating_operation_wdr_bounced_on_wrong_current_distribution() // Update for the bucket from findNonOwnedUserBucketInState() must get WRONG_DISTRIBUTION plus the current state string.
{
createLinks();
- std::string state("distributor:2 storage:2");
+ std::string state("version:1 distributor:2 storage:2");
+ setupDistributor(1, 2, state);
+
+ document::BucketId bucket(findNonOwnedUserBucketInState(state));
+ auto cmd = makeUpdateCommandForUser(bucket.withoutCountBits());
+
+ Operation::SP genOp;
+ CPPUNIT_ASSERT(getExternalOperationHandler().handleMessage(cmd, genOp));
+ CPPUNIT_ASSERT(!genOp.get()); // Message was handled without producing an operation; the single reply is checked below.
+ CPPUNIT_ASSERT_EQUAL(size_t(1), _sender.replies.size());
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("ReturnCode(WRONG_DISTRIBUTION, "
+ "version:1 distributor:2 storage:2)"),
+ _sender.replies[0]->getResult().toString());
+}
+
+void
+ExternalOperationHandlerTest::read_only_operation_wdr_bounced_on_wrong_current_distribution()
+{
+ createLinks();
+ std::string state("version:1 distributor:2 storage:2");
setupDistributor(1, 2, state);
document::BucketId bucket(findNonOwnedUserBucketInState(state));
@@ -248,43 +292,65 @@ ExternalOperationHandlerTest::testOperationRejectedOnWrongDistribution()
CPPUNIT_ASSERT_EQUAL(size_t(1), _sender.replies.size());
CPPUNIT_ASSERT_EQUAL(
std::string("ReturnCode(WRONG_DISTRIBUTION, "
- "distributor:2 storage:2)"),
+ "version:1 distributor:2 storage:2)"),
_sender.replies[0]->getResult().toString());
}
void
-ExternalOperationHandlerTest::testOperationRejectedOnPendingWrongDistribution()
+ExternalOperationHandlerTest::mutating_operation_busy_bounced_on_wrong_pending_distribution() // Update sent while the 10 -> 11 transition is pending must be answered with BUSY.
{
createLinks();
- std::string current("distributor:2 storage:2");
- std::string pending("distributor:3 storage:3");
+ std::string current("version:10 distributor:2 storage:2");
+ std::string pending("version:11 distributor:3 storage:3");
setupDistributor(1, 3, current);
document::BucketId b(findOwned1stNotOwned2ndInStates(current, pending));
// Trigger pending cluster state
- auto stateCmd = std::make_shared<api::SetSystemStateCommand>(
- lib::ClusterState(pending));
+ auto stateCmd = std::make_shared<api::SetSystemStateCommand>(lib::ClusterState(pending));
getBucketDBUpdater().onSetSystemState(stateCmd);
- auto cmd = makeGetCommandForUser(b.withoutCountBits());
+ auto cmd = makeUpdateCommandForUser(b.withoutCountBits()); // Targets the bucket returned by findOwned1stNotOwned2ndInStates(current, pending).
Operation::SP genOp;
CPPUNIT_ASSERT(getExternalOperationHandler().handleMessage(cmd, genOp));
CPPUNIT_ASSERT(!genOp.get());
CPPUNIT_ASSERT_EQUAL(size_t(1), _sender.replies.size());
- // Fail back with _pending_ cluster state so client can start trying
- // correct distributor immediately. If that distributor has not yet
- // completed processing its pending cluster state, it'll return the
- // old (current) cluster state, causing the client to bounce between
- // the two until the pending states have been resolved. This is pretty
- // much inevitable with the current design.
CPPUNIT_ASSERT_EQUAL(
- std::string("ReturnCode(WRONG_DISTRIBUTION, "
- "distributor:3 storage:3)"),
+ std::string("ReturnCode(BUSY, Currently pending cluster state transition from version 10 to 11)"),
_sender.replies[0]->getResult().toString());
}
+void
+ExternalOperationHandlerTest::verify_busy_bounced_due_to_no_active_state(std::shared_ptr<api::StorageCommand> cmd)
+{ // Shared assertion helper: sets the distributor up with an empty state string, then requires cmd to yield no operation and exactly one BUSY reply.
+ createLinks();
+ std::string state{}; // No version --> not yet received
+ setupDistributor(1, 2, state);
+
+ Operation::SP genOp;
+ CPPUNIT_ASSERT(getExternalOperationHandler().handleMessage(cmd, genOp));
+ CPPUNIT_ASSERT(!genOp.get());
+ CPPUNIT_ASSERT_EQUAL(size_t(1), _sender.replies.size());
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("ReturnCode(BUSY, No cluster state activated yet)"),
+ _sender.replies[0]->getResult().toString());
+}
+
+// TODO NOT_READY is a more appropriate return code for this case, but must ensure it's
+// handled gracefully and silently through the stack. BUSY is a safe bet until then.
+void
+ExternalOperationHandlerTest::mutating_operation_busy_bounced_if_no_cluster_state_received_yet()
+{
+ verify_busy_bounced_due_to_no_active_state(makeUpdateCommandForUser(12345)); // Update = the mutating case; 12345 is presumably an arbitrary user ID (the helper does no bucket ownership lookup).
+}
+
+void
+ExternalOperationHandlerTest::read_only_operation_busy_bounced_if_no_cluster_state_received_yet()
+{
+ verify_busy_bounced_due_to_no_active_state(makeGetCommandForUser(12345)); // Get = the read-only case; the helper applies the same BUSY expectation to it.
+}
+
using TimePoint = ExternalOperationHandler::TimePoint;
using namespace std::literals::chrono_literals;
@@ -292,7 +358,7 @@ void ExternalOperationHandlerTest::assert_rejection_due_to_unsafe_time(
std::shared_ptr<api::StorageCommand> cmd)
{
createLinks();
- setupDistributor(1, 2, "distributor:1 storage:1");
+ setupDistributor(1, 2, "version:1 distributor:1 storage:1");
getClock().setAbsoluteTimeInSeconds(9);
getExternalOperationHandler().rejectFeedBeforeTimeReached(TimePoint(10s));
@@ -327,7 +393,7 @@ void ExternalOperationHandlerTest::reject_update_if_not_past_safe_time_point() {
void ExternalOperationHandlerTest::get_not_rejected_by_unsafe_time_point() {
createLinks();
- setupDistributor(1, 2, "distributor:1 storage:1");
+ setupDistributor(1, 2, "version:1 distributor:1 storage:1");
getClock().setAbsoluteTimeInSeconds(9);
getExternalOperationHandler().rejectFeedBeforeTimeReached(TimePoint(10s));
@@ -342,7 +408,7 @@ void ExternalOperationHandlerTest::get_not_rejected_by_unsafe_time_point() {
void ExternalOperationHandlerTest::mutation_not_rejected_when_safe_point_reached() {
createLinks();
- setupDistributor(1, 2, "distributor:1 storage:1");
+ setupDistributor(1, 2, "version:1 distributor:1 storage:1");
getClock().setAbsoluteTimeInSeconds(10);
getExternalOperationHandler().rejectFeedBeforeTimeReached(TimePoint(10s));
@@ -360,7 +426,7 @@ void ExternalOperationHandlerTest::mutation_not_rejected_when_safe_point_reached
void ExternalOperationHandlerTest::set_up_distributor_for_sequencing_test() {
createLinks();
- setupDistributor(1, 2, "distributor:1 storage:1");
+ setupDistributor(1, 2, "version:1 distributor:1 storage:1"); // Explicit version: the version-less (empty) state is the case verify_busy_bounced_due_to_no_active_state() expects to be BUSY-bounced.
}
Operation::SP ExternalOperationHandlerTest::start_operation_verify_not_rejected(
@@ -486,6 +552,52 @@ void ExternalOperationHandlerTest::sequencing_can_be_explicitly_config_disabled(
start_operation_verify_not_rejected(makeRemoveCommand(_dummy_id));
}
+void ExternalOperationHandlerTest::gets_are_started_with_mutable_db_outside_transition_period() {
+ createLinks();
+ std::string current = "version:1 distributor:1 storage:3";
+ setupDistributor(1, 3, current);
+ getConfig().setAllowStaleReadsDuringClusterStateTransitions(true); // Stale reads are enabled, but this test never triggers a pending state.
+
+ document::BucketId b(16, 1234); // Only 1 distributor (us), so doesn't matter
+
+ auto op = start_operation_verify_not_rejected(makeGetCommandForUser(b.withoutCountBits()));
+ auto& get_op = dynamic_cast<GetOperation&>(*op);
+ const auto* expected_space = &getBucketSpaceRepo().get(document::FixedBucketSpaces::default_space());
+ CPPUNIT_ASSERT_EQUAL(expected_space, &get_op.bucketSpace()); // Compared by address: the Get must use the default space of the regular (getBucketSpaceRepo) repo.
+}
+
+document::BucketId ExternalOperationHandlerTest::set_up_pending_cluster_state_transition(bool read_only_enabled) { // Leaves the distributor with a pending 123 -> 321 transition and the stale-read config set to read_only_enabled.
+ createLinks();
+ std::string current = "version:123 distributor:2 storage:2";
+ std::string pending = "version:321 distributor:3 storage:3";
+ setupDistributor(1, 3, current);
+ getConfig().setAllowStaleReadsDuringClusterStateTransitions(read_only_enabled);
+
+ // Trigger pending cluster state
+ auto stateCmd = std::make_shared<api::SetSystemStateCommand>(lib::ClusterState(pending));
+ getBucketDBUpdater().onSetSystemState(stateCmd);
+ return findOwned1stNotOwned2ndInStates(current, pending); // Bucket for callers to aim their commands at; lookup is delegated to findOwned1stNotOwned2ndInStates().
+}
+
+void ExternalOperationHandlerTest::gets_are_started_with_read_only_db_during_transition_period() {
+ auto non_owned_bucket = set_up_pending_cluster_state_transition(true); // true = stale reads allowed while the transition is pending.
+
+ auto op = start_operation_verify_not_rejected(makeGetCommandForUser(non_owned_bucket.withoutCountBits()));
+ auto& get_op = dynamic_cast<GetOperation&>(*op);
+ const auto* expected_space = &getReadOnlyBucketSpaceRepo().get(document::FixedBucketSpaces::default_space());
+ CPPUNIT_ASSERT_EQUAL(expected_space, &get_op.bucketSpace()); // Compared by address: the Get must use the default space of the read-only repo.
+}
+
+void ExternalOperationHandlerTest::gets_are_busy_bounced_during_transition_period_if_stale_reads_disabled() {
+ auto non_owned_bucket = set_up_pending_cluster_state_transition(false); // false = stale reads not allowed while the transition is pending.
+
+ start_operation_verify_rejected(makeGetCommandForUser(non_owned_bucket.withoutCountBits())); // The Get is expected to be rejected; the reply text is checked below.
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("ReturnCode(BUSY, Currently pending cluster state transition from version 123 to 321)"),
+ _sender.replies[0]->getResult().toString());
+
+}
+
// TODO support sequencing of RemoveLocation? It's a mutating operation, but supporting it with
// the current approach is not trivial. A RemoveLocation operation covers the _entire_ bucket
// sub tree under a given location, while the sequencer works on individual GIDs. Mapping the
diff --git a/storage/src/tests/storageserver/bouncertest.cpp b/storage/src/tests/storageserver/bouncertest.cpp
index 27c13a3707e..371c24accbc 100644
--- a/storage/src/tests/storageserver/bouncertest.cpp
+++ b/storage/src/tests/storageserver/bouncertest.cpp
@@ -43,6 +43,7 @@ struct BouncerTest : public CppUnit::TestFixture {
void outOfBoundsConfigValuesThrowException();
void abort_request_when_derived_bucket_space_node_state_is_marked_down();
void client_operations_are_allowed_through_on_cluster_state_down_distributor();
+ void cluster_state_activation_commands_are_not_bounced();
CPPUNIT_TEST_SUITE(BouncerTest);
CPPUNIT_TEST(testFutureTimestamp);
@@ -57,6 +58,7 @@ struct BouncerTest : public CppUnit::TestFixture {
CPPUNIT_TEST(outOfBoundsConfigValuesThrowException);
CPPUNIT_TEST(abort_request_when_derived_bucket_space_node_state_is_marked_down);
CPPUNIT_TEST(client_operations_are_allowed_through_on_cluster_state_down_distributor);
+ CPPUNIT_TEST(cluster_state_activation_commands_are_not_bounced);
CPPUNIT_TEST_SUITE_END();
using Priority = api::StorageMessage::Priority;
@@ -368,5 +370,17 @@ void BouncerTest::client_operations_are_allowed_through_on_cluster_state_down_di
CPPUNIT_ASSERT_EQUAL(uint64_t(0), _manager->metrics().unavailable_node_aborts.getValue());
}
+void BouncerTest::cluster_state_activation_commands_are_not_bounced() {
+ tearDown(); // Presumably discards the default fixture setup so the node can be re-created as a distributor below.
+ setUpAsNode(lib::NodeType::DISTRIBUTOR);
+
+ auto state = makeClusterStateBundle("version:10 distributor:3 .2.s:d storage:3", {}); // Our index (2) is down
+ _node->getNodeStateUpdater().setClusterStateBundle(state);
+
+ auto activate_cmd = std::make_shared<api::ActivateClusterStateVersionCommand>(11); // Activation for version 11, sent while the version 10 state above marks this node as down.
+ _upper->sendDown(activate_cmd);
+ assertMessageNotBounced();
+}
+
} // storage
diff --git a/storage/src/tests/storageserver/fnet_listener_test.cpp b/storage/src/tests/storageserver/fnet_listener_test.cpp
index 84051041d25..d40b230d725 100644
--- a/storage/src/tests/storageserver/fnet_listener_test.cpp
+++ b/storage/src/tests/storageserver/fnet_listener_test.cpp
@@ -27,6 +27,9 @@ public:
CPPUNIT_TEST(set_distribution_rpc_is_immediately_failed_if_listener_is_closed);
CPPUNIT_TEST(overly_large_uncompressed_bundle_size_parameter_returns_rpc_error);
CPPUNIT_TEST(mismatching_uncompressed_bundle_size_parameter_returns_rpc_error);
+ CPPUNIT_TEST(true_deferred_activation_flag_can_be_roundtrip_encoded);
+ CPPUNIT_TEST(false_deferred_activation_flag_can_be_roundtrip_encoded);
+ CPPUNIT_TEST(activate_cluster_state_version_rpc_enqueues_command_with_version);
CPPUNIT_TEST_SUITE_END();
void baseline_set_distribution_states_rpc_enqueues_command_with_state_bundle();
@@ -35,6 +38,9 @@ public:
void set_distribution_rpc_is_immediately_failed_if_listener_is_closed();
void overly_large_uncompressed_bundle_size_parameter_returns_rpc_error();
void mismatching_uncompressed_bundle_size_parameter_returns_rpc_error();
+ void true_deferred_activation_flag_can_be_roundtrip_encoded();
+ void false_deferred_activation_flag_can_be_roundtrip_encoded();
+ void activate_cluster_state_version_rpc_enqueues_command_with_version();
};
CPPUNIT_TEST_SUITE_REGISTRATION(FNetListenerTest);
@@ -54,24 +60,25 @@ struct DummyReturnHandler : FRT_IReturnHandler {
FNET_Connection* GetConnection() override { return nullptr; }
};
-struct Fixture {
+struct FixtureBase {
// TODO factor out Slobrok code to avoid need to set up live ports for unrelated tests
mbus::Slobrok slobrok;
vdstestlib::DirConfig config;
MockOperationEnqueuer enqueuer;
std::unique_ptr<FNetListener> fnet_listener;
- SlimeClusterStateBundleCodec codec;
DummyReturnHandler return_handler;
bool request_is_detached{false};
FRT_RPCRequest* bound_request{nullptr};
- Fixture() : config(getStandardConfig(true)) {
+ FixtureBase()
+ : config(getStandardConfig(true))
+ {
config.getConfig("stor-server").set("node_index", "1");
addSlobrokConfig(config, slobrok);
fnet_listener = std::make_unique<FNetListener>(enqueuer, config.getConfigId(), 0);
}
- ~Fixture() {
+ virtual ~FixtureBase() {
// Must destroy any associated message contexts that may have refs to FRT_Request
// instance _before_ we destroy the request itself.
enqueuer._enqueued.clear();
@@ -79,6 +86,12 @@ struct Fixture {
bound_request->SubRef();
}
}
+};
+
+struct SetStateFixture : FixtureBase {
+ SlimeClusterStateBundleCodec codec;
+
+ SetStateFixture() : FixtureBase() {}
void bind_request_params(EncodedClusterStateBundle& encoded_bundle, uint32_t uncompressed_length) {
bound_request = new FRT_RPCRequest(); // Naked new isn't pretty, but FRT_RPCRequest has internal refcounting
@@ -123,6 +136,10 @@ struct Fixture {
lib::ClusterStateBundle dummy_baseline_bundle() const {
return lib::ClusterStateBundle(lib::ClusterState("version:123 distributor:3 storage:3"));
}
+
+ lib::ClusterStateBundle dummy_baseline_bundle_with_deferred_activation(bool deferred) const {
+ return lib::ClusterStateBundle(lib::ClusterState("version:123 distributor:3 storage:3"), {}, deferred);
+ }
};
std::shared_ptr<const lib::ClusterState> state_of(vespalib::stringref state) {
@@ -138,17 +155,17 @@ vespalib::string make_compressable_state_string() {
ss.str().data(), ss.str().data());
}
-}
+} // anon namespace
void FNetListenerTest::baseline_set_distribution_states_rpc_enqueues_command_with_state_bundle() {
- Fixture f;
+ SetStateFixture f;
auto baseline = f.dummy_baseline_bundle();
f.assert_request_received_and_propagated(baseline);
}
void FNetListenerTest::set_distribution_states_rpc_with_derived_enqueues_command_with_state_bundle() {
- Fixture f;
+ SetStateFixture f;
lib::ClusterStateBundle spaces_bundle(
lib::ClusterState("version:123 distributor:3 storage:3"),
{{FixedBucketSpaces::default_space(), state_of("version:123 distributor:3 storage:3 .0.s:d")},
@@ -158,7 +175,7 @@ void FNetListenerTest::set_distribution_states_rpc_with_derived_enqueues_command
}
void FNetListenerTest::compressed_bundle_is_transparently_uncompressed() {
- Fixture f;
+ SetStateFixture f;
auto state_str = make_compressable_state_string();
lib::ClusterStateBundle compressable_bundle{lib::ClusterState(state_str)};
@@ -171,24 +188,73 @@ void FNetListenerTest::compressed_bundle_is_transparently_uncompressed() {
}
void FNetListenerTest::set_distribution_rpc_is_immediately_failed_if_listener_is_closed() {
- Fixture f;
+ SetStateFixture f;
f.create_request(f.dummy_baseline_bundle());
f.fnet_listener->close();
f.assert_request_returns_error_response(RPCRequestWrapper::ERR_NODE_SHUTTING_DOWN);
}
void FNetListenerTest::overly_large_uncompressed_bundle_size_parameter_returns_rpc_error() {
- Fixture f;
+ SetStateFixture f;
auto encoded_bundle = f.codec.encode(f.dummy_baseline_bundle());
f.bind_request_params(encoded_bundle, FNetListener::StateBundleMaxUncompressedSize + 1);
f.assert_request_returns_error_response(RPCRequestWrapper::ERR_BAD_REQUEST);
}
void FNetListenerTest::mismatching_uncompressed_bundle_size_parameter_returns_rpc_error() {
- Fixture f;
+ SetStateFixture f;
auto encoded_bundle = f.codec.encode(f.dummy_baseline_bundle());
f.bind_request_params(encoded_bundle, encoded_bundle._buffer->getDataLen() + 100);
f.assert_request_returns_error_response(RPCRequestWrapper::ERR_BAD_REQUEST);
}
+void FNetListenerTest::true_deferred_activation_flag_can_be_roundtrip_encoded() {
+ SetStateFixture f;
+ f.assert_request_received_and_propagated(f.dummy_baseline_bundle_with_deferred_activation(true)); // Bundle built with deferred == true; the fixture helper presumably compares the enqueued bundle against it (helper body not shown here).
+
+}
+
+void FNetListenerTest::false_deferred_activation_flag_can_be_roundtrip_encoded() {
+ SetStateFixture f;
+ f.assert_request_received_and_propagated(f.dummy_baseline_bundle_with_deferred_activation(false)); // Same check with deferred == false, so both flag values are covered.
+}
+
+struct ActivateStateFixture : FixtureBase { // Fixture that drives FNetListener::RPC_activateClusterStateVersion with a hand-built request.
+ ActivateStateFixture() : FixtureBase() {}
+
+ void bind_request_params(uint32_t activate_version) {
+ bound_request = new FRT_RPCRequest(); // Naked new isn't pretty, but FRT_RPCRequest has internal refcounting
+ auto* params = bound_request->GetParams();
+ params->AddInt32(activate_version); // The version to activate is the only parameter added to the request.
+
+ bound_request->SetDetachedPT(&request_is_detached); // Presumably makes the request report detachment through request_is_detached, which the assertion helper below checks.
+ bound_request->SetReturnHandler(&return_handler);
+ }
+
+ void create_request(uint32_t activate_version) {
+ // Only 1 request allowed per fixture due to lifetime handling snags
+ assert(bound_request == nullptr);
+ bind_request_params(activate_version);
+ }
+
+ void assert_enqueued_operation_has_activate_version(uint32_t version) {
+ CPPUNIT_ASSERT(bound_request != nullptr);
+ CPPUNIT_ASSERT(request_is_detached);
+ CPPUNIT_ASSERT_EQUAL(size_t(1), enqueuer._enqueued.size());
+ auto& state_request = dynamic_cast<const api::ActivateClusterStateVersionCommand&>(*enqueuer._enqueued[0]); // Reference cast: throws std::bad_cast if the enqueued message is of another type.
+ CPPUNIT_ASSERT_EQUAL(version, state_request.version());
+ }
+
+ void assert_request_received_and_propagated(uint32_t activate_version) {
+ create_request(activate_version);
+ fnet_listener->RPC_activateClusterStateVersion(bound_request); // Invoke the RPC handler directly on the listener.
+ assert_enqueued_operation_has_activate_version(activate_version);
+ }
+};
+
+void FNetListenerTest::activate_cluster_state_version_rpc_enqueues_command_with_version() {
+ ActivateStateFixture f;
+ f.assert_request_received_and_propagated(1234567); // 1234567 appears to be an arbitrary version; the fixture checks that the enqueued command carries the same value.
+}
+
}
diff --git a/storage/src/tests/storageserver/statemanagertest.cpp b/storage/src/tests/storageserver/statemanagertest.cpp
index 19f414482db..cdf990fa28f 100644
--- a/storage/src/tests/storageserver/statemanagertest.cpp
+++ b/storage/src/tests/storageserver/statemanagertest.cpp
@@ -37,6 +37,7 @@ struct StateManagerTest : public CppUnit::TestFixture {
void can_explicitly_send_get_node_state_reply();
void explicit_node_state_replying_without_pending_request_immediately_replies_on_next_request();
void immediate_node_state_replying_is_tracked_per_controller();
+ void activation_command_is_bounced_with_current_cluster_state_version();
CPPUNIT_TEST_SUITE(StateManagerTest);
CPPUNIT_TEST(testSystemState);
@@ -45,8 +46,10 @@ struct StateManagerTest : public CppUnit::TestFixture {
CPPUNIT_TEST(can_explicitly_send_get_node_state_reply);
CPPUNIT_TEST(explicit_node_state_replying_without_pending_request_immediately_replies_on_next_request);
CPPUNIT_TEST(immediate_node_state_replying_is_tracked_per_controller);
+ CPPUNIT_TEST(activation_command_is_bounced_with_current_cluster_state_version);
CPPUNIT_TEST_SUITE_END();
+ void force_current_cluster_state_version(uint32_t version);
void mark_reported_node_state_up();
void send_down_get_node_state_request(uint16_t controller_index);
void assert_ok_get_node_state_reply_sent_and_clear();
@@ -101,6 +104,12 @@ StateManagerTest::tearDown() {
_metricManager.reset();
}
+void StateManagerTest::force_current_cluster_state_version(uint32_t version) { // Test helper: re-installs a copy of the manager's current baseline state with its version replaced.
+ ClusterState state(*_manager->getClusterStateBundle()->getBaselineClusterState());
+ state.setVersion(version);
+ _manager->setClusterStateBundle(lib::ClusterStateBundle(state)); // The new bundle is constructed from the baseline state alone.
+}
+
#define GET_ONLY_OK_REPLY(varname) \
{ \
CPPUNIT_ASSERT_EQUAL(size_t(1), _upper->getNumReplies()); \
@@ -236,9 +245,7 @@ StateManagerTest::testReportedNodeState()
}
void StateManagerTest::current_cluster_state_version_is_included_in_host_info_json() {
- ClusterState state(*_manager->getClusterStateBundle()->getBaselineClusterState());
- state.setVersion(123);
- _manager->setClusterStateBundle(lib::ClusterStateBundle(state));
+ force_current_cluster_state_version(123);
std::string nodeInfoString(_manager->getNodeInfo());
vespalib::Memory goldenMemory(nodeInfoString);
@@ -343,4 +350,21 @@ void StateManagerTest::immediate_node_state_replying_is_tracked_per_controller()
CPPUNIT_ASSERT_EQUAL(size_t(0), _upper->getNumReplies());
}
+void StateManagerTest::activation_command_is_bounced_with_current_cluster_state_version() {
+ force_current_cluster_state_version(12345); // Current version (12345) differs from the version requested below (12340).
+
+ auto cmd = std::make_shared<api::ActivateClusterStateVersionCommand>(12340);
+ cmd->setTimeout(10000000);
+ cmd->setSourceIndex(0);
+ _upper->sendDown(cmd);
+
+ CPPUNIT_ASSERT_EQUAL(size_t(1), _upper->getNumReplies());
+ std::shared_ptr<api::StorageReply> reply;
+ GET_ONLY_OK_REPLY(reply); // Implicitly clears messages from _upper
+ CPPUNIT_ASSERT_EQUAL(api::MessageType::ACTIVATE_CLUSTER_STATE_VERSION_REPLY, reply->getType());
+ auto& activate_reply = dynamic_cast<api::ActivateClusterStateVersionReply&>(*reply);
+ CPPUNIT_ASSERT_EQUAL(uint32_t(12340), activate_reply.activateVersion()); // Reply echoes the version that was requested...
+ CPPUNIT_ASSERT_EQUAL(uint32_t(12345), activate_reply.actualVersion()); // ...and reports the version forced as current above.
+}
+
} // storage
diff --git a/storage/src/vespa/storage/config/distributorconfiguration.cpp b/storage/src/vespa/storage/config/distributorconfiguration.cpp
index 44cf56fdff8..294ce56f536 100644
--- a/storage/src/vespa/storage/config/distributorconfiguration.cpp
+++ b/storage/src/vespa/storage/config/distributorconfiguration.cpp
@@ -34,6 +34,7 @@ DistributorConfiguration::DistributorConfiguration(StorageComponent& component)
_enableHostInfoReporting(true),
_disableBucketActivation(false),
_sequenceMutatingOperations(true),
+ _allowStaleReadsDuringClusterStateTransitions(false),
_minimumReplicaCountingMode(ReplicaCountingMode::TRUSTED)
{ }
@@ -144,6 +145,7 @@ DistributorConfiguration::configure(const vespa::config::content::core::StorDist
_enableHostInfoReporting = config.enableHostInfoReporting;
_disableBucketActivation = config.disableBucketActivation;
_sequenceMutatingOperations = config.sequenceMutatingOperations;
+ _allowStaleReadsDuringClusterStateTransitions = config.allowStaleReadsDuringClusterStateTransitions;
_minimumReplicaCountingMode = config.minimumReplicaCountingMode;
diff --git a/storage/src/vespa/storage/config/distributorconfiguration.h b/storage/src/vespa/storage/config/distributorconfiguration.h
index 5dfc4f66cb8..8c84fef47b5 100644
--- a/storage/src/vespa/storage/config/distributorconfiguration.h
+++ b/storage/src/vespa/storage/config/distributorconfiguration.h
@@ -235,6 +235,13 @@ public:
void setSequenceMutatingOperations(bool sequenceMutations) noexcept {
_sequenceMutatingOperations = sequenceMutations;
}
+
+ bool allowStaleReadsDuringClusterStateTransitions() const noexcept { // Flag is false after construction and is overwritten from the config in configure().
+ return _allowStaleReadsDuringClusterStateTransitions;
+ }
+ void setAllowStaleReadsDuringClusterStateTransitions(bool allow) noexcept { // Direct override of the flag; the tests use it via getConfig().
+ _allowStaleReadsDuringClusterStateTransitions = allow;
+ }
private:
DistributorConfiguration(const DistributorConfiguration& other);
@@ -274,6 +281,7 @@ private:
bool _enableHostInfoReporting;
bool _disableBucketActivation;
bool _sequenceMutatingOperations;
+ bool _allowStaleReadsDuringClusterStateTransitions;
DistrConfig::MinimumReplicaCountingMode _minimumReplicaCountingMode;
diff --git a/storage/src/vespa/storage/config/stor-distributormanager.def b/storage/src/vespa/storage/config/stor-distributormanager.def
index 89aad427ca9..d4f69073cc6 100644
--- a/storage/src/vespa/storage/config/stor-distributormanager.def
+++ b/storage/src/vespa/storage/config/stor-distributormanager.def
@@ -184,3 +184,10 @@ sequence_mutating_operations bool default=true
## towards a node if it has indicated that its merge queues are full or it is
## suffering from resource exhaustion.
inhibit_merge_sending_on_busy_node_duration_sec int default=10
+
+## If set, enables potentially stale reads during cluster state transitions where
+## buckets change ownership. This also implicitly enables support for two-phase
+## cluster state transitions on the distributor.
+## For this option to take effect, the cluster controller must also have two-phase
+## states enabled.
+allow_stale_reads_during_cluster_state_transitions bool default=false
diff --git a/storage/src/vespa/storage/distributor/bucketdbupdater.cpp b/storage/src/vespa/storage/distributor/bucketdbupdater.cpp
index a223001af79..e9595b4a960 100644
--- a/storage/src/vespa/storage/distributor/bucketdbupdater.cpp
+++ b/storage/src/vespa/storage/distributor/bucketdbupdater.cpp
@@ -20,11 +20,12 @@ using document::BucketSpace;
namespace storage::distributor {
BucketDBUpdater::BucketDBUpdater(Distributor& owner,
- DistributorBucketSpaceRepo &bucketSpaceRepo,
+ DistributorBucketSpaceRepo& bucketSpaceRepo,
+ DistributorBucketSpaceRepo& readOnlyBucketSpaceRepo,
DistributorMessageSender& sender,
DistributorComponentRegister& compReg)
: framework::StatusReporter("bucketdb", "Bucket DB Updater"),
- _distributorComponent(owner, bucketSpaceRepo, compReg, "Bucket DB Updater"),
+ _distributorComponent(owner, bucketSpaceRepo, readOnlyBucketSpaceRepo, compReg, "Bucket DB Updater"),
_sender(sender),
_transitionTimer(_distributorComponent.getClock())
{
@@ -53,6 +54,13 @@ BucketDBUpdater::print(std::ostream& out, bool verbose, const std::string& inden
}
bool
+BucketDBUpdater::shouldDeferStateEnabling() const noexcept // True iff the distributor config allows stale reads during cluster state transitions.
+{
+ return _distributorComponent.getDistributor().getConfig()
+ .allowStaleReadsDuringClusterStateTransitions();
+}
+
+bool
BucketDBUpdater::hasPendingClusterState() const
{
return static_cast<bool>(_pendingClusterState);
@@ -113,25 +121,35 @@ void
BucketDBUpdater::removeSuperfluousBuckets(
const lib::ClusterStateBundle& newState)
{
+ const bool move_to_read_only_db = shouldDeferStateEnabling(); // When set, non-owned buckets are copied into the read-only DB before being dropped from the mutable one.
for (auto &elem : _distributorComponent.getBucketSpaceRepo()) {
const auto &newDistribution(elem.second->getDistribution());
const auto &oldClusterState(elem.second->getClusterState());
auto &bucketDb(elem.second->getBucketDatabase());
+ auto& readOnlyDb(_distributorComponent.getReadOnlyBucketSpaceRepo().get(elem.first).getBucketDatabase()); // Read-only DB for the same bucket space as bucketDb.
// Remove all buckets not belonging to this distributor, or
// being on storage nodes that are no longer up.
NodeRemover proc(
oldClusterState,
*newState.getDerivedClusterState(elem.first),
- _distributorComponent.getBucketIdFactory(),
_distributorComponent.getIndex(),
newDistribution,
_distributorComponent.getDistributor().getStorageNodeUpStates());
bucketDb.forEach(proc);
- for (const auto & entry :proc.getBucketsToRemove()) {
- bucketDb.remove(entry);
+ for (const auto & bucket : proc.getBucketsToRemove()) {
+ bucketDb.remove(bucket);
+ }
+ // TODO vec of Entry instead to avoid lookup and remove? Uses more transient memory...
+ for (const auto& bucket : proc.getNonOwnedBuckets()) {
+ if (move_to_read_only_db) {
+ auto db_entry = bucketDb.get(bucket); // Fetch the entry before it is removed from the mutable DB below.
+ readOnlyDb.update(db_entry); // TODO Entry move support
+ }
+ bucketDb.remove(bucket);
}
+
}
}
@@ -154,6 +172,14 @@ BucketDBUpdater::completeTransitionTimer()
}
void
+BucketDBUpdater::clearReadOnlyBucketRepoDatabases() // Empties the bucket database of every bucket space in the read-only repo.
+{
+ for (auto& space : _distributorComponent.getReadOnlyBucketSpaceRepo()) {
+ space.second->getBucketDatabase().clear();
+ }
+}
+
+void
BucketDBUpdater::storageDistributionChanged()
{
ensureTransitionTimerStarted();
@@ -169,6 +195,7 @@ BucketDBUpdater::storageDistributionChanged()
std::move(clusterInfo),
_sender,
_distributorComponent.getBucketSpaceRepo(),
+ _distributorComponent.getReadOnlyBucketSpaceRepo(),
_distributorComponent.getUniqueTimestamp());
_outdatedNodesMap = _pendingClusterState->getOutdatedNodesMap();
}
@@ -176,14 +203,22 @@ BucketDBUpdater::storageDistributionChanged()
void
BucketDBUpdater::replyToPreviousPendingClusterStateIfAny()
{
- if (_pendingClusterState.get() &&
- _pendingClusterState->getCommand().get())
- {
+ if (_pendingClusterState.get() && _pendingClusterState->hasCommand()) {
_distributorComponent.sendUp(
std::make_shared<api::SetSystemStateReply>(*_pendingClusterState->getCommand()));
}
}
+void
+BucketDBUpdater::replyToActivationWithActualVersion(
+ const api::ActivateClusterStateVersionCommand& cmd,
+ uint32_t actualVersion)
+{ // Creates the reply for cmd, sets its actual-version field to actualVersion and sends it up.
+ auto reply = std::make_shared<api::ActivateClusterStateVersionReply>(cmd);
+ reply->setActualVersion(actualVersion);
+ _distributorComponent.sendUp(reply); // TODO let API accept rvalues
+}
+
bool
BucketDBUpdater::onSetSystemState(
const std::shared_ptr<api::SetSystemStateCommand>& cmd)
@@ -214,6 +249,7 @@ BucketDBUpdater::onSetSystemState(
std::move(clusterInfo),
_sender,
_distributorComponent.getBucketSpaceRepo(),
+ _distributorComponent.getReadOnlyBucketSpaceRepo(),
cmd,
_outdatedNodesMap,
_distributorComponent.getUniqueTimestamp());
@@ -225,6 +261,39 @@ BucketDBUpdater::onSetSystemState(
return true;
}
+bool
+BucketDBUpdater::onActivateClusterStateVersion(const std::shared_ptr<api::ActivateClusterStateVersionCommand>& cmd)
+{ // Returns true when the command was answered here; returns false to let it continue down the call chain.
+ if (hasPendingClusterState() && _pendingClusterState->isVersionedTransition()) {
+ const auto pending_version = _pendingClusterState->clusterStateVersion();
+ if (pending_version == cmd->version()) { // Activation targets the state that is currently pending.
+ if (isPendingClusterStateCompleted()) {
+ assert(_pendingClusterState->isDeferred()); // Non-deferred states are activated as soon as they complete, so only a deferred one can still be pending here.
+ activatePendingClusterState();
+ } else {
+ LOG(error, "Received cluster state activation for pending version %u "
+ "without pending state being complete yet. This is not expected, "
+ "as no activation should be sent before all distributors have "
+ "reported that state processing is complete.", pending_version);
+ replyToActivationWithActualVersion(*cmd, 0); // Invalid version, will cause re-send (hopefully when completed).
+ return true;
+ }
+ } else { // Version mismatch: answer with the version that is actually pending.
+ replyToActivationWithActualVersion(*cmd, pending_version);
+ return true;
+ }
+ } else if (shouldDeferStateEnabling()) {
+ // Likely just a resend, but log warn for now to get a feel of how common it is.
+ LOG(warning, "Received cluster state activation command for version %u, which "
+ "has no corresponding pending state. Likely resent operation.", cmd->version());
+ } else {
+ LOG(debug, "Received cluster state activation command for version %u, but distributor "
+ "config does not have deferred activation enabled. Treating as no-op.", cmd->version());
+ }
+ // Fall through to next link in call chain that cares about this message.
+ return false;
+}
+
BucketDBUpdater::MergeReplyGuard::~MergeReplyGuard()
{
if (_reply) {
@@ -485,14 +554,45 @@ BucketDBUpdater::isPendingClusterStateCompleted() const
void
BucketDBUpdater::processCompletedPendingClusterState()
{
+ if (_pendingClusterState->isDeferred()) { // Deferred: forward the state command now, but leave activation to an explicit activation command.
+ LOG(debug, "Deferring completion of pending cluster state version %u until explicitly activated",
+ _pendingClusterState->clusterStateVersion());
+ assert(_pendingClusterState->hasCommand()); // Deferred transitions should only ever be created by state commands.
+ // Sending down SetSystemState command will reach the state manager and a reply
+ // will be auto-sent back to the cluster controller in charge. Once this happens,
+ // it will send an explicit activation command once all distributors have reported
+ // that their pending cluster states have completed.
+ // A booting distributor will treat itself as "system Up" before the state has actually
+ // taken effect via activation. External operation handler will keep operations from
+ // actually being scheduled until state has been activated. The external operation handler
+ // needs to be explicitly aware of the case where no state has been activated yet.
+ _distributorComponent.getDistributor().getMessageSender().sendDown(
+ _pendingClusterState->getCommand());
+ _pendingClusterState->clearCommand(); // Already forwarded; activatePendingClusterState() only sends the command down if hasCommand() is still true.
+ return;
+ }
+ // Distribution config change or non-deferred cluster state. Immediately activate
+ // the pending state without being told to do so explicitly.
+ activatePendingClusterState();
+}
+
+void
+BucketDBUpdater::activatePendingClusterState()
+{
_pendingClusterState->mergeIntoBucketDatabases();
- if (_pendingClusterState->getCommand().get()) {
+ if (_pendingClusterState->isVersionedTransition()) {
+ LOG(debug, "Activating pending cluster state version %u", _pendingClusterState->clusterStateVersion());
enableCurrentClusterStateBundleInDistributor();
- _distributorComponent.getDistributor().getMessageSender().sendDown(
- _pendingClusterState->getCommand());
+ if (_pendingClusterState->hasCommand()) {
+ _distributorComponent.getDistributor().getMessageSender().sendDown(
+ _pendingClusterState->getCommand());
+ }
addCurrentStateToClusterStateHistory();
} else {
+ LOG(debug, "Activating pending distribution config");
+ // TODO distribution changes cannot currently be deferred as they are not
+ // initiated by the cluster controller!
_distributorComponent.getDistributor().notifyDistributionChangeEnabled();
}
@@ -500,13 +600,14 @@ BucketDBUpdater::processCompletedPendingClusterState()
_outdatedNodesMap.clear();
sendAllQueuedBucketRechecks();
completeTransitionTimer();
+ clearReadOnlyBucketRepoDatabases();
}
void
BucketDBUpdater::enableCurrentClusterStateBundleInDistributor()
{
const lib::ClusterStateBundle& state(
- _pendingClusterState->getCommand()->getClusterStateBundle());
+ _pendingClusterState->getNewClusterStateBundle());
LOG(debug,
"BucketDBUpdater finished processing state %s",
@@ -688,7 +789,7 @@ BucketDBUpdater::NodeRemover::process(BucketDatabase::Entry& e)
return true;
}
if (!distributorOwnsBucket(bucketId)) {
- _removedBuckets.push_back(bucketId);
+ _nonOwnedBuckets.push_back(bucketId);
return true;
}
diff --git a/storage/src/vespa/storage/distributor/bucketdbupdater.h b/storage/src/vespa/storage/distributor/bucketdbupdater.h
index ea67e7ea72a..393e1e2524e 100644
--- a/storage/src/vespa/storage/distributor/bucketdbupdater.h
+++ b/storage/src/vespa/storage/distributor/bucketdbupdater.h
@@ -33,7 +33,8 @@ public:
using OutdatedNodes = dbtransition::OutdatedNodes;
using OutdatedNodesMap = dbtransition::OutdatedNodesMap;
BucketDBUpdater(Distributor& owner,
- DistributorBucketSpaceRepo &bucketSpaceRepo,
+ DistributorBucketSpaceRepo& bucketSpaceRepo,
+ DistributorBucketSpaceRepo& readOnlyBucketSpaceRepo,
DistributorMessageSender& sender,
DistributorComponentRegister& compReg);
~BucketDBUpdater();
@@ -43,6 +44,7 @@ public:
void recheckBucketInfo(uint32_t nodeIdx, const document::Bucket& bucket);
bool onSetSystemState(const std::shared_ptr<api::SetSystemStateCommand>& cmd) override;
+ bool onActivateClusterStateVersion(const std::shared_ptr<api::ActivateClusterStateVersionCommand>& cmd) override;
bool onRequestBucketInfoReply(const std::shared_ptr<api::RequestBucketInfoReply> & repl) override;
bool onMergeBucketReply(const std::shared_ptr<api::MergeBucketReply>& reply) override;
bool onNotifyBucketChange(const std::shared_ptr<api::NotifyBucketChangeCommand>&) override;
@@ -124,6 +126,7 @@ private:
}
};
+ bool shouldDeferStateEnabling() const noexcept;
bool hasPendingClusterState() const;
bool pendingClusterStateAccepted(const std::shared_ptr<api::RequestBucketInfoReply>& repl);
bool processSingleBucketInfoReply(const std::shared_ptr<api::RequestBucketInfoReply>& repl);
@@ -131,6 +134,7 @@ private:
const BucketRequest& req);
bool isPendingClusterStateCompleted() const;
void processCompletedPendingClusterState();
+ void activatePendingClusterState();
void mergeBucketInfoWithDatabase(const std::shared_ptr<api::RequestBucketInfoReply>& repl,
const BucketRequest& req);
void convertBucketInfoToBucketList(const std::shared_ptr<api::RequestBucketInfoReply>& repl,
@@ -141,6 +145,7 @@ private:
BucketListMerger::BucketList& existing) const;
void ensureTransitionTimerStarted();
void completeTransitionTimer();
+ void clearReadOnlyBucketRepoDatabases();
/**
* Adds all buckets contained in the bucket database
* that are either contained
@@ -161,6 +166,9 @@ private:
void removeSuperfluousBuckets(const lib::ClusterStateBundle& newState);
void replyToPreviousPendingClusterStateIfAny();
+ void replyToActivationWithActualVersion(
+ const api::ActivateClusterStateVersionCommand& cmd,
+ uint32_t actualVersion);
void enableCurrentClusterStateBundleInDistributor();
void addCurrentStateToClusterStateHistory();
@@ -191,30 +199,35 @@ private:
public:
NodeRemover(const lib::ClusterState& oldState,
const lib::ClusterState& s,
- [[maybe_unused]] const document::BucketIdFactory& factory,
uint16_t localIndex,
const lib::Distribution& distribution,
const char* upStates)
: _oldState(oldState),
_state(s),
+ _nonOwnedBuckets(),
+ _removedBuckets(),
_localIndex(localIndex),
_distribution(distribution),
_upStates(upStates) {}
- ~NodeRemover();
+ ~NodeRemover() override;
bool process(BucketDatabase::Entry& e) override;
void logRemove(const document::BucketId& bucketId, const char* msg) const;
bool distributorOwnsBucket(const document::BucketId&) const;
- const std::vector<document::BucketId>& getBucketsToRemove() const {
+ const std::vector<document::BucketId>& getBucketsToRemove() const noexcept {
return _removedBuckets;
}
+ const std::vector<document::BucketId>& getNonOwnedBuckets() const noexcept {
+ return _nonOwnedBuckets;
+ }
private:
void setCopiesInEntry(BucketDatabase::Entry& e, const std::vector<BucketCopy>& copies) const;
void removeEmptyBucket(const document::BucketId& bucketId);
const lib::ClusterState _oldState;
const lib::ClusterState _state;
+ std::vector<document::BucketId> _nonOwnedBuckets;
std::vector<document::BucketId> _removedBuckets;
uint16_t _localIndex;
diff --git a/storage/src/vespa/storage/distributor/distributor.cpp b/storage/src/vespa/storage/distributor/distributor.cpp
index 1664dd0d9a1..c92dfbdc14e 100644
--- a/storage/src/vespa/storage/distributor/distributor.cpp
+++ b/storage/src/vespa/storage/distributor/distributor.cpp
@@ -67,15 +67,16 @@ Distributor::Distributor(DistributorComponentRegister& compReg,
_compReg(compReg),
_component(compReg, "distributor"),
_bucketSpaceRepo(std::make_unique<DistributorBucketSpaceRepo>()),
+ _readOnlyBucketSpaceRepo(std::make_unique<DistributorBucketSpaceRepo>()),
_metrics(new DistributorMetricSet(_component.getLoadTypes()->getMetricLoadTypes())),
_operationOwner(*this, _component.getClock()),
_maintenanceOperationOwner(*this, _component.getClock()),
_pendingMessageTracker(compReg),
- _bucketDBUpdater(*this, *_bucketSpaceRepo, *this, compReg),
+ _bucketDBUpdater(*this, *_bucketSpaceRepo, *_readOnlyBucketSpaceRepo, *this, compReg),
_distributorStatusDelegate(compReg, *this, *this),
_bucketDBStatusDelegate(compReg, *this, _bucketDBUpdater),
- _idealStateManager(*this, *_bucketSpaceRepo, compReg, manageActiveBucketCopies),
- _externalOperationHandler(*this, *_bucketSpaceRepo, _idealStateManager, compReg),
+ _idealStateManager(*this, *_bucketSpaceRepo, *_readOnlyBucketSpaceRepo, compReg, manageActiveBucketCopies),
+ _externalOperationHandler(*this, *_bucketSpaceRepo, *_readOnlyBucketSpaceRepo, _idealStateManager, compReg),
_threadPool(threadPool),
_initializingIsUp(true),
_doneInitializeHandler(doneInitHandler),
@@ -575,16 +576,20 @@ void
Distributor::propagateDefaultDistribution(
std::shared_ptr<const lib::Distribution> distribution)
{
- _bucketSpaceRepo->get(document::FixedBucketSpaces::default_space()).setDistribution(distribution);
auto global_distr = GlobalBucketSpaceDistributionConverter::convert_to_global(*distribution);
- _bucketSpaceRepo->get(document::FixedBucketSpaces::global_space()).setDistribution(std::move(global_distr));
+ for (auto* repo : {_bucketSpaceRepo.get(), _readOnlyBucketSpaceRepo.get()}) {
+ repo->get(document::FixedBucketSpaces::default_space()).setDistribution(distribution);
+ repo->get(document::FixedBucketSpaces::global_space()).setDistribution(global_distr);
+ }
}
void
Distributor::propagateClusterStates()
{
- for (auto &iter : *_bucketSpaceRepo) {
- iter.second->setClusterState(_clusterStateBundle.getDerivedClusterState(iter.first));
+ for (auto* repo : {_bucketSpaceRepo.get(), _readOnlyBucketSpaceRepo.get()}) {
+ for (auto& iter : *repo) {
+ iter.second->setClusterState(_clusterStateBundle.getDerivedClusterState(iter.first));
+ }
}
}
diff --git a/storage/src/vespa/storage/distributor/distributor.h b/storage/src/vespa/storage/distributor/distributor.h
index fb8a9fb4299..cd24b91eba2 100644
--- a/storage/src/vespa/storage/distributor/distributor.h
+++ b/storage/src/vespa/storage/distributor/distributor.h
@@ -158,6 +158,13 @@ public:
DistributorBucketSpaceRepo &getBucketSpaceRepo() noexcept { return *_bucketSpaceRepo; }
const DistributorBucketSpaceRepo &getBucketSpaceRepo() const noexcept { return *_bucketSpaceRepo; }
+ // Mutable access to the read-only bucket space repo held by this distributor
+ // (the object pointed to by _readOnlyBucketSpaceRepo).
+ DistributorBucketSpaceRepo& getReadOnlyBucketSpaceRepo() noexcept {
+ return *_readOnlyBucketSpaceRepo;
+ }
+    // Const access to the read-only bucket space repo. Carries the same name as the
+    // non-const accessor so that const and non-const callers use one spelling.
+    const DistributorBucketSpaceRepo& getReadOnlyBucketSpaceRepo() const noexcept {
+        return *_readOnlyBucketSpaceRepo;
+    }
+    // Original, misspelled ("ReadyOnly") name of the const accessor. Retained as a
+    // forwarding alias so existing callers keep compiling; prefer the name above.
+    const DistributorBucketSpaceRepo& getReadyOnlyBucketSpaceRepo() const noexcept {
+        return getReadOnlyBucketSpaceRepo();
+    }
+
private:
friend class Distributor_Test;
friend class BucketDBUpdaterTest;
@@ -244,6 +251,10 @@ private:
DistributorComponentRegister& _compReg;
storage::DistributorComponent _component;
std::unique_ptr<DistributorBucketSpaceRepo> _bucketSpaceRepo;
+ // Read-only bucket space repo with DBs that only contain buckets transiently
+ // during cluster state transitions. Bucket set does not overlap that of _bucketSpaceRepo
+ // and the DBs are empty during non-transition phases.
+ std::unique_ptr<DistributorBucketSpaceRepo> _readOnlyBucketSpaceRepo;
std::shared_ptr<DistributorMetricSet> _metrics;
OperationOwner _operationOwner;
diff --git a/storage/src/vespa/storage/distributor/distributorcomponent.cpp b/storage/src/vespa/storage/distributor/distributorcomponent.cpp
index d3d07350d35..9bd215b9644 100644
--- a/storage/src/vespa/storage/distributor/distributorcomponent.cpp
+++ b/storage/src/vespa/storage/distributor/distributorcomponent.cpp
@@ -15,16 +15,18 @@ namespace storage::distributor {
DistributorComponent::DistributorComponent(
DistributorInterface& distributor,
- DistributorBucketSpaceRepo &bucketSpaceRepo,
+ DistributorBucketSpaceRepo& bucketSpaceRepo,
+ DistributorBucketSpaceRepo& readOnlyBucketSpaceRepo,
DistributorComponentRegister& compReg,
const std::string& name)
: storage::DistributorComponent(compReg, name),
_distributor(distributor),
- _bucketSpaceRepo(bucketSpaceRepo)
+ _bucketSpaceRepo(bucketSpaceRepo),
+ _readOnlyBucketSpaceRepo(readOnlyBucketSpaceRepo)
{
}
-DistributorComponent::~DistributorComponent() {}
+DistributorComponent::~DistributorComponent() = default;
void
DistributorComponent::sendDown(const api::StorageMessage::SP& msg)
diff --git a/storage/src/vespa/storage/distributor/distributorcomponent.h b/storage/src/vespa/storage/distributor/distributorcomponent.h
index 561904cee8d..f2aea89d47c 100644
--- a/storage/src/vespa/storage/distributor/distributorcomponent.h
+++ b/storage/src/vespa/storage/distributor/distributorcomponent.h
@@ -29,11 +29,12 @@ class DistributorComponent : public storage::DistributorComponent
{
public:
DistributorComponent(DistributorInterface& distributor,
- DistributorBucketSpaceRepo &bucketSpaceRepo,
- DistributorComponentRegister& compReg,
- const std::string& name);
+ DistributorBucketSpaceRepo& bucketSpaceRepo,
+ DistributorBucketSpaceRepo& readOnlyBucketSpaceRepo,
+ DistributorComponentRegister& compReg,
+ const std::string& name);
- ~DistributorComponent();
+ ~DistributorComponent() override;
/**
* Returns the ownership status of a bucket as decided with the given
@@ -153,6 +154,9 @@ public:
DistributorBucketSpaceRepo &getBucketSpaceRepo() { return _bucketSpaceRepo; }
const DistributorBucketSpaceRepo &getBucketSpaceRepo() const { return _bucketSpaceRepo; }
+ DistributorBucketSpaceRepo& getReadOnlyBucketSpaceRepo() { return _readOnlyBucketSpaceRepo; }
+ const DistributorBucketSpaceRepo& getReadOnlyBucketSpaceRepo() const { return _readOnlyBucketSpaceRepo; }
+
/**
* Finds a bucket that has the same direct parent as the given bucket
* (i.e. split one bit less), but different bit in the most used bit.
@@ -179,7 +183,8 @@ private:
protected:
- DistributorBucketSpaceRepo &_bucketSpaceRepo;
+ DistributorBucketSpaceRepo& _bucketSpaceRepo;
+ DistributorBucketSpaceRepo& _readOnlyBucketSpaceRepo;
vespalib::Lock _sync;
};
diff --git a/storage/src/vespa/storage/distributor/distributormetricsset.cpp b/storage/src/vespa/storage/distributor/distributormetricsset.cpp
index 927dc06182d..83923a1f00e 100644
--- a/storage/src/vespa/storage/distributor/distributormetricsset.cpp
+++ b/storage/src/vespa/storage/distributor/distributormetricsset.cpp
@@ -17,7 +17,7 @@ DistributorMetricSet::DistributorMetricSet(const metrics::LoadTypeSet& lt)
removelocations(lt, PersistenceOperationMetricSet("removelocations"), this),
gets(lt, PersistenceOperationMetricSet("gets"), this),
stats(lt, PersistenceOperationMetricSet("stats"), this),
- multioperations(lt, PersistenceOperationMetricSet("multioperations"), this),
+ getbucketlists(lt, PersistenceOperationMetricSet("getbucketlists"), this),
visits(lt, VisitorMetricSet(), this),
stateTransitionTime("state_transition_time", {},
"Time it takes to complete a cluster state transition. If a "
diff --git a/storage/src/vespa/storage/distributor/distributormetricsset.h b/storage/src/vespa/storage/distributor/distributormetricsset.h
index 5a64027f500..dfe976a89ab 100644
--- a/storage/src/vespa/storage/distributor/distributormetricsset.h
+++ b/storage/src/vespa/storage/distributor/distributormetricsset.h
@@ -20,7 +20,7 @@ public:
metrics::LoadMetric<PersistenceOperationMetricSet> removelocations;
metrics::LoadMetric<PersistenceOperationMetricSet> gets;
metrics::LoadMetric<PersistenceOperationMetricSet> stats;
- metrics::LoadMetric<PersistenceOperationMetricSet> multioperations;
+ metrics::LoadMetric<PersistenceOperationMetricSet> getbucketlists;
metrics::LoadMetric<VisitorMetricSet> visits;
metrics::DoubleAverageMetric stateTransitionTime;
metrics::DoubleAverageMetric recoveryModeTime;
diff --git a/storage/src/vespa/storage/distributor/externaloperationhandler.cpp b/storage/src/vespa/storage/distributor/externaloperationhandler.cpp
index b22592af327..1b88f02cac6 100644
--- a/storage/src/vespa/storage/distributor/externaloperationhandler.cpp
+++ b/storage/src/vespa/storage/distributor/externaloperationhandler.cpp
@@ -20,14 +20,18 @@
#include "distributor_bucket_space.h"
#include <vespa/log/log.h>
+#include <vespa/document/bucket/fixed_bucket_spaces.h>
+
LOG_SETUP(".distributor.manager");
namespace storage::distributor {
-ExternalOperationHandler::ExternalOperationHandler(Distributor& owner, DistributorBucketSpaceRepo& bucketSpaceRepo,
+ExternalOperationHandler::ExternalOperationHandler(Distributor& owner,
+ DistributorBucketSpaceRepo& bucketSpaceRepo,
+ DistributorBucketSpaceRepo& readOnlyBucketSpaceRepo,
const MaintenanceOperationGenerator& gen,
DistributorComponentRegister& compReg)
- : DistributorComponent(owner, bucketSpaceRepo, compReg, "External operation handler"),
+ : DistributorComponent(owner, bucketSpaceRepo, readOnlyBucketSpaceRepo, compReg, "External operation handler"),
_operationGenerator(gen),
_rejectFeedBeforeTimeReached() // At epoch
{ }
@@ -68,19 +72,69 @@ ExternalOperationHandler::checkSafeTimeReached(api::StorageCommand& cmd)
return true;
}
+// Answers `cmd` without executing it: a reply is created from the command,
+// given `result` as its result code, and passed on via sendUp().
+void ExternalOperationHandler::bounce_with_result(api::StorageCommand& cmd, const api::ReturnCode& result) {
+ api::StorageReply::UP reply(cmd.makeReply());
+ reply->setResult(result);
+ // The reply is released from `reply` and handed to the shared_ptr given to sendUp().
+ sendUp(std::shared_ptr<api::StorageMessage>(reply.release()));
+}
+
+// Bounces `cmd` on behalf of callers that found the bucket not owned in the current state.
+// Replies WRONG_DISTRIBUTION carrying the default bucket space's cluster state string,
+// or BUSY when that cluster state still reports version 0.
+void ExternalOperationHandler::bounce_with_wrong_distribution(api::StorageCommand& cmd) {
+ // Distributor ownership is equal across bucket spaces, so always send back default space state.
+ // This also helps client avoid getting confused by possibly observing different actual
+ // (derived) state strings for global/non-global document types for the same state version.
+ // Similarly, if we've yet to activate any version at all we send back BUSY instead
+ // of a suspiciously empty WrongDistributionReply.
+ // TODO consider NOT_READY instead of BUSY once we're sure this won't cause any other issues.
+ const auto& cluster_state = _bucketSpaceRepo.get(document::FixedBucketSpaces::default_space()).getClusterState();
+ if (cluster_state.getVersion() != 0) {
+ auto cluster_state_str = cluster_state.toString();
+ LOG(debug, "Got message with wrong distribution, sending back state '%s'", cluster_state_str.c_str());
+ bounce_with_result(cmd, api::ReturnCode(api::ReturnCode::WRONG_DISTRIBUTION, cluster_state_str));
+ } else { // Only valid for empty startup state
+ LOG(debug, "Got message with wrong distribution, but no cluster state activated yet. Sending back BUSY");
+ bounce_with_result(cmd, api::ReturnCode(api::ReturnCode::BUSY, "No cluster state activated yet"));
+ }
+}
+
+// Bounces `cmd` with a BUSY result whose message names the version of `current_state`
+// and the version of `pending_state`. Used when a command cannot be served because the
+// bucket is not owned in the pending cluster state.
+void ExternalOperationHandler::bounce_with_busy_during_state_transition(
+        api::StorageCommand& cmd,
+        const lib::ClusterState& current_state,
+        const lib::ClusterState& pending_state)
+{
+    auto status_str = vespalib::make_string("Currently pending cluster state transition"
+                                            " from version %u to %u",
+                                            current_state.getVersion(), pending_state.getVersion());
+    // Reply creation and dispatch go through the same helper as the other bounce paths
+    // instead of repeating the makeReply/setResult/sendUp sequence here.
+    bounce_with_result(cmd, api::ReturnCode(api::ReturnCode::BUSY, status_str));
+}
+
bool
ExternalOperationHandler::checkTimestampMutationPreconditions(api::StorageCommand& cmd,
const document::BucketId &bucketId,
PersistenceOperationMetricSet& persistenceMetrics)
{
document::Bucket bucket(cmd.getBucket().getBucketSpace(), bucketId);
- if (!checkDistribution(cmd, bucket)) {
+ if (!ownsBucketInCurrentState(bucket)) {
LOG(debug, "Distributor manager received %s, bucket %s with wrong distribution",
cmd.toString().c_str(), bucket.toString().c_str());
-
+ bounce_with_wrong_distribution(cmd);
persistenceMetrics.failures.wrongdistributor.inc();
return false;
}
+
+ auto pending = getDistributor().checkOwnershipInPendingState(bucket);
+ if (!pending.isOwned()) {
+ // We return BUSY here instead of WrongDistributionReply to avoid clients potentially
+ // ping-ponging between cluster state versions during a state transition.
+ auto& current_state = _bucketSpaceRepo.get(document::FixedBucketSpaces::default_space()).getClusterState();
+ auto& pending_state = pending.getNonOwnedState();
+ bounce_with_busy_during_state_transition(cmd, current_state, pending_state);
+ return false;
+ }
+
if (!checkSafeTimeReached(cmd)) {
persistenceMetrics.failures.safe_time_not_reached.inc();
return false;
@@ -111,6 +165,35 @@ bool ExternalOperationHandler::allowMutation(const SequencingHandle& handle) con
return handle.valid();
}
+// Decides how a read-only command for `bucket` is served and either bounces it or
+// invokes `func` with the bucket space repo to use:
+//  - bucket not owned in the current state: bounce with wrong distribution and
+//    count a wrongdistributor failure on `metrics`;
+//  - bucket owned in the pending state as well: func(_bucketSpaceRepo);
+//  - not owned in the pending state, stale reads allowed by config:
+//    func(_readOnlyBucketSpaceRepo);
+//  - otherwise: bounce with BUSY for the ongoing state transition.
+template <typename Func>
+void ExternalOperationHandler::bounce_or_invoke_read_only_op(
+        api::StorageCommand& cmd,
+        const document::Bucket& bucket,
+        PersistenceOperationMetricSet& metrics,
+        Func func)
+{
+    const bool owned_now = ownsBucketInCurrentState(bucket);
+    if (!owned_now) {
+        LOG(debug, "Distributor manager received %s, bucket %s with wrong distribution",
+            cmd.toString().c_str(), bucket.toString().c_str());
+        bounce_with_wrong_distribution(cmd);
+        metrics.failures.wrongdistributor.inc();
+        return;
+    }
+
+    auto pending = getDistributor().checkOwnershipInPendingState(bucket);
+    if (pending.isOwned()) {
+        func(_bucketSpaceRepo);
+        return;
+    }
+    // Config is only consulted once ownership is known to be lost in the pending state.
+    if (getDistributor().getConfig().allowStaleReadsDuringClusterStateTransitions()) {
+        func(_readOnlyBucketSpaceRepo);
+        return;
+    }
+    auto& current_state = _bucketSpaceRepo.get(document::FixedBucketSpaces::default_space()).getClusterState();
+    auto& pending_state = pending.getNonOwnedState();
+    bounce_with_busy_during_state_transition(cmd, current_state, pending_state);
+}
+
IMPL_MSG_COMMAND_H(ExternalOperationHandler, Put)
{
auto& metrics = getMetrics().puts[cmd->getLoadType()];
@@ -186,10 +269,8 @@ IMPL_MSG_COMMAND_H(ExternalOperationHandler, RemoveLocation)
RemoveLocationOperation::getBucketId(*this, *cmd, bid);
document::Bucket bucket(cmd->getBucket().getBucketSpace(), bid);
- if (!checkDistribution(*cmd, bucket)) {
- LOG(debug, "Distributor manager received %s with wrong distribution", cmd->toString().c_str());
-
- getMetrics().removelocations[cmd->getLoadType()].failures.wrongdistributor.inc();
+ auto& metrics = getMetrics().removelocations[cmd->getLoadType()];
+ if (!checkTimestampMutationPreconditions(*cmd, bucket.getBucketId(), metrics)) {
return true;
}
@@ -201,43 +282,38 @@ IMPL_MSG_COMMAND_H(ExternalOperationHandler, RemoveLocation)
IMPL_MSG_COMMAND_H(ExternalOperationHandler, Get)
{
document::Bucket bucket(cmd->getBucket().getBucketSpace(), getBucketId(cmd->getDocumentId()));
- if (!checkDistribution(*cmd, bucket)) {
- LOG(debug, "Distributor manager received get for %s, bucket %s with wrong distribution",
- cmd->getDocumentId().toString().c_str(), bucket.toString().c_str());
-
- getMetrics().gets[cmd->getLoadType()].failures.wrongdistributor.inc();
- return true;
- }
-
- _op = std::make_shared<GetOperation>(*this, _bucketSpaceRepo.get(cmd->getBucket().getBucketSpace()),
- cmd, getMetrics().gets[cmd->getLoadType()]);
+ auto& metrics = getMetrics().gets[cmd->getLoadType()];
+ bounce_or_invoke_read_only_op(*cmd, bucket, metrics, [&](auto& bucket_space_repo) {
+ _op = std::make_shared<GetOperation>(*this, bucket_space_repo.get(cmd->getBucket().getBucketSpace()),
+ cmd, metrics);
+ });
return true;
}
IMPL_MSG_COMMAND_H(ExternalOperationHandler, StatBucket)
{
- if (!checkDistribution(*cmd, cmd->getBucket())) {
- return true;
- }
- auto &distributorBucketSpace(_bucketSpaceRepo.get(cmd->getBucket().getBucketSpace()));
- _op = std::make_shared<StatBucketOperation>(*this, distributorBucketSpace, cmd);
+ auto& metrics = getMetrics().stats[cmd->getLoadType()];
+ bounce_or_invoke_read_only_op(*cmd, cmd->getBucket(), metrics, [&](auto& bucket_space_repo) {
+ auto& bucket_space = bucket_space_repo.get(cmd->getBucket().getBucketSpace());
+ _op = std::make_shared<StatBucketOperation>(*this, bucket_space, cmd);
+ });
return true;
}
IMPL_MSG_COMMAND_H(ExternalOperationHandler, GetBucketList)
{
- if (!checkDistribution(*cmd, cmd->getBucket())) {
- return true;
- }
- auto bucketSpace(cmd->getBucket().getBucketSpace());
- auto &distributorBucketSpace(_bucketSpaceRepo.get(bucketSpace));
- auto &bucketDatabase(distributorBucketSpace.getBucketDatabase());
- _op = std::make_shared<StatBucketListOperation>(bucketDatabase, _operationGenerator, getIndex(), cmd);
+ auto& metrics = getMetrics().getbucketlists[cmd->getLoadType()];
+ bounce_or_invoke_read_only_op(*cmd, cmd->getBucket(), metrics, [&](auto& bucket_space_repo) {
+ auto& bucket_space = bucket_space_repo.get(cmd->getBucket().getBucketSpace());
+ auto& bucket_database = bucket_space.getBucketDatabase();
+ _op = std::make_shared<StatBucketListOperation>(bucket_database, _operationGenerator, getIndex(), cmd);
+ });
return true;
}
IMPL_MSG_COMMAND_H(ExternalOperationHandler, CreateVisitor)
{
+ // TODO same handling as Gets (VisitorOperation needs to change)
const DistributorConfiguration& config(getDistributor().getConfig());
VisitorOperation::Config visitorConfig(config.getMinBucketsPerVisitor(), config.getMaxVisitorsPerNodePerClientVisitor());
auto &distributorBucketSpace(_bucketSpaceRepo.get(cmd->getBucket().getBucketSpace()));
diff --git a/storage/src/vespa/storage/distributor/externaloperationhandler.h b/storage/src/vespa/storage/distributor/externaloperationhandler.h
index c198fe30159..655feb5d00c 100644
--- a/storage/src/vespa/storage/distributor/externaloperationhandler.h
+++ b/storage/src/vespa/storage/distributor/externaloperationhandler.h
@@ -37,10 +37,11 @@ public:
ExternalOperationHandler(Distributor& owner,
DistributorBucketSpaceRepo& bucketSpaceRepo,
+ DistributorBucketSpaceRepo& readOnlyBucketSpaceRepo,
const MaintenanceOperationGenerator&,
DistributorComponentRegister& compReg);
- ~ExternalOperationHandler();
+ ~ExternalOperationHandler() override;
bool handleMessage(const std::shared_ptr<api::StorageMessage>& msg,
Operation::SP& operation);
@@ -55,6 +56,18 @@ private:
Operation::SP _op;
TimePoint _rejectFeedBeforeTimeReached;
+ template <typename Func>
+ void bounce_or_invoke_read_only_op(api::StorageCommand& cmd,
+ const document::Bucket& bucket,
+ PersistenceOperationMetricSet& metrics,
+ Func f);
+
+ void bounce_with_wrong_distribution(api::StorageCommand& cmd);
+ void bounce_with_busy_during_state_transition(api::StorageCommand& cmd,
+ const lib::ClusterState& current_state,
+ const lib::ClusterState& pending_state);
+ void bounce_with_result(api::StorageCommand& cmd, const api::ReturnCode& result);
+
bool checkSafeTimeReached(api::StorageCommand& cmd);
api::ReturnCode makeSafeTimeRejectionResult(TimePoint unsafeTime);
bool checkTimestampMutationPreconditions(
diff --git a/storage/src/vespa/storage/distributor/idealstatemanager.cpp b/storage/src/vespa/storage/distributor/idealstatemanager.cpp
index 77b924ad351..5a1ff31e2e7 100644
--- a/storage/src/vespa/storage/distributor/idealstatemanager.cpp
+++ b/storage/src/vespa/storage/distributor/idealstatemanager.cpp
@@ -26,11 +26,12 @@ namespace distributor {
IdealStateManager::IdealStateManager(
Distributor& owner,
DistributorBucketSpaceRepo& bucketSpaceRepo,
+ DistributorBucketSpaceRepo& readOnlyBucketSpaceRepo,
DistributorComponentRegister& compReg,
bool manageActiveBucketCopies)
: HtmlStatusReporter("idealstateman", "Ideal state manager"),
_metrics(new IdealStateMetricSet),
- _distributorComponent(owner, bucketSpaceRepo, compReg, "Ideal state manager"),
+ _distributorComponent(owner, bucketSpaceRepo, readOnlyBucketSpaceRepo, compReg, "Ideal state manager"),
_bucketSpaceRepo(bucketSpaceRepo)
{
_distributorComponent.registerStatusPage(*this);
diff --git a/storage/src/vespa/storage/distributor/idealstatemanager.h b/storage/src/vespa/storage/distributor/idealstatemanager.h
index c8be2a40ad7..3bb6d0dd757 100644
--- a/storage/src/vespa/storage/distributor/idealstatemanager.h
+++ b/storage/src/vespa/storage/distributor/idealstatemanager.h
@@ -37,6 +37,7 @@ public:
IdealStateManager(Distributor& owner,
DistributorBucketSpaceRepo& bucketSpaceRepo,
+ DistributorBucketSpaceRepo& readOnlyBucketSpaceRepo,
DistributorComponentRegister& compReg,
bool manageActiveBucketCopies);
diff --git a/storage/src/vespa/storage/distributor/operations/external/getoperation.h b/storage/src/vespa/storage/distributor/operations/external/getoperation.h
index 198c588dfd1..3936f13077e 100644
--- a/storage/src/vespa/storage/distributor/operations/external/getoperation.h
+++ b/storage/src/vespa/storage/distributor/operations/external/getoperation.h
@@ -34,6 +34,9 @@ public:
bool hasConsistentCopies() const;
+ // Exposed for unit testing. TODO feels a bit dirty :I
+ const DistributorBucketSpace& bucketSpace() const noexcept { return _bucketSpace; }
+
private:
class GroupId {
public:
diff --git a/storage/src/vespa/storage/distributor/pendingclusterstate.cpp b/storage/src/vespa/storage/distributor/pendingclusterstate.cpp
index 5f74a82c28a..6cba7084037 100644
--- a/storage/src/vespa/storage/distributor/pendingclusterstate.cpp
+++ b/storage/src/vespa/storage/distributor/pendingclusterstate.cpp
@@ -27,7 +27,8 @@ PendingClusterState::PendingClusterState(
const framework::Clock& clock,
const ClusterInformation::CSP& clusterInfo,
DistributorMessageSender& sender,
- DistributorBucketSpaceRepo &bucketSpaceRepo,
+ DistributorBucketSpaceRepo& bucketSpaceRepo,
+ DistributorBucketSpaceRepo& readOnlyBucketSpaceRepo,
const std::shared_ptr<api::SetSystemStateCommand>& newStateCmd,
const OutdatedNodesMap &outdatedNodesMap,
api::Timestamp creationTimestamp)
@@ -40,6 +41,9 @@ PendingClusterState::PendingClusterState(
_creationTimestamp(creationTimestamp),
_sender(sender),
_bucketSpaceRepo(bucketSpaceRepo),
+ _readOnlyBucketSpaceRepo(readOnlyBucketSpaceRepo),
+ _clusterStateVersion(_cmd->getClusterStateBundle().getVersion()),
+ _isVersionedTransition(true),
_bucketOwnershipTransfer(false),
_pendingTransitions()
{
@@ -51,7 +55,8 @@ PendingClusterState::PendingClusterState(
const framework::Clock& clock,
const ClusterInformation::CSP& clusterInfo,
DistributorMessageSender& sender,
- DistributorBucketSpaceRepo &bucketSpaceRepo,
+ DistributorBucketSpaceRepo& bucketSpaceRepo,
+ DistributorBucketSpaceRepo& readOnlyBucketSpaceRepo,
api::Timestamp creationTimestamp)
: _requestedNodes(clusterInfo->getStorageNodeCount()),
_prevClusterStateBundle(clusterInfo->getClusterStateBundle()),
@@ -61,6 +66,9 @@ PendingClusterState::PendingClusterState(
_creationTimestamp(creationTimestamp),
_sender(sender),
_bucketSpaceRepo(bucketSpaceRepo),
+ _readOnlyBucketSpaceRepo(readOnlyBucketSpaceRepo),
+ _clusterStateVersion(0),
+ _isVersionedTransition(false),
_bucketOwnershipTransfer(true),
_pendingTransitions()
{
diff --git a/storage/src/vespa/storage/distributor/pendingclusterstate.h b/storage/src/vespa/storage/distributor/pendingclusterstate.h
index b96ba8cbbd7..cedc0573381 100644
--- a/storage/src/vespa/storage/distributor/pendingclusterstate.h
+++ b/storage/src/vespa/storage/distributor/pendingclusterstate.h
@@ -45,15 +45,16 @@ public:
const framework::Clock& clock,
const ClusterInformation::CSP& clusterInfo,
DistributorMessageSender& sender,
- DistributorBucketSpaceRepo &bucketSpaceRepo,
+ DistributorBucketSpaceRepo& bucketSpaceRepo,
+ DistributorBucketSpaceRepo& readOnlyBucketSpaceRepo,
const std::shared_ptr<api::SetSystemStateCommand>& newStateCmd,
const OutdatedNodesMap &outdatedNodesMap,
api::Timestamp creationTimestamp)
{
- return std::unique_ptr<PendingClusterState>(
- new PendingClusterState(clock, clusterInfo, sender, bucketSpaceRepo, newStateCmd,
- outdatedNodesMap,
- creationTimestamp));
+ // Naked new due to private constructor
+ return std::unique_ptr<PendingClusterState>(new PendingClusterState(
+ clock, clusterInfo, sender, bucketSpaceRepo, readOnlyBucketSpaceRepo,
+ newStateCmd, outdatedNodesMap, creationTimestamp));
}
/**
@@ -64,16 +65,19 @@ public:
const framework::Clock& clock,
const ClusterInformation::CSP& clusterInfo,
DistributorMessageSender& sender,
- DistributorBucketSpaceRepo &bucketSpaceRepo,
+ DistributorBucketSpaceRepo& bucketSpaceRepo,
+ DistributorBucketSpaceRepo& readOnlyBucketSpaceRepo,
api::Timestamp creationTimestamp)
{
- return std::unique_ptr<PendingClusterState>(
- new PendingClusterState(clock, clusterInfo, sender, bucketSpaceRepo, creationTimestamp));
+ // Naked new due to private constructor
+ return std::unique_ptr<PendingClusterState>(new PendingClusterState(
+ clock, clusterInfo, sender, bucketSpaceRepo,
+ readOnlyBucketSpaceRepo, creationTimestamp));
}
PendingClusterState(const PendingClusterState &) = delete;
PendingClusterState & operator = (const PendingClusterState &) = delete;
- ~PendingClusterState();
+ ~PendingClusterState() override;
/**
* Adds the info from the reply to our list of information.
@@ -104,10 +108,31 @@ public:
return _bucketOwnershipTransfer;
}
+    // True while the command pointer held by this pending state is non-null,
+    // i.e. it has not been reset via clearCommand().
+    bool hasCommand() const noexcept {
+        return static_cast<bool>(_cmd);
+    }
+
std::shared_ptr<api::SetSystemStateCommand> getCommand() {
return _cmd;
}
+ // Returns the flag fixed at construction: set to true by the constructor taking a
+ // SetSystemStateCommand and to false by the constructor without one.
+ bool isVersionedTransition() const noexcept {
+ return _isVersionedTransition;
+ }
+
+ // Version recorded at construction: the version of the command's cluster state bundle,
+ // or 0 when this pending state was constructed without a command.
+ uint32_t clusterStateVersion() const noexcept {
+ return _clusterStateVersion;
+ }
+
+    // True only for a versioned transition whose new cluster state bundle
+    // reports deferredActivation().
+    bool isDeferred() const noexcept {
+        if (!isVersionedTransition()) {
+            return false;
+        }
+        return _newClusterStateBundle.deferredActivation();
+    }
+
+ // Drops this object's reference to the command; hasCommand() returns false afterwards.
+ void clearCommand() {
+ _cmd.reset();
+ }
+
const lib::ClusterStateBundle& getNewClusterStateBundle() const {
return _newClusterStateBundle;
}
@@ -141,7 +166,8 @@ private:
const framework::Clock&,
const ClusterInformation::CSP& clusterInfo,
DistributorMessageSender& sender,
- DistributorBucketSpaceRepo &bucketSpaceRepo,
+ DistributorBucketSpaceRepo& bucketSpaceRepo,
+ DistributorBucketSpaceRepo& readOnlyBucketSpaceRepo,
const std::shared_ptr<api::SetSystemStateCommand>& newStateCmd,
const OutdatedNodesMap &outdatedNodesMap,
api::Timestamp creationTimestamp);
@@ -154,7 +180,8 @@ private:
const framework::Clock&,
const ClusterInformation::CSP& clusterInfo,
DistributorMessageSender& sender,
- DistributorBucketSpaceRepo &bucketSpaceRepo,
+ DistributorBucketSpaceRepo& bucketSpaceRepo,
+ DistributorBucketSpaceRepo& readOnlyBucketSpaceRepo,
api::Timestamp creationTimestamp);
struct BucketSpaceAndNode {
@@ -204,8 +231,10 @@ private:
api::Timestamp _creationTimestamp;
DistributorMessageSender& _sender;
- DistributorBucketSpaceRepo &_bucketSpaceRepo;
-
+ DistributorBucketSpaceRepo& _bucketSpaceRepo;
+ DistributorBucketSpaceRepo& _readOnlyBucketSpaceRepo;
+ uint32_t _clusterStateVersion;
+ bool _isVersionedTransition;
bool _bucketOwnershipTransfer;
std::unordered_map<document::BucketSpace, std::unique_ptr<PendingBucketSpaceDbTransition>, document::BucketSpace::hash> _pendingTransitions;
};
diff --git a/storage/src/vespa/storage/storageserver/bouncer.cpp b/storage/src/vespa/storage/storageserver/bouncer.cpp
index 0541c7322f1..fdbfd553315 100644
--- a/storage/src/vespa/storage/storageserver/bouncer.cpp
+++ b/storage/src/vespa/storage/storageserver/bouncer.cpp
@@ -235,6 +235,7 @@ Bouncer::onDown(const std::shared_ptr<api::StorageMessage>& msg)
case api::MessageType::SETNODESTATE_ID:
case api::MessageType::GETNODESTATE_ID:
case api::MessageType::SETSYSTEMSTATE_ID:
+ case api::MessageType::ACTIVATE_CLUSTER_STATE_VERSION_ID:
case api::MessageType::NOTIFYBUCKETCHANGE_ID:
// state commands are always ok
return false;
diff --git a/storage/src/vespa/storage/storageserver/communicationmanager.cpp b/storage/src/vespa/storage/storageserver/communicationmanager.cpp
index 7fb85ef0ecc..978d434847e 100644
--- a/storage/src/vespa/storage/storageserver/communicationmanager.cpp
+++ b/storage/src/vespa/storage/storageserver/communicationmanager.cpp
@@ -622,20 +622,25 @@ CommunicationManager::sendDirectRPCReply(
{
std::string requestName(request.getMethodName());
if (requestName == "getnodestate3") {
- api::GetNodeStateReply& gns(static_cast<api::GetNodeStateReply&>(*reply));
+ auto& gns(dynamic_cast<api::GetNodeStateReply&>(*reply));
std::ostringstream ns;
serializeNodeState(gns, ns, true, true, false);
request.addReturnString(ns.str().c_str());
request.addReturnString(gns.getNodeInfo().c_str());
LOGBP(debug, "Sending getnodestate3 reply with host info '%s'.", gns.getNodeInfo().c_str());
} else if (requestName == "getnodestate2") {
- api::GetNodeStateReply& gns(static_cast<api::GetNodeStateReply&>(*reply));
+ auto& gns(dynamic_cast<api::GetNodeStateReply&>(*reply));
std::ostringstream ns;
serializeNodeState(gns, ns, true, true, false);
request.addReturnString(ns.str().c_str());
LOGBP(debug, "Sending getnodestate2 reply with no host info.");
} else if (requestName == "setsystemstate2" || requestName == "setdistributionstates") {
// No data to return
+ } else if (requestName == "activate_cluster_state_version") {
+ auto& activate_reply(dynamic_cast<api::ActivateClusterStateVersionReply&>(*reply));
+ request.addReturnInt(activate_reply.actualVersion());
+ LOGBP(debug, "sending activate_cluster_state_version reply for version %u with actual version %u ",
+ activate_reply.activateVersion(), activate_reply.actualVersion());
} else {
request.addReturnInt(reply->getResult().getResult());
request.addReturnString(reply->getResult().getMessage().c_str());
diff --git a/storage/src/vespa/storage/storageserver/fnetlistener.cpp b/storage/src/vespa/storage/storageserver/fnetlistener.cpp
index e31bded772c..ec488b25714 100644
--- a/storage/src/vespa/storage/storageserver/fnetlistener.cpp
+++ b/storage/src/vespa/storage/storageserver/fnetlistener.cpp
@@ -92,6 +92,11 @@ FNetListener::initRPC()
rb.ParamDesc("uncompressedSize", "Uncompressed size for payload");
rb.ParamDesc("payload", "Binary Slime format payload");
//-------------------------------------------------------------------------
+ rb.DefineMethod("activate_cluster_state_version", "i", "i", FRT_METHOD(FNetListener::RPC_activateClusterStateVersion), this);
+ rb.MethodDesc("Explicitly activates an already prepared cluster state version");
+ rb.ParamDesc("activate_version", "Expected cluster state version to activate");
+ rb.ReturnDesc("actual_version", "Cluster state version that was prepared on the node prior to receiving RPC");
+ //-------------------------------------------------------------------------
rb.DefineMethod("getcurrenttime", "", "lis", FRT_METHOD(FNetListener::RPC_getCurrentTime), this);
rb.MethodDesc("Get current time on this node");
rb.ReturnDesc("seconds", "Current time in seconds since epoch");
@@ -203,6 +208,7 @@ void FNetListener::RPC_setDistributionStates(FRT_RPCRequest* req) {
req->SetError(RPCRequestWrapper::ERR_BAD_REQUEST, e.what());
return;
}
+ LOG(debug, "Got state bundle %s", state_bundle->toString().c_str());
// TODO add constructor taking in shared_ptr directly instead?
auto cmd = std::make_shared<api::SetSystemStateCommand>(*state_bundle);
@@ -211,4 +217,20 @@ void FNetListener::RPC_setDistributionStates(FRT_RPCRequest* req) {
detach_and_forward_to_enqueuer(std::move(cmd), req);
}
+/**
+ * Handler for the "activate_cluster_state_version" RPC method.
+ *
+ * If the listener has been closed, the request is failed with
+ * ERR_NODE_SHUTTING_DOWN. Otherwise the first request parameter is read as a
+ * 32-bit version number, wrapped in an ActivateClusterStateVersionCommand with
+ * VERYHIGH priority and handed, together with the request, to
+ * detach_and_forward_to_enqueuer().
+ */
+void FNetListener::RPC_activateClusterStateVersion(FRT_RPCRequest* req) {
+    if (_closed) {
+        LOG(debug, "Not handling RPC call activate_cluster_state_version() as we have closed");
+        req->SetError(RPCRequestWrapper::ERR_NODE_SHUTTING_DOWN, "Node shutting down");
+        return;
+    }
+
+    // Parameter 0 carries the version the caller wants activated.
+    const uint32_t version_to_activate = req->GetParams()->GetValue(0)._intval32;
+    auto activate_cmd = std::make_shared<api::ActivateClusterStateVersionCommand>(version_to_activate);
+    activate_cmd->setPriority(api::StorageMessage::VERYHIGH);
+
+    LOG(debug, "Got state activation request for version %u", version_to_activate);
+
+    detach_and_forward_to_enqueuer(std::move(activate_cmd), req);
+}
+
}
diff --git a/storage/src/vespa/storage/storageserver/fnetlistener.h b/storage/src/vespa/storage/storageserver/fnetlistener.h
index abcba18e0be..2097be15491 100644
--- a/storage/src/vespa/storage/storageserver/fnetlistener.h
+++ b/storage/src/vespa/storage/storageserver/fnetlistener.h
@@ -26,6 +26,7 @@ public:
void RPC_setSystemState2(FRT_RPCRequest *req);
void RPC_getCurrentTime(FRT_RPCRequest *req);
void RPC_setDistributionStates(FRT_RPCRequest* req);
+ void RPC_activateClusterStateVersion(FRT_RPCRequest* req);
void registerHandle(vespalib::stringref handle);
void close();
diff --git a/storage/src/vespa/storage/storageserver/slime_cluster_state_bundle_codec.cpp b/storage/src/vespa/storage/storageserver/slime_cluster_state_bundle_codec.cpp
index 5b7e0ab4621..1f854bc724e 100644
--- a/storage/src/vespa/storage/storageserver/slime_cluster_state_bundle_codec.cpp
+++ b/storage/src/vespa/storage/storageserver/slime_cluster_state_bundle_codec.cpp
@@ -53,6 +53,9 @@ EncodedClusterStateBundle SlimeClusterStateBundleCodec::encode(
{
vespalib::Slime slime;
Cursor& root = slime.setObject();
+ if (bundle.deferredActivation()) {
+ root.setBool("deferred-activation", bundle.deferredActivation());
+ }
Cursor& states = root.setObject("states");
states.setString("baseline", serialize_state(*bundle.getBaselineClusterState()));
Cursor& spaces = states.setObject("spaces");
@@ -79,6 +82,7 @@ namespace {
static const Memory StatesField("states");
static const Memory BaselineField("baseline");
static const Memory SpacesField("spaces");
+static const Memory DeferredActivationField("deferred-activation");
struct StateInserter : vespalib::slime::ObjectTraverser {
lib::ClusterStateBundle::BucketSpaceStateMapping& _space_states;
@@ -118,8 +122,11 @@ std::shared_ptr<const lib::ClusterStateBundle> SlimeClusterStateBundleCodec::dec
lib::ClusterStateBundle::BucketSpaceStateMapping space_states;
StateInserter inserter(space_states);
spaces.traverse(inserter);
+
+ const bool deferred_activation = root[DeferredActivationField].asBool(); // Defaults to false if not set.
+
// TODO add shared_ptr constructor for baseline?
- return std::make_shared<lib::ClusterStateBundle>(baseline, std::move(space_states));
+ return std::make_shared<lib::ClusterStateBundle>(baseline, std::move(space_states), deferred_activation);
}
}
diff --git a/storage/src/vespa/storage/storageserver/statemanager.cpp b/storage/src/vespa/storage/storageserver/statemanager.cpp
index 95cb5dec696..af01a880fea 100644
--- a/storage/src/vespa/storage/storageserver/statemanager.cpp
+++ b/storage/src/vespa/storage/storageserver/statemanager.cpp
@@ -514,6 +514,19 @@ StateManager::onSetSystemState(
return true;
}
+/**
+ * Answers an activation command by sending a reply upwards that carries the
+ * version of the state currently held in _systemState, or 0 when no state is
+ * set. Always returns true.
+ */
+bool
+StateManager::onActivateClusterStateVersion(
+        const std::shared_ptr<api::ActivateClusterStateVersionCommand>& cmd)
+{
+    auto reply = std::make_shared<api::ActivateClusterStateVersionReply>(*cmd);
+    {
+        // _systemState is read under _stateLock; the guard goes out of scope
+        // before sendUp() is called.
+        vespalib::LockGuard lock(_stateLock);
+        uint32_t current_version = 0;
+        if (_systemState) {
+            current_version = _systemState->getVersion();
+        }
+        reply->setActualVersion(current_version);
+    }
+    sendUp(reply);
+    return true;
+}
+
void
StateManager::run(framework::ThreadHandle& thread)
{
diff --git a/storage/src/vespa/storage/storageserver/statemanager.h b/storage/src/vespa/storage/storageserver/statemanager.h
index 0bacd41f6d9..57f0e02a136 100644
--- a/storage/src/vespa/storage/storageserver/statemanager.h
+++ b/storage/src/vespa/storage/storageserver/statemanager.h
@@ -137,6 +137,7 @@ private:
bool onGetNodeState(const std::shared_ptr<api::GetNodeStateCommand>&) override;
bool onSetSystemState(const std::shared_ptr<api::SetSystemStateCommand>&) override;
+ bool onActivateClusterStateVersion(const std::shared_ptr<api::ActivateClusterStateVersionCommand>&) override;
/**
* _stateLock MUST NOT be held while calling.
diff --git a/storageapi/src/vespa/storageapi/message/state.cpp b/storageapi/src/vespa/storageapi/message/state.cpp
index efa9a45764f..071dba16b91 100644
--- a/storageapi/src/vespa/storageapi/message/state.cpp
+++ b/storageapi/src/vespa/storageapi/message/state.cpp
@@ -12,6 +12,8 @@ IMPLEMENT_COMMAND(GetNodeStateCommand, GetNodeStateReply)
IMPLEMENT_REPLY(GetNodeStateReply)
IMPLEMENT_COMMAND(SetSystemStateCommand, SetSystemStateReply)
IMPLEMENT_REPLY(SetSystemStateReply)
+IMPLEMENT_COMMAND(ActivateClusterStateVersionCommand, ActivateClusterStateVersionReply)
+IMPLEMENT_REPLY(ActivateClusterStateVersionReply)
GetNodeStateCommand::GetNodeStateCommand(lib::NodeState::UP expectedState)
: StorageCommand(MessageType::GETNODESTATE),
@@ -102,5 +104,39 @@ SetSystemStateReply::print(std::ostream& out, bool verbose,
}
}
+// Creates a command of message type ACTIVATE_CLUSTER_STATE_VERSION that
+// carries the given version number.
+ActivateClusterStateVersionCommand::ActivateClusterStateVersionCommand(uint32_t version)
+    : StorageCommand(MessageType::ACTIVATE_CLUSTER_STATE_VERSION),
+      _version(version)
+{}
+
+// Writes "ActivateClusterStateVersionCommand(<version>)" to out. In verbose
+// mode this is followed by " : " and the output of StorageCommand::print().
+void ActivateClusterStateVersionCommand::print(std::ostream& out, bool verbose,
+                                               const std::string& indent) const
+{
+    out << "ActivateClusterStateVersionCommand(" << _version << ")";
+    if (!verbose) {
+        return;
+    }
+    out << " : ";
+    StorageCommand::print(out, verbose, indent);
+}
+
+// Creates the reply for cmd. The requested version is copied from the command,
+// while the actual version starts out as 0 until setActualVersion() is called.
+ActivateClusterStateVersionReply::ActivateClusterStateVersionReply(const ActivateClusterStateVersionCommand& cmd)
+    : StorageReply(cmd),
+      _activateVersion(cmd.version()),
+      _actualVersion(0) // Placeholder only; must be set explicitly
+{}
+
+// Writes "ActivateClusterStateVersionReply(activate <requested>, actual <actual>)"
+// to out. In verbose mode this is followed by " : " and the output of
+// StorageReply::print().
+void ActivateClusterStateVersionReply::print(std::ostream& out, bool verbose,
+                                             const std::string& indent) const
+{
+    out << "ActivateClusterStateVersionReply(activate " << _activateVersion;
+    out << ", actual " << _actualVersion << ")";
+    if (!verbose) {
+        return;
+    }
+    out << " : ";
+    StorageReply::print(out, verbose, indent);
+}
+
} // api
} // storage
diff --git a/storageapi/src/vespa/storageapi/message/state.h b/storageapi/src/vespa/storageapi/message/state.h
index 4e5ad92b259..91c4707e7c1 100644
--- a/storageapi/src/vespa/storageapi/message/state.h
+++ b/storageapi/src/vespa/storageapi/message/state.h
@@ -93,4 +93,27 @@ public:
DECLARE_STORAGEREPLY(SetSystemStateReply, onSetSystemStateReply)
};
+/**
+ * Storage command that carries the cluster state version a node is asked to
+ * activate. Created by the activate_cluster_state_version RPC handler.
+ */
+class ActivateClusterStateVersionCommand : public StorageCommand {
+    uint32_t _version; // Version number given at construction time.
+public:
+    explicit ActivateClusterStateVersionCommand(uint32_t version);
+
+    /** Version number this command was constructed with. */
+    uint32_t version() const noexcept {
+        return _version;
+    }
+
+    void print(std::ostream& out, bool verbose, const std::string& indent) const override;
+
+    DECLARE_STORAGECOMMAND(ActivateClusterStateVersionCommand, onActivateClusterStateVersion);
+};
+
+/**
+ * Reply to ActivateClusterStateVersionCommand. It holds both the version that
+ * was requested and an "actual" version, which is 0 until the replying side
+ * sets it through setActualVersion().
+ */
+class ActivateClusterStateVersionReply : public StorageReply {
+    uint32_t _activateVersion; // Copied from the command's version().
+    uint32_t _actualVersion;   // Filled in by whoever produces the reply.
+public:
+    explicit ActivateClusterStateVersionReply(const ActivateClusterStateVersionCommand&);
+
+    /** Version that the originating command asked to activate. */
+    uint32_t activateVersion() const noexcept {
+        return _activateVersion;
+    }
+
+    /** Stores the version to report back as the actual one. */
+    void setActualVersion(uint32_t version) noexcept {
+        _actualVersion = version;
+    }
+
+    /** Version most recently stored with setActualVersion(), or 0 if never set. */
+    uint32_t actualVersion() const noexcept {
+        return _actualVersion;
+    }
+
+    void print(std::ostream& out, bool verbose, const std::string& indent) const override;
+
+    DECLARE_STORAGEREPLY(ActivateClusterStateVersionReply, onActivateClusterStateVersionReply);
+};
+
}
diff --git a/storageapi/src/vespa/storageapi/messageapi/messagehandler.h b/storageapi/src/vespa/storageapi/messageapi/messagehandler.h
index a9c1dfb8f26..27ee509e859 100644
--- a/storageapi/src/vespa/storageapi/messageapi/messagehandler.h
+++ b/storageapi/src/vespa/storageapi/messageapi/messagehandler.h
@@ -50,6 +50,8 @@ class NotifyBucketChangeCommand;
class SetNodeStateCommand;
class GetNodeStateCommand;
class SetSystemStateCommand;
+class ActivateClusterStateVersionCommand;
+class ActivateClusterStateVersionReply;
class GetSystemStateCommand;
class GetBucketNodesCommand;
class BucketsAddedCommand;
@@ -276,6 +278,12 @@ public:
virtual bool onSetSystemStateReply(
const std::shared_ptr<api::SetSystemStateReply>&)
{ return false; }
+ virtual bool onActivateClusterStateVersion(
+ const std::shared_ptr<api::ActivateClusterStateVersionCommand>&)
+ { return false; }
+ virtual bool onActivateClusterStateVersionReply(
+ const std::shared_ptr<api::ActivateClusterStateVersionReply>&)
+ { return false; }
virtual bool onGetSystemState(
const std::shared_ptr<api::GetSystemStateCommand>&)
{ return false; }
diff --git a/storageapi/src/vespa/storageapi/messageapi/storagemessage.cpp b/storageapi/src/vespa/storageapi/messageapi/storagemessage.cpp
index bab475eea32..40422ce06c4 100644
--- a/storageapi/src/vespa/storageapi/messageapi/storagemessage.cpp
+++ b/storageapi/src/vespa/storageapi/messageapi/storagemessage.cpp
@@ -77,6 +77,8 @@ const MessageType MessageType::SETSYSTEMSTATE("Set system state", SETSYSTEMSTATE
const MessageType MessageType::SETSYSTEMSTATE_REPLY("Set system state reply", SETSYSTEMSTATE_REPLY_ID, &MessageType::SETSYSTEMSTATE);
const MessageType MessageType::GETSYSTEMSTATE("Get system state", GETSYSTEMSTATE_ID);
const MessageType MessageType::GETSYSTEMSTATE_REPLY("get system state reply", GETSYSTEMSTATE_REPLY_ID, &MessageType::GETSYSTEMSTATE);
+const MessageType MessageType::ACTIVATE_CLUSTER_STATE_VERSION("Activate cluster state version", ACTIVATE_CLUSTER_STATE_VERSION_ID);
+const MessageType MessageType::ACTIVATE_CLUSTER_STATE_VERSION_REPLY("Activate cluster state version reply", ACTIVATE_CLUSTER_STATE_VERSION_REPLY_ID, &MessageType::ACTIVATE_CLUSTER_STATE_VERSION);
const MessageType MessageType::GETBUCKETDIFF("GetBucketDiff", GETBUCKETDIFF_ID);
const MessageType MessageType::GETBUCKETDIFF_REPLY("GetBucketDiff reply", GETBUCKETDIFF_REPLY_ID, &MessageType::GETBUCKETDIFF);
const MessageType MessageType::APPLYBUCKETDIFF("ApplyBucketDiff", APPLYBUCKETDIFF_ID);
diff --git a/storageapi/src/vespa/storageapi/messageapi/storagemessage.h b/storageapi/src/vespa/storageapi/messageapi/storagemessage.h
index c9f6e737a47..8c2338a020c 100644
--- a/storageapi/src/vespa/storageapi/messageapi/storagemessage.h
+++ b/storageapi/src/vespa/storageapi/messageapi/storagemessage.h
@@ -149,6 +149,8 @@ public:
QUERYRESULT_REPLY_ID = 89,
SETBUCKETSTATE_ID = 94,
SETBUCKETSTATE_REPLY_ID = 95,
+ ACTIVATE_CLUSTER_STATE_VERSION_ID = 96,
+ ACTIVATE_CLUSTER_STATE_VERSION_REPLY_ID = 97,
MESSAGETYPE_MAX_ID
};
@@ -195,6 +197,8 @@ public:
static const MessageType SETSYSTEMSTATE_REPLY;
static const MessageType GETSYSTEMSTATE;
static const MessageType GETSYSTEMSTATE_REPLY;
+ static const MessageType ACTIVATE_CLUSTER_STATE_VERSION;
+ static const MessageType ACTIVATE_CLUSTER_STATE_VERSION_REPLY;
static const MessageType BUCKETSADDED;
static const MessageType BUCKETSADDED_REPLY;
static const MessageType BUCKETSREMOVED;
diff --git a/vdslib/src/vespa/vdslib/state/cluster_state_bundle.cpp b/vdslib/src/vespa/vdslib/state/cluster_state_bundle.cpp
index ed561d67f6d..ff633c02fad 100644
--- a/vdslib/src/vespa/vdslib/state/cluster_state_bundle.cpp
+++ b/vdslib/src/vespa/vdslib/state/cluster_state_bundle.cpp
@@ -4,18 +4,30 @@
#include "cluster_state_bundle.h"
#include "clusterstate.h"
#include <iostream>
+#include <sstream>
namespace storage::lib {
ClusterStateBundle::ClusterStateBundle(const ClusterState &baselineClusterState)
- : _baselineClusterState(std::make_shared<const ClusterState>(baselineClusterState))
+ : _baselineClusterState(std::make_shared<const ClusterState>(baselineClusterState)),
+ _deferredActivation(false)
{
}
ClusterStateBundle::ClusterStateBundle(const ClusterState& baselineClusterState,
BucketSpaceStateMapping derivedBucketSpaceStates)
: _baselineClusterState(std::make_shared<const ClusterState>(baselineClusterState)),
- _derivedBucketSpaceStates(std::move(derivedBucketSpaceStates))
+ _derivedBucketSpaceStates(std::move(derivedBucketSpaceStates)),
+ _deferredActivation(false)
+{
+}
+
+// Builds a bundle from a copy of the baseline state, the given derived bucket
+// space states (moved into the bundle) and an explicit deferred activation flag.
+ClusterStateBundle::ClusterStateBundle(const ClusterState& baselineClusterState,
+                                       BucketSpaceStateMapping derivedBucketSpaceStates,
+                                       bool deferredActivation)
+    : _baselineClusterState(std::make_shared<const ClusterState>(baselineClusterState)),
+      _derivedBucketSpaceStates(std::move(derivedBucketSpaceStates)),
+      _deferredActivation(deferredActivation)
+{}
@@ -52,6 +64,9 @@ ClusterStateBundle::operator==(const ClusterStateBundle &rhs) const
if (_derivedBucketSpaceStates.size() != rhs._derivedBucketSpaceStates.size()) {
return false;
}
+ if (_deferredActivation != rhs._deferredActivation) {
+ return false;
+ }
// Can't do a regular operator== comparison since we must check equality
// of cluster state _values_, not their _pointers_.
for (auto& lhs_ds : _derivedBucketSpaceStates) {
@@ -64,6 +79,14 @@ ClusterStateBundle::operator==(const ClusterStateBundle &rhs) const
return true;
}
+// Returns the text that streaming this bundle with operator<< produces.
+std::string
+ClusterStateBundle::toString() const
+{
+    std::ostringstream rendered;
+    rendered << *this;
+    return rendered.str();
+}
+
std::ostream& operator<<(std::ostream& os, const ClusterStateBundle& bundle) {
os << "ClusterStateBundle('" << *bundle.getBaselineClusterState();
if (!bundle.getDerivedClusterStates().empty()) {
@@ -74,7 +97,11 @@ std::ostream& operator<<(std::ostream& os, const ClusterStateBundle& bundle) {
os << " '" << *ds.second;
}
}
- os << "')";
+ os << '\'';
+ if (bundle.deferredActivation()) {
+ os << " (deferred activation)";
+ }
+ os << ")";
return os;
}
diff --git a/vdslib/src/vespa/vdslib/state/cluster_state_bundle.h b/vdslib/src/vespa/vdslib/state/cluster_state_bundle.h
index a64416762b8..d0b052766ff 100644
--- a/vdslib/src/vespa/vdslib/state/cluster_state_bundle.h
+++ b/vdslib/src/vespa/vdslib/state/cluster_state_bundle.h
@@ -5,6 +5,7 @@
#include <vespa/document/bucket/bucketspace.h>
#include <unordered_map>
#include <iosfwd>
+#include <string>
namespace storage::lib {
@@ -24,10 +25,14 @@ public:
>;
std::shared_ptr<const ClusterState> _baselineClusterState;
BucketSpaceStateMapping _derivedBucketSpaceStates;
+ bool _deferredActivation;
public:
explicit ClusterStateBundle(const ClusterState &baselineClusterState);
ClusterStateBundle(const ClusterState& baselineClusterState,
BucketSpaceStateMapping derivedBucketSpaceStates);
+ ClusterStateBundle(const ClusterState& baselineClusterState,
+ BucketSpaceStateMapping derivedBucketSpaceStates,
+ bool deferredActivation);
~ClusterStateBundle();
const std::shared_ptr<const ClusterState> &getBaselineClusterState() const;
const std::shared_ptr<const ClusterState> &getDerivedClusterState(document::BucketSpace bucketSpace) const;
@@ -35,6 +40,8 @@ public:
return _derivedBucketSpaceStates;
}
uint32_t getVersion() const;
+ bool deferredActivation() const noexcept { return _deferredActivation; }
+ std::string toString() const;
bool operator==(const ClusterStateBundle &rhs) const;
bool operator!=(const ClusterStateBundle &rhs) const { return !operator==(rhs); }
};
diff --git a/vespa-testrunner-components/CMakeLists.txt b/vespa-testrunner-components/CMakeLists.txt
new file mode 100644
index 00000000000..fe2cb84b7bb
--- /dev/null
+++ b/vespa-testrunner-components/CMakeLists.txt
@@ -0,0 +1,3 @@
+install_java_artifact(vespa-testrunner-components)
+install_fat_java_artifact(vespa-testrunner-components)
+install_config_definition(src/main/resources/configdefinitions/test-runner.def test-runner.def)
diff --git a/vespa-testrunner-components/OWNERS b/vespa-testrunner-components/OWNERS
new file mode 100644
index 00000000000..134acfc20f3
--- /dev/null
+++ b/vespa-testrunner-components/OWNERS
@@ -0,0 +1 @@
+jvenstad
diff --git a/vespa-testrunner-components/README.md b/vespa-testrunner-components/README.md
new file mode 100644
index 00000000000..034ad95ac25
--- /dev/null
+++ b/vespa-testrunner-components/README.md
@@ -0,0 +1,4 @@
+# Vespa-testrunner-components
+
+Defines the handler and component used by the Vespa application that the controller deploys to
+run system, staging and production tests.
diff --git a/vespa-testrunner-components/pom.xml b/vespa-testrunner-components/pom.xml
new file mode 100644
index 00000000000..66bcd92df50
--- /dev/null
+++ b/vespa-testrunner-components/pom.xml
@@ -0,0 +1,91 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"
+ xmlns="http://maven.apache.org/POM/4.0.0"
+ xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+ <modelVersion>4.0.0</modelVersion>
+ <groupId>com.yahoo.vespa.hosted</groupId>
+ <artifactId>vespa-testrunner-components</artifactId>
+ <packaging>container-plugin</packaging>
+
+ <parent>
+ <groupId>com.yahoo.vespa</groupId>
+ <artifactId>parent</artifactId>
+ <version>7-SNAPSHOT</version>
+ <relativePath>../parent/pom.xml</relativePath>
+ </parent>
+
+ <dependencies>
+ <dependency>
+ <groupId>com.yahoo.vespa</groupId>
+ <artifactId>container</artifactId>
+ <version>${project.version}</version>
+ <scope>provided</scope>
+ </dependency>
+
+ <dependency>
+ <groupId>org.fusesource.jansi</groupId>
+ <artifactId>jansi</artifactId>
+ <version>1.11</version>
+ </dependency>
+
+ <dependency>
+ <groupId>junit</groupId>
+ <artifactId>junit</artifactId>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>commons-io</groupId>
+ <artifactId>commons-io</artifactId>
+ <scope>test</scope>
+ </dependency>
+ </dependencies>
+
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.codehaus.mojo</groupId>
+ <artifactId>build-helper-maven-plugin</artifactId>
+ <executions>
+ <execution>
+ <id>attach-artifacts</id>
+ <phase>package</phase>
+ <goals>
+ <goal>attach-artifact</goal>
+ </goals>
+ <configuration>
+ <artifacts>
+ <artifact>
+ <file>target/${project.artifactId}-jar-with-dependencies.jar</file>
+ <type>jar</type>
+ <classifier>deploy</classifier>
+ </artifact>
+ </artifacts>
+ </configuration>
+ </execution>
+ </executions>
+ </plugin>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-surefire-plugin</artifactId>
+ <configuration>
+ <environmentVariables>
+ <VESPA_HOME>/opt/vespa</VESPA_HOME>
+ </environmentVariables>
+ </configuration>
+ </plugin>
+ <plugin>
+ <groupId>com.yahoo.vespa</groupId>
+ <artifactId>bundle-plugin</artifactId>
+ <version>${project.version}</version>
+ <extensions>true</extensions>
+ <configuration>
+ <useCommonAssemblyIds>true</useCommonAssemblyIds>
+ </configuration>
+ </plugin>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-compiler-plugin</artifactId>
+ </plugin>
+ </plugins>
+ </build>
+</project>
diff --git a/vespa-testrunner-components/src/main/java/com/yahoo/vespa/hosted/testrunner/PomXmlGenerator.java b/vespa-testrunner-components/src/main/java/com/yahoo/vespa/hosted/testrunner/PomXmlGenerator.java
new file mode 100644
index 00000000000..7a85eabe289
--- /dev/null
+++ b/vespa-testrunner-components/src/main/java/com/yahoo/vespa/hosted/testrunner/PomXmlGenerator.java
@@ -0,0 +1,110 @@
+package com.yahoo.vespa.hosted.testrunner;
+
+import com.yahoo.vespa.defaults.Defaults;
+
+import java.nio.file.Path;
+import java.util.List;
+import java.util.stream.Collectors;
+
+/**
+ * Produces the text of a Maven pom.xml for running tests from a set of jar files.
+ *
+ * Every supplied artifact becomes a system-scoped test-jar dependency whose location
+ * is exposed through a "&lt;file name&gt;.path" property, and the test artifact is listed
+ * under the surefire plugin's dependenciesToScan.
+ *
+ * @author valerijf
+ */
+public class PomXmlGenerator {
+
+    /** One properties entry per artifact, mapping its id to the jar path. */
+    private static final String PROPERTY_TEMPLATE =
+            " <%ARTIFACT_ID%.path>%JAR_PATH%</%ARTIFACT_ID%.path>\n";
+
+    /** Group id shared by all generated artifact dependencies. */
+    private static final String TEST_ARTIFACT_GROUP_ID = "com.yahoo.vespa.testrunner.test";
+
+    /** One system-scoped dependency per artifact, resolved through its path property. */
+    private static final String DEPENDENCY_TEMPLATE =
+            " <dependency>\n" +
+            " <groupId>" + TEST_ARTIFACT_GROUP_ID + "</groupId>\n" +
+            " <artifactId>%ARTIFACT_ID%</artifactId>\n" +
+            " <scope>system</scope>\n" +
+            " <type>test-jar</type>\n" +
+            " <version>test</version>\n" +
+            " <systemPath>${%ARTIFACT_ID%.path}</systemPath>\n" +
+            " </dependency>\n";
+
+    /** Entry naming the test artifact for surefire's dependenciesToScan. */
+    private static final String DEPENDENCY_TO_SCAN_TEMPLATE =
+            " <dependency>" + TEST_ARTIFACT_GROUP_ID + ":%ARTIFACT_ID%</dependency>\n";
+
+    /**
+     * Skeleton of the generated pom. The %...% placeholders are substituted in generatePomXml.
+     * NOTE(review): the maven_version property is what the junit dependency uses as its version.
+     */
+    private static final String POM_XML_TEMPLATE =
+            "<?xml version=\"1.0\"?>\n" +
+            "<project xmlns=\"http://maven.apache.org/POM/4.0.0\" xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" xsi:schemaLocation=\"http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd\">\n" +
+            " <modelVersion>4.0.0</modelVersion>\n" +
+            " <groupId>com.yahoo.vespa</groupId>\n" +
+            " <artifactId>tester-application</artifactId>\n" +
+            " <version>1.0.0</version>\n" +
+            "\n" +
+            " <properties>\n" +
+            " <maven_version>4.12</maven_version>\n" +
+            " <surefire_version>2.22.0</surefire_version>\n" +
+            "%PROPERTIES%" +
+            " </properties>\n" +
+            "\n" +
+            " <dependencies>\n" +
+            " <dependency>\n" +
+            " <groupId>junit</groupId>\n" +
+            " <artifactId>junit</artifactId>\n" +
+            " <version>${maven_version}</version>\n" +
+            " <scope>test</scope>\n" +
+            " </dependency>\n" +
+            "%DEPENDENCIES%" +
+            " </dependencies>\n" +
+            "\n" +
+            " <build>\n" +
+            " <plugins>\n" +
+            " <plugin>\n" +
+            " <groupId>org.apache.maven.plugins</groupId>\n" +
+            " <artifactId>maven-surefire-plugin</artifactId>\n" +
+            " <version>${surefire_version}</version>\n" +
+            " <configuration>\n" +
+            " <dependenciesToScan>\n" +
+            "%DEPENDENCIES_TO_SCAN%" +
+            " </dependenciesToScan>\n" +
+            " <groups>%GROUPS%</groups>\n" +
+            " <excludedGroups>com.yahoo.vespa.tenant.systemtest.base.impl.EmptyExcludeGroup.class</excludedGroups>\n" +
+            " <excludes>\n" +
+            " <exclude>%GROUPS%</exclude>\n" +
+            " </excludes>\n" +
+            " <reportsDirectory>${env.TEST_DIR}</reportsDirectory>\n" +
+            " <redirectTestOutputToFile>false</redirectTestOutputToFile>\n" +
+            " <environmentVariables>\n" +
+            " <LD_LIBRARY_PATH>" + Defaults.getDefaults().underVespaHome("lib64") + "</LD_LIBRARY_PATH>\n" +
+            " </environmentVariables>\n" +
+            " </configuration>\n" +
+            " </plugin>\n" +
+            " <plugin>\n" +
+            " <groupId>org.apache.maven.plugins</groupId>\n" +
+            " <artifactId>maven-surefire-report-plugin</artifactId>\n" +
+            " <version>${surefire_version}</version>\n" +
+            " <configuration>\n" +
+            " <reportsDirectory>${env.TEST_DIR}</reportsDirectory>\n" +
+            " </configuration>\n" +
+            " </plugin>\n" +
+            " </plugins>\n" +
+            " </build>\n" +
+            "</project>\n";
+
+    /**
+     * Fills in the pom template.
+     *
+     * @param testProfile  profile whose group() value replaces every %GROUPS% placeholder
+     * @param artifacts    jar files to expose as properties and system-scoped dependencies
+     * @param testArtifact jar file to list under dependenciesToScan
+     * @return the complete pom.xml content
+     */
+    static String generatePomXml(TestProfile testProfile, List<Path> artifacts, Path testArtifact) {
+        String propertyEntries = artifacts.stream()
+                .map(PomXmlGenerator::propertyEntryFor)
+                .collect(Collectors.joining());
+        String dependencyEntries = artifacts.stream()
+                .map(PomXmlGenerator::dependencyEntryFor)
+                .collect(Collectors.joining());
+        String scanEntry = DEPENDENCY_TO_SCAN_TEMPLATE.replace("%ARTIFACT_ID%", artifactIdOf(testArtifact));
+
+        // Placeholders are substituted one at a time, in this fixed order.
+        String pom = POM_XML_TEMPLATE;
+        pom = pom.replace("%PROPERTIES%", propertyEntries);
+        pom = pom.replace("%DEPENDENCIES_TO_SCAN%", scanEntry);
+        pom = pom.replace("%DEPENDENCIES%", dependencyEntries);
+        return pom.replace("%GROUPS%", testProfile.group());
+    }
+
+    /** The artifact id is the file name part of the jar path. */
+    private static String artifactIdOf(Path jar) {
+        return jar.getFileName().toString();
+    }
+
+    /** Renders the properties entry for a single artifact. */
+    private static String propertyEntryFor(Path jar) {
+        return PROPERTY_TEMPLATE
+                .replace("%ARTIFACT_ID%", artifactIdOf(jar))
+                .replace("%JAR_PATH%", jar.toString());
+    }
+
+    /** Renders the dependency entry for a single artifact. */
+    private static String dependencyEntryFor(Path jar) {
+        return DEPENDENCY_TEMPLATE.replace("%ARTIFACT_ID%", artifactIdOf(jar));
+    }
+
+    /** Static utility; not meant to be instantiated. */
+    private PomXmlGenerator() {}
+}
diff --git a/vespa-testrunner-components/src/main/java/com/yahoo/vespa/hosted/testrunner/TestProfile.java b/vespa-testrunner-components/src/main/java/com/yahoo/vespa/hosted/testrunner/TestProfile.java
new file mode 100644
index 00000000000..b7d3a06f30d
--- /dev/null
+++ b/vespa-testrunner-components/src/main/java/com/yahoo/vespa/hosted/testrunner/TestProfile.java
@@ -0,0 +1,29 @@
+package com.yahoo.vespa.hosted.testrunner;
+
+/**
+ * The kinds of test runs, each tied to a group string and a failIfNoTests flag.
+ * PomXmlGenerator inserts the group string into the surefire configuration it generates.
+ *
+ * @author valerijf
+ * @author jvenstad
+ */
+enum TestProfile {
+
+    SYSTEM_TEST("com.yahoo.vespa.tenant.cd.SystemTest, com.yahoo.vespa.tenant.systemtest.base.SystemTest", true),
+    STAGING_TEST("com.yahoo.vespa.tenant.cd.StagingTest, com.yahoo.vespa.tenant.systemtest.base.StagingTest", true),
+    PRODUCTION_TEST("com.yahoo.vespa.tenant.cd.ProductionTest, com.yahoo.vespa.tenant.systemtest.base.ProductionTest", false);
+
+    /** Comma separated class names, stored exactly as given to the constant. */
+    private final String group;
+
+    /** Flag given to the constant; returned unchanged by failIfNoTests(). */
+    private final boolean failIfNoTests;
+
+    TestProfile(String groupNames, boolean failWhenNoTests) {
+        group = groupNames;
+        failIfNoTests = failWhenNoTests;
+    }
+
+    /** Returns the group string of this profile. */
+    String group() { return group; }
+
+    /** Returns the failIfNoTests flag of this profile. */
+    boolean failIfNoTests() { return failIfNoTests; }
+
+}
diff --git a/vespa-testrunner-components/src/main/java/com/yahoo/vespa/hosted/testrunner/TestRunner.java b/vespa-testrunner-components/src/main/java/com/yahoo/vespa/hosted/testrunner/TestRunner.java
new file mode 100644
index 00000000000..fb5dccc551d
--- /dev/null
+++ b/vespa-testrunner-components/src/main/java/com/yahoo/vespa/hosted/testrunner/TestRunner.java
@@ -0,0 +1,195 @@
+package com.yahoo.vespa.hosted.testrunner;
+
+import com.google.inject.Inject;
+import com.yahoo.vespa.defaults.Defaults;
+import org.fusesource.jansi.AnsiOutputStream;
+import org.fusesource.jansi.HtmlAnsiOutputStream;
+
+import java.io.BufferedOutputStream;
+import java.io.BufferedReader;
+import java.io.ByteArrayOutputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.io.PrintStream;
+import java.io.UncheckedIOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.Collection;
+import java.util.List;
+import java.util.SortedMap;
+import java.util.concurrent.ConcurrentSkipListMap;
+import java.util.function.Function;
+import java.util.logging.Level;
+import java.util.logging.LogRecord;
+import java.util.logging.Logger;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
+
+import static com.yahoo.log.LogLevel.ERROR;
+
+/**
+ * Runs a test suite as an external process (Maven, in production) against the jar files found under
+ * the artifacts directory, and keeps the output of the run available as log records.
+ *
+ * A run is started with {@link #test}, which returns immediately; progress is observed through
+ * {@link #getStatus()} and {@link #getLog(long)}. Only one run may be in progress at a time.
+ *
+ * @author valerijf
+ * @author jvenstad
+ */
+public class TestRunner {
+
+    private static final Logger logger = Logger.getLogger(TestRunner.class.getName());
+    /** Level used to mark log records whose message is HTML converted from the ANSI output of the process. */
+    private static final Level HTML = new Level("html", 1) { };
+    private static final Path vespaHome = Paths.get(Defaults.getDefaults().vespaHome());
+    private static final String settingsXml = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n" +
+                                              "<settings xmlns=\"http://maven.apache.org/SETTINGS/1.0.0\"\n" +
+                                              " xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\"\n" +
+                                              " xsi:schemaLocation=\"http://maven.apache.org/SETTINGS/1.0.0 http://maven.apache.org/xsd/settings-1.0.0.xsd\">\n" +
+                                              " <mirrors>\n" +
+                                              " <mirror>\n" +
+                                              " <id>maven central</id>\n" +
+                                              " <mirrorOf>*</mirrorOf>\n" + // Use this for everything!
+                                              " <url>https://repo.maven.apache.org/maven2/</url>\n" +
+                                              " </mirror>\n" +
+                                              " </mirrors>\n" +
+                                              "</settings>";
+
+    private final Path artifactsPath;
+    private final Path testPath;
+    private final Path logFile;
+    private final Path configFile;
+    private final Path settingsFile;
+    private final Function<TestProfile, ProcessBuilder> testBuilder;
+    /** Log records of the current or most recent run, keyed and ordered by sequence number. */
+    private final ConcurrentSkipListMap<Long, LogRecord> log = new ConcurrentSkipListMap<>();
+
+    private volatile Status status = Status.NOT_STARTED;
+
+    /** Creates a runner which invokes "mvn test" in the tmp/test directory under Vespa home. */
+    @Inject
+    public TestRunner(TestRunnerConfig config) {
+        this(config.artifactsPath(),
+             vespaHome.resolve("tmp/test"),
+             vespaHome.resolve("logs/vespa/maven.log"),
+             vespaHome.resolve("tmp/config.json"),
+             vespaHome.resolve("tmp/settings.xml"),
+             profile -> { // Anything to make this testable! >_<
+                 String[] command = new String[]{
+                         "mvn",
+                         "test",
+
+                         "--batch-mode", // Run in non-interactive (batch) mode (disables output color)
+                         "--show-version", // Display version information WITHOUT stopping build
+                         "--settings", // Need to override repository settings in ymaven config >_<
+                         vespaHome.resolve("tmp/settings.xml").toString(),
+
+                         // Disable maven download progress indication
+                         "-Dorg.slf4j.simpleLogger.log.org.apache.maven.cli.transfer.Slf4jMavenTransferListener=warn",
+                         "-Dstyle.color=always", // Enable ANSI color codes again
+                         "-DfailIfNoTests=" + profile.failIfNoTests(),
+                         "-Dvespa.test.config=" + vespaHome.resolve("tmp/config.json"),
+                         "-Dvespa.test.credentials.root=" + Defaults.getDefaults().vespaHome() + "/var/vespa/sia",
+                         String.format("-DargLine=-Xms%1$dm -Xmx%1$dm", config.surefireMemoryMb())
+                 };
+                 ProcessBuilder builder = new ProcessBuilder(command);
+                 builder.environment().merge("MAVEN_OPTS", " -Djansi.force=true", String::concat);
+                 builder.directory(vespaHome.resolve("tmp/test").toFile());
+                 builder.redirectErrorStream(true);
+                 return builder;
+             });
+    }
+
+    /**
+     * Creates a runner with all paths, and the factory for the process to run, given explicitly.
+     *
+     * @param artifactsPath directory searched, recursively, for the jar files to test
+     * @param testPath directory the generated pom.xml is written to
+     * @param logFile file the output of the process is written to, with ANSI codes filtered out
+     * @param configFile file the test config is written to
+     * @param settingsFile file the Maven settings are written to
+     * @param testBuilder creates the process to run for a given test profile
+     */
+    TestRunner(Path artifactsPath, Path testPath, Path logFile, Path configFile, Path settingsFile, Function<TestProfile, ProcessBuilder> testBuilder) {
+        this.artifactsPath = artifactsPath;
+        this.testPath = testPath;
+        this.logFile = logFile;
+        this.configFile = configFile;
+        this.settingsFile = settingsFile;
+        this.testBuilder = testBuilder;
+    }
+
+    /**
+     * Clears the log of any previous run and starts a new run in a separate thread.
+     *
+     * @param testProfile the kind of tests to run
+     * @param testConfig the content to write to the test config file
+     * @throws IllegalArgumentException if a run is already in progress
+     */
+    public synchronized void test(TestProfile testProfile, byte[] testConfig) {
+        if (status == Status.RUNNING)
+            throw new IllegalArgumentException("Tests are already running; should not receive this request now.");
+
+        log.clear();
+        status = Status.RUNNING;
+
+        new Thread(() -> runTests(testProfile, testConfig)).start();
+    }
+
+    /**
+     * Returns the log records with a sequence number strictly greater than the given one.
+     *
+     * The bound is exclusive rather than computed as {@code after + 1}, as the latter overflows
+     * for {@code Long.MAX_VALUE} and would then yield the whole log instead of nothing.
+     */
+    public Collection<LogRecord> getLog(long after) {
+        return log.tailMap(after, false).values();
+    }
+
+    /** Returns the status of the current run, or of the most recent one if none is in progress. */
+    public synchronized Status getStatus() {
+        return status;
+    }
+
+    /**
+     * Writes the files needed by the run, executes the test process, and records each line of its output,
+     * both in the log file and as an HTML log record. The status is set from the outcome when done:
+     * SUCCESS on exit code 0, FAILURE on any other exit code, and ERROR if anything throws.
+     */
+    private void runTests(TestProfile testProfile, byte[] testConfig) {
+        ProcessBuilder builder = testBuilder.apply(testProfile);
+        {
+            LogRecord record = new LogRecord(Level.INFO,
+                                             String.format("Starting %s. Artifacts directory: %s Config file: %s\nCommand to run: %s",
+                                                           testProfile.name(), artifactsPath, configFile, String.join(" ", builder.command())));
+            log.put(record.getSequenceNumber(), record);
+            logger.log(record);
+        }
+
+        boolean success;
+        // The AnsiOutputStream filters out ANSI characters, leaving the file contents pure.
+        try (PrintStream fileStream = new PrintStream(new AnsiOutputStream(new BufferedOutputStream(new FileOutputStream(logFile.toFile()))));
+             ByteArrayOutputStream logBuffer = new ByteArrayOutputStream();
+             PrintStream logFormatter = new PrintStream(new HtmlAnsiOutputStream(logBuffer))) {
+            writeTestApplicationPom(testProfile);
+            Files.write(configFile, testConfig);
+            // The settings declare UTF-8 encoding, so do not depend on the platform default charset here.
+            Files.write(settingsFile, settingsXml.getBytes(java.nio.charset.StandardCharsets.UTF_8));
+
+            Process mavenProcess = builder.start();
+            // Close the reader, and with it the output stream of the process, once all output is consumed.
+            try (BufferedReader in = new BufferedReader(new InputStreamReader(mavenProcess.getInputStream()))) {
+                in.lines().forEach(line -> {
+                    fileStream.println(line);
+                    logFormatter.print(line);
+                    LogRecord record = new LogRecord(HTML, logBuffer.toString());
+                    log.put(record.getSequenceNumber(), record);
+                    logBuffer.reset(); // Each record holds the HTML of a single line.
+                });
+            }
+            success = mavenProcess.waitFor() == 0;
+        }
+        catch (Exception exception) {
+            LogRecord record = new LogRecord(ERROR, "Failed to execute maven command: " + String.join(" ", builder.command()));
+            record.setThrown(exception);
+            logger.log(record);
+            log.put(record.getSequenceNumber(), record);
+            // Best effort: also append the failure to the log file; there is nowhere left to report it if this fails too.
+            try (PrintStream file = new PrintStream(new FileOutputStream(logFile.toFile(), true))) {
+                file.println(record.getMessage());
+                exception.printStackTrace(file);
+            }
+            catch (IOException ignored) { }
+            status = Status.ERROR;
+            return;
+        }
+        status = success ? Status.SUCCESS : Status.FAILURE;
+    }
+
+    /**
+     * Generates a pom.xml which depends on all jar files under the artifacts path, and which scans
+     * the first one found whose name ends with 'tests.jar' for tests, and writes it to the test path.
+     *
+     * @throws IllegalStateException if no file ending with 'tests.jar' is found
+     */
+    private void writeTestApplicationPom(TestProfile testProfile) throws IOException {
+        List<Path> files = listFiles(artifactsPath);
+        Path testJar = files.stream().filter(file -> file.toString().endsWith("tests.jar")).findFirst()
+                            .orElseThrow(() -> new IllegalStateException("No file ending with 'tests.jar' found under '" + artifactsPath + "'!"));
+        String pomXml = PomXmlGenerator.generatePomXml(testProfile, files, testJar);
+        testPath.toFile().mkdirs(); // A failure here surfaces when writing the file below.
+        // XML is read as UTF-8 unless it declares otherwise, so write it as such regardless of platform defaults.
+        Files.write(testPath.resolve("pom.xml"), pomXml.getBytes(java.nio.charset.StandardCharsets.UTF_8));
+    }
+
+    /** Returns all regular files with a name ending with ".jar" found anywhere under the given directory. */
+    private static List<Path> listFiles(Path directory) {
+        try (Stream<Path> element = Files.walk(directory)) {
+            return element
+                    .filter(Files::isRegularFile)
+                    .filter(path -> path.toString().endsWith(".jar"))
+                    .collect(Collectors.toList());
+        } catch (IOException e) {
+            throw new UncheckedIOException("Failed to list files under " + directory, e);
+        }
+    }
+
+
+    /** The state of a test run: not yet started, in progress, or one of the three possible outcomes. */
+    public enum Status {
+        NOT_STARTED, RUNNING, FAILURE, ERROR, SUCCESS
+    }
+
+}
diff --git a/vespa-testrunner-components/src/main/java/com/yahoo/vespa/hosted/testrunner/TestRunnerHandler.java b/vespa-testrunner-components/src/main/java/com/yahoo/vespa/hosted/testrunner/TestRunnerHandler.java
new file mode 100644
index 00000000000..d3393ce8dbe
--- /dev/null
+++ b/vespa-testrunner-components/src/main/java/com/yahoo/vespa/hosted/testrunner/TestRunnerHandler.java
@@ -0,0 +1,166 @@
+package com.yahoo.vespa.hosted.testrunner;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.fasterxml.jackson.databind.node.ObjectNode;
+import com.google.inject.Inject;
+import com.yahoo.container.jdisc.HttpRequest;
+import com.yahoo.container.jdisc.HttpResponse;
+import com.yahoo.container.jdisc.LoggingRequestHandler;
+import com.yahoo.container.logging.AccessLog;
+import com.yahoo.io.IOUtils;
+import com.yahoo.log.LogLevel;
+import com.yahoo.slime.Cursor;
+import com.yahoo.slime.JsonFormat;
+import com.yahoo.slime.Slime;
+import com.yahoo.yolean.Exceptions;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.OutputStream;
+import java.io.PrintStream;
+import java.util.Collection;
+import java.util.concurrent.Executor;
+import java.util.logging.Level;
+import java.util.logging.LogRecord;
+
+import static com.yahoo.jdisc.Response.Status;
+
+/**
+ * @author valerijf
+ * @author jvenstad
+ */
+public class TestRunnerHandler extends LoggingRequestHandler {
+
+ private static final String CONTENT_TYPE_APPLICATION_JSON = "application/json";
+
+ private final TestRunner testRunner;
+
+ @Inject
+ public TestRunnerHandler(Executor executor, AccessLog accessLog, TestRunner testRunner) {
+ super(executor, accessLog);
+ this.testRunner = testRunner;
+ }
+
+ @Override
+ public HttpResponse handle(HttpRequest request) {
+ try {
+ switch (request.getMethod()) {
+ case GET: return handleGET(request);
+ case POST: return handlePOST(request);
+
+ default: return new Response(Status.METHOD_NOT_ALLOWED, "Method '" + request.getMethod() + "' is not supported");
+ }
+ } catch (IllegalArgumentException e) {
+ return new Response(Status.BAD_REQUEST, Exceptions.toMessageString(e));
+ } catch (Exception e) {
+ log.log(Level.WARNING, "Unexpected error handling '" + request.getUri() + "'", e);
+ return new Response(Status.INTERNAL_SERVER_ERROR, Exceptions.toMessageString(e));
+ }
+ }
+
+ private HttpResponse handleGET(HttpRequest request) {
+ String path = request.getUri().getPath();
+ if (path.equals("/tester/v1/log")) {
+ return new SlimeJsonResponse(toSlime(testRunner.getLog(request.hasProperty("after")
+ ? Long.parseLong(request.getProperty("after"))
+ : -1)));
+ } else if (path.equals("/tester/v1/status")) {
+ log.info("Responding with status " + testRunner.getStatus());
+ return new Response(testRunner.getStatus().name());
+ }
+ return new Response(Status.NOT_FOUND, "Not found: " + request.getUri().getPath());
+ }
+
+ private HttpResponse handlePOST(HttpRequest request) throws IOException, InterruptedException {
+ final String path = request.getUri().getPath();
+ if (path.startsWith("/tester/v1/run/")) {
+ String type = lastElement(path);
+ TestProfile testProfile = TestProfile.valueOf(type.toUpperCase() + "_TEST");
+ byte[] config = IOUtils.readBytes(request.getData(), 1 << 16);
+ testRunner.test(testProfile, config);
+ log.info("Started tests of type " + type + " and status is " + testRunner.getStatus());
+ return new Response("Successfully started " + type + " tests");
+ }
+ return new Response(Status.NOT_FOUND, "Not found: " + request.getUri().getPath());
+ }
+
+ private static String lastElement(String path) {
+ if (path.endsWith("/"))
+ path = path.substring(0, path.length()-1);
+ int lastSlash = path.lastIndexOf("/");
+ if (lastSlash < 0) return path;
+ return path.substring(lastSlash + 1, path.length());
+ }
+
+ static Slime toSlime(Collection<LogRecord> log) {
+ Slime root = new Slime();
+ Cursor recordArray = root.setArray();
+ log.forEach(record -> {
+ Cursor recordObject = recordArray.addObject();
+ recordObject.setLong("id", record.getSequenceNumber());
+ recordObject.setLong("at", record.getMillis());
+ recordObject.setString("type", typeOf(record.getLevel()));
+ String message = record.getMessage();
+ if (record.getThrown() != null) {
+ ByteArrayOutputStream buffer = new ByteArrayOutputStream();
+ record.getThrown().printStackTrace(new PrintStream(buffer));
+ message += "\n" + buffer;
+ }
+ recordObject.setString("message", message);
+ });
+ return root;
+ }
+
+ public static String typeOf(Level level) {
+ return level.getName().equals("html") ? "html"
+ : level.intValue() < LogLevel.INFO.intValue() ? "debug"
+ : level.intValue() < LogLevel.WARNING.intValue() ? "info"
+ : level.intValue() < LogLevel.ERROR.intValue() ? "warning"
+ : "error";
+ }
+
+ private class SlimeJsonResponse extends HttpResponse {
+ private final Slime slime;
+
+ private SlimeJsonResponse(Slime slime) {
+ super(200);
+ this.slime = slime;
+ }
+
+ @Override
+ public void render(OutputStream outputStream) throws IOException {
+ new JsonFormat(true).encode(outputStream, slime);
+ }
+
+ @Override
+ public String getContentType() {
+ return CONTENT_TYPE_APPLICATION_JSON;
+ }
+ }
+
+ private static class Response extends HttpResponse {
+ private static final ObjectMapper objectMapper = new ObjectMapper();
+ private final String message;
+
+ private Response(String response) {
+ this(200, response);
+ }
+
+ private Response(int statusCode, String message) {
+ super(statusCode);
+ this.message = message;
+ }
+
+ @Override
+ public void render(OutputStream outputStream) throws IOException {
+ ObjectNode objectNode = objectMapper.createObjectNode();
+ objectNode.put("message", message);
+ objectMapper.writeValue(outputStream, objectNode);
+ }
+
+ @Override
+ public String getContentType() {
+ return CONTENT_TYPE_APPLICATION_JSON;
+ }
+ }
+}
diff --git a/vespa-testrunner-components/src/main/resources/configdefinitions/test-runner.def b/vespa-testrunner-components/src/main/resources/configdefinitions/test-runner.def
new file mode 100644
index 00000000000..a2d0eacd9be
--- /dev/null
+++ b/vespa-testrunner-components/src/main/resources/configdefinitions/test-runner.def
@@ -0,0 +1,4 @@
+package=com.yahoo.vespa.hosted.testrunner
+
+artifactsPath path
+surefireMemoryMb int
diff --git a/vespa-testrunner-components/src/test/java/com/yahoo/vespa/hosted/testrunner/PomXmlGeneratorTest.java b/vespa-testrunner-components/src/test/java/com/yahoo/vespa/hosted/testrunner/PomXmlGeneratorTest.java
new file mode 100644
index 00000000000..dce02922c63
--- /dev/null
+++ b/vespa-testrunner-components/src/test/java/com/yahoo/vespa/hosted/testrunner/PomXmlGeneratorTest.java
@@ -0,0 +1,33 @@
+package com.yahoo.vespa.hosted.testrunner;
+
+import org.apache.commons.io.IOUtils;
+import org.junit.Test;
+
+import java.io.IOException;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.Arrays;
+import java.util.List;
+
+import static org.junit.Assert.assertEquals;
+
+/**
+ * Compares the pom.xml generated by {@link PomXmlGenerator} with an expected file on the classpath.
+ *
+ * @author valerijf
+ */
+public class PomXmlGeneratorTest {
+
+    @Test
+    public void write_system_tests_pom_xml() throws IOException {
+        List<Path> artifacts = Arrays.asList(
+                Paths.get("components/my-comp.jar"),
+                Paths.get("main.jar"));
+
+        String actual = PomXmlGenerator.generatePomXml(TestProfile.SYSTEM_TEST, artifacts, artifacts.get(1));
+        assertFile("/pom.xml_system_tests", actual);
+    }
+
+    /**
+     * Asserts that the given content equals the content of the named classpath resource, read as UTF-8.
+     * Fails with a descriptive message if the resource does not exist, and always closes the stream.
+     */
+    private void assertFile(String resourceFile, String actual) throws IOException {
+        try (java.io.InputStream resource = getClass().getResourceAsStream(resourceFile)) {
+            org.junit.Assert.assertNotNull("Test resource '" + resourceFile + "' was not found on the classpath", resource);
+            String expected = IOUtils.toString(resource, java.nio.charset.StandardCharsets.UTF_8);
+            assertEquals(resourceFile, expected, actual);
+        }
+    }
+} \ No newline at end of file
diff --git a/vespa-testrunner-components/src/test/java/com/yahoo/vespa/hosted/testrunner/TestRunnerHandlerTest.java b/vespa-testrunner-components/src/test/java/com/yahoo/vespa/hosted/testrunner/TestRunnerHandlerTest.java
new file mode 100644
index 00000000000..a91b1308080
--- /dev/null
+++ b/vespa-testrunner-components/src/test/java/com/yahoo/vespa/hosted/testrunner/TestRunnerHandlerTest.java
@@ -0,0 +1,37 @@
+package com.yahoo.vespa.hosted.testrunner;
+
+import com.yahoo.vespa.config.SlimeUtils;
+import org.junit.Test;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.PrintStream;
+import java.time.Instant;
+import java.util.Collections;
+import java.util.logging.Level;
+import java.util.logging.LogRecord;
+
+import static org.junit.Assert.assertEquals;
+
+/**
+ * Verifies how {@link TestRunnerHandler#toSlime} serializes log records.
+ *
+ * @author jvenstad
+ */
+public class TestRunnerHandlerTest {
+
+    @Test
+    public void logSerialization() throws IOException {
+        Exception thrown = new RuntimeException();
+        LogRecord entry = new LogRecord(Level.INFO, "Hello.");
+        entry.setSequenceNumber(1);
+        entry.setInstant(Instant.ofEpochMilli(2));
+        entry.setThrown(thrown);
+
+        // Print the stack trace, and escape line breaks and tabs the way they appear inside a JSON string.
+        ByteArrayOutputStream traceBytes = new ByteArrayOutputStream();
+        thrown.printStackTrace(new PrintStream(traceBytes));
+        String escapedTrace = traceBytes.toString()
+                                        .replace("\n", "\\n")
+                                        .replace("\t", "\\t");
+
+        String expectedJson = "[{\"id\":1,\"at\":2,\"type\":\"info\",\"message\":\"Hello.\\n" + escapedTrace + "\"}]";
+        String actualJson = new String(SlimeUtils.toJsonBytes(TestRunnerHandler.toSlime(Collections.singletonList(entry))));
+        assertEquals(expectedJson, actualJson);
+    }
+
+}
diff --git a/vespa-testrunner-components/src/test/java/com/yahoo/vespa/hosted/testrunner/TestRunnerTest.java b/vespa-testrunner-components/src/test/java/com/yahoo/vespa/hosted/testrunner/TestRunnerTest.java
new file mode 100644
index 00000000000..49c95fa4b6f
--- /dev/null
+++ b/vespa-testrunner-components/src/test/java/com/yahoo/vespa/hosted/testrunner/TestRunnerTest.java
@@ -0,0 +1,127 @@
+package com.yahoo.vespa.hosted.testrunner;
+
+import org.fusesource.jansi.Ansi;
+import org.junit.Before;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.Iterator;
+import java.util.logging.LogRecord;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.fail;
+
+/**
+ * Unit tests relying on a UNIX shell >_<
+ *
+ * Each test replaces the Maven invocation with a simple shell command, starts a run,
+ * waits for it to complete, and then inspects the resulting status, log and files.
+ *
+ * @author jvenstad
+ */
+public class TestRunnerTest {
+
+    @Rule
+    public TemporaryFolder tmp = new TemporaryFolder();
+
+    private Path artifactsPath;
+    private Path testPath;
+    private Path logFile;
+    private Path configFile;
+    private Path settingsFile;
+
+    @Before
+    public void setup() throws IOException {
+        artifactsPath = tmp.newFolder("artifacts").toPath();
+        Files.createFile(artifactsPath.resolve("my-tests.jar"));
+        Files.createFile(artifactsPath.resolve("my-fat-test.jar"));
+        testPath = tmp.newFolder("testData").toPath();
+        logFile = tmp.newFile("maven.log").toPath();
+        configFile = tmp.newFile("testConfig.json").toPath();
+        settingsFile = tmp.newFile("settings.xml").toPath();
+    }
+
+    /** Polls the given runner every 10 ms, until its status is no longer RUNNING. */
+    private static void awaitCompletion(TestRunner runner) throws InterruptedException {
+        while (runner.getStatus() == TestRunner.Status.RUNNING) {
+            Thread.sleep(10);
+        }
+    }
+
+    @Test
+    public void ansiCodesAreConvertedToHtml() throws InterruptedException {
+        String redHello = Ansi.ansi().fg(Ansi.Color.RED).a("Hello!").reset().toString();
+        TestRunner runner = new TestRunner(artifactsPath, testPath, logFile, configFile, settingsFile,
+                                           __ -> new ProcessBuilder("echo", redHello));
+        runner.test(TestProfile.SYSTEM_TEST, new byte[0]);
+        awaitCompletion(runner);
+
+        Iterator<LogRecord> records = runner.getLog(-1).iterator();
+        records.next(); // Skip the record announcing the start of the run.
+        LogRecord output = records.next();
+        assertEquals("<span style=\"color: red;\">Hello!</span>", output.getMessage());
+        assertEquals(0, runner.getLog(output.getSequenceNumber()).size());
+        assertEquals(TestRunner.Status.SUCCESS, runner.getStatus());
+    }
+
+    @Test
+    public void errorLeadsToError() throws InterruptedException {
+        TestRunner runner = new TestRunner(artifactsPath, testPath, logFile, configFile, settingsFile,
+                                           __ -> new ProcessBuilder("This is a command that doesn't exist, for sure!"));
+        runner.test(TestProfile.SYSTEM_TEST, new byte[0]);
+        awaitCompletion(runner);
+
+        Iterator<LogRecord> records = runner.getLog(-1).iterator();
+        records.next(); // Skip the record announcing the start of the run.
+        LogRecord failure = records.next();
+        assertEquals("Failed to execute maven command: This is a command that doesn't exist, for sure!", failure.getMessage());
+        assertNotNull(failure.getThrown());
+        assertEquals(TestRunner.Status.ERROR, runner.getStatus());
+    }
+
+    @Test
+    public void failureLeadsToFailure() throws InterruptedException {
+        TestRunner runner = new TestRunner(artifactsPath, testPath, logFile, configFile, settingsFile,
+                                           __ -> new ProcessBuilder("false"));
+        runner.test(TestProfile.SYSTEM_TEST, new byte[0]);
+        awaitCompletion(runner);
+
+        assertEquals(1, runner.getLog(-1).size());
+        assertEquals(TestRunner.Status.FAILURE, runner.getStatus());
+    }
+
+    @Test
+    public void filesAreGenerated() throws InterruptedException, IOException {
+        TestRunner runner = new TestRunner(artifactsPath, testPath, logFile, configFile, settingsFile,
+                                           __ -> new ProcessBuilder("echo", "Hello!"));
+        runner.test(TestProfile.SYSTEM_TEST, "config".getBytes());
+        awaitCompletion(runner);
+
+        assertEquals("config", new String(Files.readAllBytes(configFile)));
+        assertTrue(Files.exists(testPath.resolve("pom.xml")));
+        assertTrue(Files.exists(settingsFile));
+        assertEquals("Hello!\n", new String(Files.readAllBytes(logFile)));
+    }
+
+    @Test
+    public void runnerCanBeReused() throws InterruptedException, IOException {
+        TestRunner runner = new TestRunner(artifactsPath, testPath, logFile, configFile, settingsFile,
+                                           __ -> new ProcessBuilder("sleep", "0.1"));
+
+        // First run.
+        runner.test(TestProfile.SYSTEM_TEST, "config".getBytes());
+        assertEquals(TestRunner.Status.RUNNING, runner.getStatus());
+        awaitCompletion(runner);
+        assertEquals(1, runner.getLog(-1).size());
+        assertEquals(TestRunner.Status.SUCCESS, runner.getStatus());
+
+        // Second run, on the same runner, with a different profile and config.
+        runner.test(TestProfile.STAGING_TEST, "newConfig".getBytes());
+        awaitCompletion(runner);
+        assertEquals("newConfig", new String(Files.readAllBytes(configFile)));
+        assertEquals(1, runner.getLog(-1).size());
+    }
+
+}
diff --git a/vespa-testrunner-components/src/test/resources/pom.xml_system_tests b/vespa-testrunner-components/src/test/resources/pom.xml_system_tests
new file mode 100644
index 00000000000..22382b84316
--- /dev/null
+++ b/vespa-testrunner-components/src/test/resources/pom.xml_system_tests
@@ -0,0 +1,72 @@
+<?xml version="1.0"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+ <groupId>com.yahoo.vespa</groupId>
+ <artifactId>tester-application</artifactId>
+ <version>1.0.0</version>
+
+ <properties>
+ <maven_version>4.12</maven_version>
+ <surefire_version>2.22.0</surefire_version>
+ <my-comp.jar.path>components/my-comp.jar</my-comp.jar.path>
+ <main.jar.path>main.jar</main.jar.path>
+ </properties>
+
+ <dependencies>
+ <dependency>
+ <groupId>junit</groupId>
+ <artifactId>junit</artifactId>
+ <version>${maven_version}</version>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>com.yahoo.vespa.testrunner.test</groupId>
+ <artifactId>my-comp.jar</artifactId>
+ <scope>system</scope>
+ <type>test-jar</type>
+ <version>test</version>
+ <systemPath>${my-comp.jar.path}</systemPath>
+ </dependency>
+ <dependency>
+ <groupId>com.yahoo.vespa.testrunner.test</groupId>
+ <artifactId>main.jar</artifactId>
+ <scope>system</scope>
+ <type>test-jar</type>
+ <version>test</version>
+ <systemPath>${main.jar.path}</systemPath>
+ </dependency>
+ </dependencies>
+
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-surefire-plugin</artifactId>
+ <version>${surefire_version}</version>
+ <configuration>
+ <dependenciesToScan>
+ <dependency>com.yahoo.vespa.testrunner.test:main.jar</dependency>
+ </dependenciesToScan>
+ <groups>com.yahoo.vespa.tenant.cd.SystemTest, com.yahoo.vespa.tenant.systemtest.base.SystemTest</groups>
+ <excludedGroups>com.yahoo.vespa.tenant.systemtest.base.impl.EmptyExcludeGroup.class</excludedGroups>
+ <excludes>
+ <exclude>com.yahoo.vespa.tenant.cd.SystemTest, com.yahoo.vespa.tenant.systemtest.base.SystemTest</exclude>
+ </excludes>
+ <reportsDirectory>${env.TEST_DIR}</reportsDirectory>
+ <redirectTestOutputToFile>false</redirectTestOutputToFile>
+ <environmentVariables>
+ <LD_LIBRARY_PATH>/opt/vespa/lib64</LD_LIBRARY_PATH>
+ </environmentVariables>
+ </configuration>
+ </plugin>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-surefire-report-plugin</artifactId>
+ <version>${surefire_version}</version>
+ <configuration>
+ <reportsDirectory>${env.TEST_DIR}</reportsDirectory>
+ </configuration>
+ </plugin>
+ </plugins>
+ </build>
+</project>
diff --git a/vespabase/src/rhel-prestart.sh b/vespabase/src/rhel-prestart.sh
index 081d7df18a4..fe067868c04 100755
--- a/vespabase/src/rhel-prestart.sh
+++ b/vespabase/src/rhel-prestart.sh
@@ -113,6 +113,7 @@ fixdir ${VESPA_USER} wheel 755 var/db/vespa/logcontrol
fixdir ${VESPA_USER} wheel 755 var/db/vespa/search
fixdir ${VESPA_USER} wheel 755 var/jdisc_core
fixdir ${VESPA_USER} wheel 755 var/vespa
+fixdir ${VESPA_USER} wheel 755 var/vespa/application
fixdir ${VESPA_USER} wheel 755 var/vespa/bundlecache
fixdir ${VESPA_USER} wheel 755 var/vespa/bundlecache/configserver
fixdir ${VESPA_USER} wheel 755 var/vespa/cache/config/
diff --git a/vespalog/src/vespa/log/log_message.cpp b/vespalog/src/vespa/log/log_message.cpp
index ec734747dbc..77f9b619e9f 100644
--- a/vespalog/src/vespa/log/log_message.cpp
+++ b/vespalog/src/vespa/log/log_message.cpp
@@ -90,6 +90,25 @@ LogMessage::LogMessage()
{
}
+LogMessage::LogMessage(int64_t time_nanos_in, // Field-wise constructor: each *_in argument initializes the member of the matching name.
+ const std::string& hostname_in,
+ int32_t process_id_in,
+ int32_t thread_id_in,
+ const std::string& service_in,
+ const std::string& component_in,
+ Logger::LogLevel level_in,
+ const std::string& payload_in)
+ : _time_nanos(time_nanos_in),
+ _hostname(hostname_in), // The string arguments are taken by const reference and copied into the members.
+ _process_id(process_id_in),
+ _thread_id(thread_id_in),
+ _service(service_in),
+ _component(component_in),
+ _level(level_in),
+ _payload(payload_in)
+{ // Intentionally empty: the initializer list above does all the work.
+}
+
LogMessage::~LogMessage() = default;
diff --git a/vespalog/src/vespa/log/log_message.h b/vespalog/src/vespa/log/log_message.h
index 1ca052c9e7d..832b5f6d47d 100644
--- a/vespalog/src/vespa/log/log_message.h
+++ b/vespalog/src/vespa/log/log_message.h
@@ -22,6 +22,14 @@ class LogMessage {
public:
LogMessage();
+ LogMessage(int64_t time_nanos_in, // Constructs a message from explicitly supplied field values, as an alternative to parse_log_line().
+ const std::string& hostname_in,
+ int32_t process_id_in,
+ int32_t thread_id_in,
+ const std::string& service_in,
+ const std::string& component_in,
+ Logger::LogLevel level_in,
+ const std::string& payload_in);
~LogMessage();
void parse_log_line(std::string_view log_line);
int64_t time_nanos() const { return _time_nanos; }